diff --git a/app.py b/app.py
index 721d6ed5d38494317e053557f057cbbb2fb2e286..2d70142153b39ed4d36fa1be8d6b34eef1b83237 100644
--- a/app.py
+++ b/app.py
@@ -1,58 +1,29 @@
 import gradio as gr
 from pipeline.run_pipeline import *
+'''
+Time optimization
+Concurrency optimization
+'''
+
 # from run import *
 # '''
 # Move some files into this file's directory
 # '''
 # text = "A person is cutting a birthday cake with two red candles that spell out \"21\". The surface of the cake is round, and there is a balloon in the room. The person is using a silver knife to cut the cake."
 # image_path = "/newdisk3/wcx/val2014/COCO_val2014_000000297425.jpg"
+pipeline = Pipeline(type="image-to-text", api_key="sk-REDACTED")
+# res,claim_list = pipeline.run(text=text, image_path=image_path,type="image-to-text")
+# print(res)
 
-def get_response(text, filepath):
-    pipeline = Pipeline()
-    res = pipeline.run(text=text, image_path=image_path)
-    return res
+def get_response(text, filepath, type):
+    res, claim_list = pipeline.run(text=text, image_path=filepath, type=type)
+    return claim_list, res
 
 demo = gr.Interface(
     fn=get_response,
-    inputs=["text", gr.Image(type="filepath")],
-    outputs=["text"],
+    inputs=[gr.Textbox(placeholder="Input I2T model's response or T2I model's prompt", label="text input"), gr.Image(type="filepath", label="image input"), gr.Radio(['image-to-text','text-to-image'], label='task type', value='image-to-text')],
+    outputs=[gr.Textbox(label="claim list"), gr.Textbox(label="detect results")],
 )
 
-demo.launch()
-
-# def generate_mutimodal(title, context, img):
-#     return f"Title:{title}\nContext:{context}\n...{img}"
-
-# server = gr.Interface(
-#     fn=generate_mutimodal,
-#     inputs=[
-#         gr.Textbox(lines=1, placeholder="Please enter a title"),
-#         gr.Textbox(lines=2, placeholder="Please enter the body text"),
-#         gr.Image(shape=(200, 200), label="Please upload an image (optional)")
-#     ],
-#     outputs="text"
-# )
-
-# server.launch()
-
-# import numpy as np
-# import gradio as gr
-# def sepia(input_img):
-#     # process the image
-#     sepia_filter = np.array([
-#         [0.393, 0.769, 0.189],
-#         [0.349, 0.686, 0.168],
-#         [0.272, 0.534, 0.131]
-#     ])
-#     sepia_img = input_img.dot(sepia_filter.T)
-#     sepia_img /= sepia_img.max()
-#     return sepia_img
-# # shape sets the input image size
-# demo = gr.Interface(sepia, gr.Image(), "image")
-# demo.launch()
-
-# Download human-readable labels for ImageNet.
-
-# gr.Interface(fn=sepia,inputs=gr.Image(type="pil"),outputs="image").launch()
-
+demo.queue().launch(share=True)
diff --git a/models/dbnetpp.pth b/models/dbnetpp.pth
new file mode 100644
index 0000000000000000000000000000000000000000..4d7e09cb7f002024608cccb63070e8dbfee85d3c
--- /dev/null
+++ b/models/dbnetpp.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4ecb39ac54bd954fa44fd25b419b7a51539ded0f79408eb2419ca78f9f0299c6
+size 113299489
diff --git a/models/groundingdino_swint_ogc.pth b/models/groundingdino_swint_ogc.pth
new file mode 100644
index 0000000000000000000000000000000000000000..5cdf6bcd10d491abf170a78eca4fcebf76aa791a
--- /dev/null
+++ b/models/groundingdino_swint_ogc.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3b3ca2563c77c69f651d7bd133e97139c186df06231157a64c507099c52bc799
+size 693997677
diff --git a/models/maerec_b.pth b/models/maerec_b.pth
new file mode 100644
index 0000000000000000000000000000000000000000..87e89e98fd3a3cf578f45745070c1eb3e6829734
--- /dev/null
+++ b/models/maerec_b.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:69a98230f5421517bce9fdc197261ec6845e353ceb7fa970194c04403e0ba89b
+size 1764777173
diff --git a/pipeline/__pycache__/judge.cpython-39.pyc b/pipeline/__pycache__/judge.cpython-39.pyc
index d36a59b1066a94a158bc916f72958cec11d29c8e..6a1204e8066cbeb29db960987e9d2311610ee8bc 100644
Binary files a/pipeline/__pycache__/judge.cpython-39.pyc and b/pipeline/__pycache__/judge.cpython-39.pyc differ
diff --git a/pipeline/__pycache__/query_generate.cpython-39.pyc b/pipeline/__pycache__/query_generate.cpython-39.pyc
index 7ce148c0bcc6097980300f9aae5cfc6eadabf698..2a1ef7a7e7b3ce6465777ee8d3d1987155ffdbe3 100644
Binary files a/pipeline/__pycache__/query_generate.cpython-39.pyc and b/pipeline/__pycache__/query_generate.cpython-39.pyc differ
diff --git a/pipeline/__pycache__/run_pipeline.cpython-39.pyc b/pipeline/__pycache__/run_pipeline.cpython-39.pyc
index 4854f28a2abed8a12141fe4dee6ecab9df73b750..00b93c167ce9bb1182f48d4ccb367a830128b498 100644
Binary files a/pipeline/__pycache__/run_pipeline.cpython-39.pyc and b/pipeline/__pycache__/run_pipeline.cpython-39.pyc differ
diff --git a/pipeline/__pycache__/tool_execute.cpython-39.pyc b/pipeline/__pycache__/tool_execute.cpython-39.pyc
index b6da77bd9343601626df1dfdc31319b1f2df7a53..657aa1cc2f33b1ddb287a8a35ae05930dfcb9374 100644
Binary files a/pipeline/__pycache__/tool_execute.cpython-39.pyc and b/pipeline/__pycache__/tool_execute.cpython-39.pyc differ
diff --git a/pipeline/cache_files/COCO_val2014_000000297425.jpg b/pipeline/cache_files/COCO_val2014_000000297425.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..142889bbf05671c1770da931f8fe0915ca94d185
Binary files /dev/null and b/pipeline/cache_files/COCO_val2014_000000297425.jpg differ
diff --git a/pipeline/cache_files/COCO_val2014_000000297425/2Axrpnw7ricrqoNoRqLPPm.jpg b/pipeline/cache_files/COCO_val2014_000000297425/2Axrpnw7ricrqoNoRqLPPm.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..a3bc092c423f07f724885e86af9470280b1a1447
Binary files /dev/null and b/pipeline/cache_files/COCO_val2014_000000297425/2Axrpnw7ricrqoNoRqLPPm.jpg differ
diff --git a/pipeline/cache_files/COCO_val2014_000000297425/N7r9ReNBe73cY2VL7gW9Az.jpg b/pipeline/cache_files/COCO_val2014_000000297425/N7r9ReNBe73cY2VL7gW9Az.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..8b3364504a3886b65c90ec989db7ea7001e72206
Binary files /dev/null and b/pipeline/cache_files/COCO_val2014_000000297425/N7r9ReNBe73cY2VL7gW9Az.jpg differ
diff --git a/pipeline/cache_files/COCO_val2014_000000297425/Wr7HRVVj6HtMGbsDbaXdU3.jpg b/pipeline/cache_files/COCO_val2014_000000297425/Wr7HRVVj6HtMGbsDbaXdU3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..f0e1f3a9d4fd3efcabc9ed6a1bae73c707adc2a9
Binary files /dev/null and b/pipeline/cache_files/COCO_val2014_000000297425/Wr7HRVVj6HtMGbsDbaXdU3.jpg differ
diff --git a/pipeline/cache_files/COCO_val2014_000000297425/max6iKdGyovWLMJzKQ4RHr.jpg b/pipeline/cache_files/COCO_val2014_000000297425/max6iKdGyovWLMJzKQ4RHr.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..41ac57871bdde7580d8f94264ceeae9ec9c30067
Binary files /dev/null and b/pipeline/cache_files/COCO_val2014_000000297425/max6iKdGyovWLMJzKQ4RHr.jpg differ
diff --git a/pipeline/judge.py b/pipeline/judge.py
index 1e815d2c595fd864cdae98578bda2d58c0ce9865..665a848299a4e3e5337b15dfafb62747674d8dca 100644
--- a/pipeline/judge.py
+++ b/pipeline/judge.py
@@ -2,9 +2,9 @@
 import json
 import yaml
 import base64
 class Judger:
-    def __init__(self, prompt_path, chat, type):
+    def __init__(self, prompt_path, chat):
         with open(prompt_path,"r",encoding='utf-8') as file:
-            self.prompt = yaml.load(file, yaml.FullLoader)[type]
+            self.prompt = yaml.load(file, yaml.FullLoader)
         self.chat = chat
 
@@ -12,7 +12,7 @@ class Judger:
         with open(image_path, "rb") as image_file:
             return base64.b64encode(image_file.read()).decode('utf-8')
 
-    def get_response(self, object_res, attribue_res, text_res, fact_res, claim_list, image_path):
+    def get_response(self, type, object_res, attribue_res, text_res, fact_res, claim_list, image_path):
         input = '''
 Here is the object detection expert model's result:
 {object}
@@ -40,24 +40,28 @@ class Judger:
         else:
             text_det = "none information"
-
-        img1 = self.encode_image("/home/wcx/wcx/GroundingDINO/LVLM/cot/img_examples/animal.jpg")
-        img2 = self.encode_image("/home/wcx/wcx/GroundingDINO/LVLM/cot/img_examples/ball.jpg")
+        if type == "image-to-text":
+            img1 = self.encode_image("/home/wcx/wcx/GroundingDINO/LVLM/cot/img_examples/sandbeach.jpg")
+            img2 = self.encode_image("/home/wcx/wcx/GroundingDINO/LVLM/cot/img_examples/football.jpg")
+        else:
+            img1 = self.encode_image("/home/wcx/wcx/GroundingDINO/LVLM/cot/img_examples/animal.jpg")
+            img2 = self.encode_image("/home/wcx/wcx/GroundingDINO/LVLM/cot/img_examples/ball.jpg")
         base64_source_image = self.encode_image(image_path)
 
         content = [
-            {"type": "text", "text": self.prompt["user"]},
+            {"type": "text", "text": self.prompt[type]["user"]},
             {"type": "image_url","image_url": f"data:image/jpeg;base64,{img1}"},
-            {"type": "text", "text": self.prompt["example1"]},
+            {"type": "text", "text": self.prompt[type]["example1"]},
             {"type": "image_url","image_url": f"data:image/jpeg;base64,{img2}"},
-            {"type": "text", "text": self.prompt["example2"]},
+            {"type": "text", "text": self.prompt[type]["example2"]},
             {"type": "image_url","image_url": f"data:image/jpeg;base64,{base64_source_image}"},
             {"type": "text", "text": input.format(object=object_det,text=text_det,fact=fact_res,claims=claim_list)}
         ]
-
+
+
         message = [
             {
                 'role': 'system',
-                'content': self.prompt["system"]
+                'content': self.prompt[type]["system"]
             },
             {
                 "role": "user",
diff --git a/pipeline/nltk/VERSION b/pipeline/nltk/VERSION
deleted file mode 100644
index 0603aab1e29c286f7779a50891582571b616825c..0000000000000000000000000000000000000000
--- a/pipeline/nltk/VERSION
+++ /dev/null
@@ -1 +0,0 @@
-3.8.1
diff --git
a/pipeline/nltk/__init__.py b/pipeline/nltk/__init__.py deleted file mode 100644 index b87cf230510581745ced457e373a7ecc7c3c9006..0000000000000000000000000000000000000000 --- a/pipeline/nltk/__init__.py +++ /dev/null @@ -1,209 +0,0 @@ -# Natural Language Toolkit (NLTK) -# -# Copyright (C) 2001-2023 NLTK Project -# Authors: Steven Bird -# Edward Loper -# URL: -# For license information, see LICENSE.TXT - -""" -The Natural Language Toolkit (NLTK) is an open source Python library -for Natural Language Processing. A free online book is available. -(If you use the library for academic research, please cite the book.) - -Steven Bird, Ewan Klein, and Edward Loper (2009). -Natural Language Processing with Python. O'Reilly Media Inc. -https://www.nltk.org/book/ - -isort:skip_file -""" - -import os - -# ////////////////////////////////////////////////////// -# Metadata -# ////////////////////////////////////////////////////// - -# Version. For each new release, the version number should be updated -# in the file VERSION. -try: - # If a VERSION file exists, use it! - version_file = os.path.join(os.path.dirname(__file__), "VERSION") - with open(version_file) as infile: - __version__ = infile.read().strip() -except NameError: - __version__ = "unknown (running code interactively?)" -except OSError as ex: - __version__ = "unknown (%s)" % ex - -if __doc__ is not None: # fix for the ``python -OO`` - __doc__ += "\n@version: " + __version__ - - -# Copyright notice -__copyright__ = """\ -Copyright (C) 2001-2023 NLTK Project. - -Distributed and Licensed under the Apache License, Version 2.0, -which is included by reference. -""" - -__license__ = "Apache License, Version 2.0" -# Description of the toolkit, keywords, and the project's primary URL. -__longdescr__ = """\ -The Natural Language Toolkit (NLTK) is a Python package for -natural language processing. NLTK requires Python 3.7, 3.8, 3.9, 3.10 or 3.11.""" -__keywords__ = [ - "NLP", - "CL", - "natural language processing", - "computational linguistics", - "parsing", - "tagging", - "tokenizing", - "syntax", - "linguistics", - "language", - "natural language", - "text analytics", -] -__url__ = "https://www.nltk.org/" - -# Maintainer, contributors, etc. -__maintainer__ = "NLTK Team" -__maintainer_email__ = "nltk.team@gmail.com" -__author__ = __maintainer__ -__author_email__ = __maintainer_email__ - -# "Trove" classifiers for Python Package Index. 
-__classifiers__ = [ - "Development Status :: 5 - Production/Stable", - "Intended Audience :: Developers", - "Intended Audience :: Education", - "Intended Audience :: Information Technology", - "Intended Audience :: Science/Research", - "License :: OSI Approved :: Apache Software License", - "Operating System :: OS Independent", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Topic :: Scientific/Engineering", - "Topic :: Scientific/Engineering :: Artificial Intelligence", - "Topic :: Scientific/Engineering :: Human Machine Interfaces", - "Topic :: Scientific/Engineering :: Information Analysis", - "Topic :: Text Processing", - "Topic :: Text Processing :: Filters", - "Topic :: Text Processing :: General", - "Topic :: Text Processing :: Indexing", - "Topic :: Text Processing :: Linguistic", -] - -from nltk.internals import config_java - -# support numpy from pypy -try: - import numpypy -except ImportError: - pass - -# Override missing methods on environments where it cannot be used like GAE. -import subprocess - -if not hasattr(subprocess, "PIPE"): - - def _fake_PIPE(*args, **kwargs): - raise NotImplementedError("subprocess.PIPE is not supported.") - - subprocess.PIPE = _fake_PIPE -if not hasattr(subprocess, "Popen"): - - def _fake_Popen(*args, **kwargs): - raise NotImplementedError("subprocess.Popen is not supported.") - - subprocess.Popen = _fake_Popen - -########################################################### -# TOP-LEVEL MODULES -########################################################### - -# Import top-level functionality into top-level namespace - -from nltk.collocations import * -from nltk.decorators import decorator, memoize -from nltk.featstruct import * -from nltk.grammar import * -from nltk.probability import * -from nltk.text import * -from nltk.util import * -from nltk.jsontags import * - -########################################################### -# PACKAGES -########################################################### - -from nltk.chunk import * -from nltk.classify import * -from nltk.inference import * -from nltk.metrics import * -from nltk.parse import * -from nltk.tag import * -from nltk.tokenize import * -from nltk.translate import * -from nltk.tree import * -from nltk.sem import * -from nltk.stem import * - -# Packages which can be lazily imported -# (a) we don't import * -# (b) they're slow to import or have run-time dependencies -# that can safely fail at run time - -from nltk import lazyimport - -app = lazyimport.LazyModule("app", locals(), globals()) -chat = lazyimport.LazyModule("chat", locals(), globals()) -corpus = lazyimport.LazyModule("corpus", locals(), globals()) -draw = lazyimport.LazyModule("draw", locals(), globals()) -toolbox = lazyimport.LazyModule("toolbox", locals(), globals()) - -# Optional loading - -try: - import numpy -except ImportError: - pass -else: - from nltk import cluster - -from nltk.downloader import download, download_shell - -try: - import tkinter -except ImportError: - pass -else: - try: - from nltk.downloader import download_gui - except RuntimeError as e: - import warnings - - warnings.warn( - "Corpus downloader GUI not loaded " - "(RuntimeError during import: %s)" % str(e) - ) - -# explicitly import all top-level modules (ensuring -# they override the same names inadvertently imported -# from a subpackage) - -from nltk import ccg, chunk, classify, collocations 
-from nltk import data, featstruct, grammar, help, inference, metrics -from nltk import misc, parse, probability, sem, stem, wsd -from nltk import tag, tbl, text, tokenize, translate, tree, util - - -# FIXME: override any accidentally imported demo, see https://github.com/nltk/nltk/issues/2116 -def demo(): - print("To run the demo code for a module, type nltk.module.demo()") diff --git a/pipeline/nltk/__pycache__/__init__.cpython-39.pyc b/pipeline/nltk/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index de956bcbcdf0f164168b5bbb2eb47143488bf9d7..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/__pycache__/book.cpython-39.pyc b/pipeline/nltk/__pycache__/book.cpython-39.pyc deleted file mode 100644 index edd09efba1302cf20a6997d460b9cf18150b0ec3..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/__pycache__/book.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/__pycache__/cli.cpython-39.pyc b/pipeline/nltk/__pycache__/cli.cpython-39.pyc deleted file mode 100644 index 1521328b326ec20aac1c0e066e60a48086d9c148..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/__pycache__/cli.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/__pycache__/collections.cpython-39.pyc b/pipeline/nltk/__pycache__/collections.cpython-39.pyc deleted file mode 100644 index d8f60fd6df814ff925b955e9cc1d9aca9d14dbf7..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/__pycache__/collections.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/__pycache__/collocations.cpython-39.pyc b/pipeline/nltk/__pycache__/collocations.cpython-39.pyc deleted file mode 100644 index 37c342e423f94e6aa374e93c2f714fdf13a59ee3..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/__pycache__/collocations.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/__pycache__/compat.cpython-39.pyc b/pipeline/nltk/__pycache__/compat.cpython-39.pyc deleted file mode 100644 index 76c1d828dd7bf2105735affe0a55b46dcd63384c..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/__pycache__/compat.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/__pycache__/data.cpython-39.pyc b/pipeline/nltk/__pycache__/data.cpython-39.pyc deleted file mode 100644 index 74e1d534e0ba0f141e6c3cf35f38177dc5040fb7..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/__pycache__/data.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/__pycache__/decorators.cpython-39.pyc b/pipeline/nltk/__pycache__/decorators.cpython-39.pyc deleted file mode 100644 index 4df156d13faa4c4169d6b8d1b6967069e542d9bd..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/__pycache__/decorators.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/__pycache__/downloader.cpython-39.pyc b/pipeline/nltk/__pycache__/downloader.cpython-39.pyc deleted file mode 100644 index a09d2a7a24c9ff48d5d98680265bd47974f2c8a4..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/__pycache__/downloader.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/__pycache__/featstruct.cpython-39.pyc b/pipeline/nltk/__pycache__/featstruct.cpython-39.pyc deleted file mode 100644 index 4ccf00e816b7be17a011548be39f38d6405e782e..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/__pycache__/featstruct.cpython-39.pyc and /dev/null differ diff --git 
a/pipeline/nltk/__pycache__/grammar.cpython-39.pyc b/pipeline/nltk/__pycache__/grammar.cpython-39.pyc deleted file mode 100644 index 874ef45adb4899837792e5c3627f95e46749abcb..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/__pycache__/grammar.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/__pycache__/help.cpython-39.pyc b/pipeline/nltk/__pycache__/help.cpython-39.pyc deleted file mode 100644 index 7039cf6c09b1ba49ca4935895ab85cd5f945843d..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/__pycache__/help.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/__pycache__/internals.cpython-39.pyc b/pipeline/nltk/__pycache__/internals.cpython-39.pyc deleted file mode 100644 index 8e4053befc0ea8a9bd40fe102045a8af230d36d6..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/__pycache__/internals.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/__pycache__/jsontags.cpython-39.pyc b/pipeline/nltk/__pycache__/jsontags.cpython-39.pyc deleted file mode 100644 index 4f29d6d6d2a1545384a3a79fd09f5d7516582c2b..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/__pycache__/jsontags.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/__pycache__/langnames.cpython-39.pyc b/pipeline/nltk/__pycache__/langnames.cpython-39.pyc deleted file mode 100644 index 7ab9dac874051171f1c1ddbe6d39436f081d3495..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/__pycache__/langnames.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/__pycache__/lazyimport.cpython-39.pyc b/pipeline/nltk/__pycache__/lazyimport.cpython-39.pyc deleted file mode 100644 index 477e58507c4e8505bea4151b0865dd06dee6e0e0..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/__pycache__/lazyimport.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/__pycache__/probability.cpython-39.pyc b/pipeline/nltk/__pycache__/probability.cpython-39.pyc deleted file mode 100644 index d6be8ff9529fd8ac708561fd3d3ee9e8d71f5782..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/__pycache__/probability.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/__pycache__/text.cpython-39.pyc b/pipeline/nltk/__pycache__/text.cpython-39.pyc deleted file mode 100644 index 58072be6b1b94d327366d721aba2bfb379759ce3..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/__pycache__/text.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/__pycache__/tgrep.cpython-39.pyc b/pipeline/nltk/__pycache__/tgrep.cpython-39.pyc deleted file mode 100644 index 53573c35f7101ce0ecfc43dc6d2e6d510e56b95b..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/__pycache__/tgrep.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/__pycache__/toolbox.cpython-39.pyc b/pipeline/nltk/__pycache__/toolbox.cpython-39.pyc deleted file mode 100644 index 5f1c710bc8f5d450f852389a76b80df0ca7e41b6..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/__pycache__/toolbox.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/__pycache__/treeprettyprinter.cpython-39.pyc b/pipeline/nltk/__pycache__/treeprettyprinter.cpython-39.pyc deleted file mode 100644 index 0e8902680ecd25298e029323bc622aa8d07e7769..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/__pycache__/treeprettyprinter.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/__pycache__/treetransforms.cpython-39.pyc 
b/pipeline/nltk/__pycache__/treetransforms.cpython-39.pyc deleted file mode 100644 index 810863f6f92785cf6f43494166c7ea219dba3779..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/__pycache__/treetransforms.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/__pycache__/util.cpython-39.pyc b/pipeline/nltk/__pycache__/util.cpython-39.pyc deleted file mode 100644 index 7d3db1fdf11ff182101475f5f994168ccaaa48e5..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/__pycache__/util.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/__pycache__/wsd.cpython-39.pyc b/pipeline/nltk/__pycache__/wsd.cpython-39.pyc deleted file mode 100644 index 79248e70d024c0ae4d27e7ede03c4faa61713d41..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/__pycache__/wsd.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/app/__init__.py b/pipeline/nltk/app/__init__.py deleted file mode 100644 index d4bbf1831e714c40514313293ae9027e181b8a77..0000000000000000000000000000000000000000 --- a/pipeline/nltk/app/__init__.py +++ /dev/null @@ -1,47 +0,0 @@ -# Natural Language Toolkit: Applications package -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird -# URL: -# For license information, see LICENSE.TXT - -""" -Interactive NLTK Applications: - -chartparser: Chart Parser -chunkparser: Regular-Expression Chunk Parser -collocations: Find collocations in text -concordance: Part-of-speech concordancer -nemo: Finding (and Replacing) Nemo regular expression tool -rdparser: Recursive Descent Parser -srparser: Shift-Reduce Parser -wordnet: WordNet Browser -""" - - -# Import Tkinter-based modules if Tkinter is installed -try: - import tkinter -except ImportError: - import warnings - - warnings.warn("nltk.app package not loaded (please install Tkinter library).") -else: - from nltk.app.chartparser_app import app as chartparser - from nltk.app.chunkparser_app import app as chunkparser - from nltk.app.collocations_app import app as collocations - from nltk.app.concordance_app import app as concordance - from nltk.app.nemo_app import app as nemo - from nltk.app.rdparser_app import app as rdparser - from nltk.app.srparser_app import app as srparser - from nltk.app.wordnet_app import app as wordnet - - try: - from matplotlib import pylab - except ImportError: - import warnings - - warnings.warn("nltk.app.wordfreq not loaded (requires the matplotlib library).") - else: - from nltk.app.wordfreq_app import app as wordfreq diff --git a/pipeline/nltk/app/__pycache__/__init__.cpython-39.pyc b/pipeline/nltk/app/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index ed06bb0113cb6e3da56a9446ea676b1eacc9a017..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/app/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/app/__pycache__/chartparser_app.cpython-39.pyc b/pipeline/nltk/app/__pycache__/chartparser_app.cpython-39.pyc deleted file mode 100644 index b501eaecb92d1f99989158f4bfb5563bd17add38..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/app/__pycache__/chartparser_app.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/app/__pycache__/chunkparser_app.cpython-39.pyc b/pipeline/nltk/app/__pycache__/chunkparser_app.cpython-39.pyc deleted file mode 100644 index 124a74a137bbc44ce4409a289db91db28a92e3eb..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/app/__pycache__/chunkparser_app.cpython-39.pyc and /dev/null differ 
diff --git a/pipeline/nltk/app/__pycache__/collocations_app.cpython-39.pyc b/pipeline/nltk/app/__pycache__/collocations_app.cpython-39.pyc deleted file mode 100644 index cbe4281857306aab434cb9f122ca5cd3790e0079..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/app/__pycache__/collocations_app.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/app/__pycache__/concordance_app.cpython-39.pyc b/pipeline/nltk/app/__pycache__/concordance_app.cpython-39.pyc deleted file mode 100644 index d48d6c40120613649447199c7a21f78b260da3a5..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/app/__pycache__/concordance_app.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/app/__pycache__/nemo_app.cpython-39.pyc b/pipeline/nltk/app/__pycache__/nemo_app.cpython-39.pyc deleted file mode 100644 index 6d5b21cff8273d1899852a53aa4afd148efcd205..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/app/__pycache__/nemo_app.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/app/__pycache__/rdparser_app.cpython-39.pyc b/pipeline/nltk/app/__pycache__/rdparser_app.cpython-39.pyc deleted file mode 100644 index d00a24fe7190ca552292ea9d340e22cdeed3231a..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/app/__pycache__/rdparser_app.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/app/__pycache__/srparser_app.cpython-39.pyc b/pipeline/nltk/app/__pycache__/srparser_app.cpython-39.pyc deleted file mode 100644 index c3ed547355ed30214d6bc67578a251d9a6cc90aa..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/app/__pycache__/srparser_app.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/app/__pycache__/wordfreq_app.cpython-39.pyc b/pipeline/nltk/app/__pycache__/wordfreq_app.cpython-39.pyc deleted file mode 100644 index 1052c7a3893fccc16a0ccf13a54dcca0ac154017..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/app/__pycache__/wordfreq_app.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/app/__pycache__/wordnet_app.cpython-39.pyc b/pipeline/nltk/app/__pycache__/wordnet_app.cpython-39.pyc deleted file mode 100644 index 58bed3a5cd3b4910fb116cee25307eb73a8a0f14..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/app/__pycache__/wordnet_app.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/app/chartparser_app.py b/pipeline/nltk/app/chartparser_app.py deleted file mode 100644 index 53a938c642c6dcfe23fc085205cac3a541821207..0000000000000000000000000000000000000000 --- a/pipeline/nltk/app/chartparser_app.py +++ /dev/null @@ -1,2569 +0,0 @@ -# Natural Language Toolkit: Chart Parser Application -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Jean Mark Gawron -# Steven Bird -# URL: -# For license information, see LICENSE.TXT - -""" -A graphical tool for exploring chart parsing. - -Chart parsing is a flexible parsing algorithm that uses a data -structure called a "chart" to record hypotheses about syntactic -constituents. Each hypothesis is represented by a single "edge" on -the chart. A set of "chart rules" determine when new edges can be -added to the chart. This set of rules controls the overall behavior -of the parser (e.g. whether it parses top-down or bottom-up). - -The chart parsing tool demonstrates the process of parsing a single -sentence, with a given grammar and lexicon. 
Its display is divided -into three sections: the bottom section displays the chart; the middle -section displays the sentence; and the top section displays the -partial syntax tree corresponding to the selected edge. Buttons along -the bottom of the window are used to control the execution of the -algorithm. - -The chart parsing tool allows for flexible control of the parsing -algorithm. At each step of the algorithm, you can select which rule -or strategy you wish to apply. This allows you to experiment with -mixing different strategies (e.g. top-down and bottom-up). You can -exercise fine-grained control over the algorithm by selecting which -edge you wish to apply a rule to. -""" - -# At some point, we should rewrite this tool to use the new canvas -# widget system. - - -import os.path -import pickle -from tkinter import ( - Button, - Canvas, - Checkbutton, - Frame, - IntVar, - Label, - Menu, - Scrollbar, - Tk, - Toplevel, -) -from tkinter.filedialog import askopenfilename, asksaveasfilename -from tkinter.font import Font -from tkinter.messagebox import showerror, showinfo - -from nltk.draw import CFGEditor, TreeSegmentWidget, tree_to_treesegment -from nltk.draw.util import ( - CanvasFrame, - ColorizedList, - EntryDialog, - MutableOptionMenu, - ShowText, - SymbolWidget, -) -from nltk.grammar import CFG, Nonterminal -from nltk.parse.chart import ( - BottomUpPredictCombineRule, - BottomUpPredictRule, - Chart, - LeafEdge, - LeafInitRule, - SingleEdgeFundamentalRule, - SteppingChartParser, - TopDownInitRule, - TopDownPredictRule, - TreeEdge, -) -from nltk.tree import Tree -from nltk.util import in_idle - -# Known bug: ChartView doesn't handle edges generated by epsilon -# productions (e.g., [Production: PP -> ]) very well. - -####################################################################### -# Edge List -####################################################################### - - -class EdgeList(ColorizedList): - ARROW = SymbolWidget.SYMBOLS["rightarrow"] - - def _init_colortags(self, textwidget, options): - textwidget.tag_config("terminal", foreground="#006000") - textwidget.tag_config("arrow", font="symbol", underline="0") - textwidget.tag_config("dot", foreground="#000000") - textwidget.tag_config( - "nonterminal", foreground="blue", font=("helvetica", -12, "bold") - ) - - def _item_repr(self, item): - contents = [] - contents.append(("%s\t" % item.lhs(), "nonterminal")) - contents.append((self.ARROW, "arrow")) - for i, elt in enumerate(item.rhs()): - if i == item.dot(): - contents.append((" *", "dot")) - if isinstance(elt, Nonterminal): - contents.append((" %s" % elt.symbol(), "nonterminal")) - else: - contents.append((" %r" % elt, "terminal")) - if item.is_complete(): - contents.append((" *", "dot")) - return contents - - -####################################################################### -# Chart Matrix View -####################################################################### - - -class ChartMatrixView: - """ - A view of a chart that displays the contents of the corresponding matrix. 
- """ - - def __init__( - self, parent, chart, toplevel=True, title="Chart Matrix", show_numedges=False - ): - self._chart = chart - self._cells = [] - self._marks = [] - - self._selected_cell = None - - if toplevel: - self._root = Toplevel(parent) - self._root.title(title) - self._root.bind("", self.destroy) - self._init_quit(self._root) - else: - self._root = Frame(parent) - - self._init_matrix(self._root) - self._init_list(self._root) - if show_numedges: - self._init_numedges(self._root) - else: - self._numedges_label = None - - self._callbacks = {} - - self._num_edges = 0 - - self.draw() - - def _init_quit(self, root): - quit = Button(root, text="Quit", command=self.destroy) - quit.pack(side="bottom", expand=0, fill="none") - - def _init_matrix(self, root): - cframe = Frame(root, border=2, relief="sunken") - cframe.pack(expand=0, fill="none", padx=1, pady=3, side="top") - self._canvas = Canvas(cframe, width=200, height=200, background="white") - self._canvas.pack(expand=0, fill="none") - - def _init_numedges(self, root): - self._numedges_label = Label(root, text="0 edges") - self._numedges_label.pack(expand=0, fill="none", side="top") - - def _init_list(self, root): - self._list = EdgeList(root, [], width=20, height=5) - self._list.pack(side="top", expand=1, fill="both", pady=3) - - def cb(edge, self=self): - self._fire_callbacks("select", edge) - - self._list.add_callback("select", cb) - self._list.focus() - - def destroy(self, *e): - if self._root is None: - return - try: - self._root.destroy() - except: - pass - self._root = None - - def set_chart(self, chart): - if chart is not self._chart: - self._chart = chart - self._num_edges = 0 - self.draw() - - def update(self): - if self._root is None: - return - - # Count the edges in each cell - N = len(self._cells) - cell_edges = [[0 for i in range(N)] for j in range(N)] - for edge in self._chart: - cell_edges[edge.start()][edge.end()] += 1 - - # Color the cells correspondingly. - for i in range(N): - for j in range(i, N): - if cell_edges[i][j] == 0: - color = "gray20" - else: - color = "#00{:02x}{:02x}".format( - min(255, 50 + 128 * cell_edges[i][j] / 10), - max(0, 128 - 128 * cell_edges[i][j] / 10), - ) - cell_tag = self._cells[i][j] - self._canvas.itemconfig(cell_tag, fill=color) - if (i, j) == self._selected_cell: - self._canvas.itemconfig(cell_tag, outline="#00ffff", width=3) - self._canvas.tag_raise(cell_tag) - else: - self._canvas.itemconfig(cell_tag, outline="black", width=1) - - # Update the edge list. - edges = list(self._chart.select(span=self._selected_cell)) - self._list.set(edges) - - # Update our edge count. - self._num_edges = self._chart.num_edges() - if self._numedges_label is not None: - self._numedges_label["text"] = "%d edges" % self._num_edges - - def activate(self): - self._canvas.itemconfig("inactivebox", state="hidden") - self.update() - - def inactivate(self): - self._canvas.itemconfig("inactivebox", state="normal") - self.update() - - def add_callback(self, event, func): - self._callbacks.setdefault(event, {})[func] = 1 - - def remove_callback(self, event, func=None): - if func is None: - del self._callbacks[event] - else: - try: - del self._callbacks[event][func] - except: - pass - - def _fire_callbacks(self, event, *args): - if event not in self._callbacks: - return - for cb_func in list(self._callbacks[event].keys()): - cb_func(*args) - - def select_cell(self, i, j): - if self._root is None: - return - - # If the cell is already selected (and the chart contents - # haven't changed), then do nothing. 
- if (i, j) == self._selected_cell and self._chart.num_edges() == self._num_edges: - return - - self._selected_cell = (i, j) - self.update() - - # Fire the callback. - self._fire_callbacks("select_cell", i, j) - - def deselect_cell(self): - if self._root is None: - return - self._selected_cell = None - self._list.set([]) - self.update() - - def _click_cell(self, i, j): - if self._selected_cell == (i, j): - self.deselect_cell() - else: - self.select_cell(i, j) - - def view_edge(self, edge): - self.select_cell(*edge.span()) - self._list.view(edge) - - def mark_edge(self, edge): - if self._root is None: - return - self.select_cell(*edge.span()) - self._list.mark(edge) - - def unmark_edge(self, edge=None): - if self._root is None: - return - self._list.unmark(edge) - - def markonly_edge(self, edge): - if self._root is None: - return - self.select_cell(*edge.span()) - self._list.markonly(edge) - - def draw(self): - if self._root is None: - return - LEFT_MARGIN = BOT_MARGIN = 15 - TOP_MARGIN = 5 - c = self._canvas - c.delete("all") - N = self._chart.num_leaves() + 1 - dx = (int(c["width"]) - LEFT_MARGIN) / N - dy = (int(c["height"]) - TOP_MARGIN - BOT_MARGIN) / N - - c.delete("all") - - # Labels and dotted lines - for i in range(N): - c.create_text( - LEFT_MARGIN - 2, i * dy + dy / 2 + TOP_MARGIN, text=repr(i), anchor="e" - ) - c.create_text( - i * dx + dx / 2 + LEFT_MARGIN, - N * dy + TOP_MARGIN + 1, - text=repr(i), - anchor="n", - ) - c.create_line( - LEFT_MARGIN, - dy * (i + 1) + TOP_MARGIN, - dx * N + LEFT_MARGIN, - dy * (i + 1) + TOP_MARGIN, - dash=".", - ) - c.create_line( - dx * i + LEFT_MARGIN, - TOP_MARGIN, - dx * i + LEFT_MARGIN, - dy * N + TOP_MARGIN, - dash=".", - ) - - # A box around the whole thing - c.create_rectangle( - LEFT_MARGIN, TOP_MARGIN, LEFT_MARGIN + dx * N, dy * N + TOP_MARGIN, width=2 - ) - - # Cells - self._cells = [[None for i in range(N)] for j in range(N)] - for i in range(N): - for j in range(i, N): - t = c.create_rectangle( - j * dx + LEFT_MARGIN, - i * dy + TOP_MARGIN, - (j + 1) * dx + LEFT_MARGIN, - (i + 1) * dy + TOP_MARGIN, - fill="gray20", - ) - self._cells[i][j] = t - - def cb(event, self=self, i=i, j=j): - self._click_cell(i, j) - - c.tag_bind(t, "", cb) - - # Inactive box - xmax, ymax = int(c["width"]), int(c["height"]) - t = c.create_rectangle( - -100, - -100, - xmax + 100, - ymax + 100, - fill="gray50", - state="hidden", - tag="inactivebox", - ) - c.tag_lower(t) - - # Update the cells. - self.update() - - def pack(self, *args, **kwargs): - self._root.pack(*args, **kwargs) - - -####################################################################### -# Chart Results View -####################################################################### - - -class ChartResultsView: - def __init__(self, parent, chart, grammar, toplevel=True): - self._chart = chart - self._grammar = grammar - self._trees = [] - self._y = 10 - self._treewidgets = [] - self._selection = None - self._selectbox = None - - if toplevel: - self._root = Toplevel(parent) - self._root.title("Chart Parser Application: Results") - self._root.bind("", self.destroy) - else: - self._root = Frame(parent) - - # Buttons - if toplevel: - buttons = Frame(self._root) - buttons.pack(side="bottom", expand=0, fill="x") - Button(buttons, text="Quit", command=self.destroy).pack(side="right") - Button(buttons, text="Print All", command=self.print_all).pack(side="left") - Button(buttons, text="Print Selection", command=self.print_selection).pack( - side="left" - ) - - # Canvas frame. 
- self._cframe = CanvasFrame(self._root, closeenough=20) - self._cframe.pack(side="top", expand=1, fill="both") - - # Initial update - self.update() - - def update(self, edge=None): - if self._root is None: - return - # If the edge isn't a parse edge, do nothing. - if edge is not None: - if edge.lhs() != self._grammar.start(): - return - if edge.span() != (0, self._chart.num_leaves()): - return - - for parse in self._chart.parses(self._grammar.start()): - if parse not in self._trees: - self._add(parse) - - def _add(self, parse): - # Add it to self._trees. - self._trees.append(parse) - - # Create a widget for it. - c = self._cframe.canvas() - treewidget = tree_to_treesegment(c, parse) - - # Add it to the canvas frame. - self._treewidgets.append(treewidget) - self._cframe.add_widget(treewidget, 10, self._y) - - # Register callbacks. - treewidget.bind_click(self._click) - - # Update y. - self._y = treewidget.bbox()[3] + 10 - - def _click(self, widget): - c = self._cframe.canvas() - if self._selection is not None: - c.delete(self._selectbox) - self._selection = widget - (x1, y1, x2, y2) = widget.bbox() - self._selectbox = c.create_rectangle(x1, y1, x2, y2, width=2, outline="#088") - - def _color(self, treewidget, color): - treewidget.label()["color"] = color - for child in treewidget.subtrees(): - if isinstance(child, TreeSegmentWidget): - self._color(child, color) - else: - child["color"] = color - - def print_all(self, *e): - if self._root is None: - return - self._cframe.print_to_file() - - def print_selection(self, *e): - if self._root is None: - return - if self._selection is None: - showerror("Print Error", "No tree selected") - else: - c = self._cframe.canvas() - for widget in self._treewidgets: - if widget is not self._selection: - self._cframe.destroy_widget(widget) - c.delete(self._selectbox) - (x1, y1, x2, y2) = self._selection.bbox() - self._selection.move(10 - x1, 10 - y1) - c["scrollregion"] = f"0 0 {x2 - x1 + 20} {y2 - y1 + 20}" - self._cframe.print_to_file() - - # Restore our state. - self._treewidgets = [self._selection] - self.clear() - self.update() - - def clear(self): - if self._root is None: - return - for treewidget in self._treewidgets: - self._cframe.destroy_widget(treewidget) - self._trees = [] - self._treewidgets = [] - if self._selection is not None: - self._cframe.canvas().delete(self._selectbox) - self._selection = None - self._y = 10 - - def set_chart(self, chart): - self.clear() - self._chart = chart - self.update() - - def set_grammar(self, grammar): - self.clear() - self._grammar = grammar - self.update() - - def destroy(self, *e): - if self._root is None: - return - try: - self._root.destroy() - except: - pass - self._root = None - - def pack(self, *args, **kwargs): - self._root.pack(*args, **kwargs) - - -####################################################################### -# Chart Comparer -####################################################################### - - -class ChartComparer: - """ - - :ivar _root: The root window - - :ivar _charts: A dictionary mapping names to charts. When - charts are loaded, they are added to this dictionary. - - :ivar _left_chart: The left ``Chart``. - :ivar _left_name: The name ``_left_chart`` (derived from filename) - :ivar _left_matrix: The ``ChartMatrixView`` for ``_left_chart`` - :ivar _left_selector: The drop-down ``MutableOptionsMenu`` used - to select ``_left_chart``. - - :ivar _right_chart: The right ``Chart``. 
- :ivar _right_name: The name ``_right_chart`` (derived from filename) - :ivar _right_matrix: The ``ChartMatrixView`` for ``_right_chart`` - :ivar _right_selector: The drop-down ``MutableOptionsMenu`` used - to select ``_right_chart``. - - :ivar _out_chart: The out ``Chart``. - :ivar _out_name: The name ``_out_chart`` (derived from filename) - :ivar _out_matrix: The ``ChartMatrixView`` for ``_out_chart`` - :ivar _out_label: The label for ``_out_chart``. - - :ivar _op_label: A Label containing the most recent operation. - """ - - _OPSYMBOL = { - "-": "-", - "and": SymbolWidget.SYMBOLS["intersection"], - "or": SymbolWidget.SYMBOLS["union"], - } - - def __init__(self, *chart_filenames): - # This chart is displayed when we don't have a value (eg - # before any chart is loaded). - faketok = [""] * 8 - self._emptychart = Chart(faketok) - - # The left & right charts start out empty. - self._left_name = "None" - self._right_name = "None" - self._left_chart = self._emptychart - self._right_chart = self._emptychart - - # The charts that have been loaded. - self._charts = {"None": self._emptychart} - - # The output chart. - self._out_chart = self._emptychart - - # The most recent operation - self._operator = None - - # Set up the root window. - self._root = Tk() - self._root.title("Chart Comparison") - self._root.bind("", self.destroy) - self._root.bind("", self.destroy) - - # Initialize all widgets, etc. - self._init_menubar(self._root) - self._init_chartviews(self._root) - self._init_divider(self._root) - self._init_buttons(self._root) - self._init_bindings(self._root) - - # Load any specified charts. - for filename in chart_filenames: - self.load_chart(filename) - - def destroy(self, *e): - if self._root is None: - return - try: - self._root.destroy() - except: - pass - self._root = None - - def mainloop(self, *args, **kwargs): - return - self._root.mainloop(*args, **kwargs) - - # //////////////////////////////////////////////////////////// - # Initialization - # //////////////////////////////////////////////////////////// - - def _init_menubar(self, root): - menubar = Menu(root) - - # File menu - filemenu = Menu(menubar, tearoff=0) - filemenu.add_command( - label="Load Chart", - accelerator="Ctrl-o", - underline=0, - command=self.load_chart_dialog, - ) - filemenu.add_command( - label="Save Output", - accelerator="Ctrl-s", - underline=0, - command=self.save_chart_dialog, - ) - filemenu.add_separator() - filemenu.add_command( - label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x" - ) - menubar.add_cascade(label="File", underline=0, menu=filemenu) - - # Compare menu - opmenu = Menu(menubar, tearoff=0) - opmenu.add_command( - label="Intersection", command=self._intersection, accelerator="+" - ) - opmenu.add_command(label="Union", command=self._union, accelerator="*") - opmenu.add_command( - label="Difference", command=self._difference, accelerator="-" - ) - opmenu.add_separator() - opmenu.add_command(label="Swap Charts", command=self._swapcharts) - menubar.add_cascade(label="Compare", underline=0, menu=opmenu) - - # Add the menu - self._root.config(menu=menubar) - - def _init_divider(self, root): - divider = Frame(root, border=2, relief="sunken") - divider.pack(side="top", fill="x", ipady=2) - - def _init_chartviews(self, root): - opfont = ("symbol", -36) # Font for operator. - eqfont = ("helvetica", -36) # Font for equals sign. - - frame = Frame(root, background="#c0c0c0") - frame.pack(side="top", expand=1, fill="both") - - # The left matrix. 
- cv1_frame = Frame(frame, border=3, relief="groove") - cv1_frame.pack(side="left", padx=8, pady=7, expand=1, fill="both") - self._left_selector = MutableOptionMenu( - cv1_frame, list(self._charts.keys()), command=self._select_left - ) - self._left_selector.pack(side="top", pady=5, fill="x") - self._left_matrix = ChartMatrixView( - cv1_frame, self._emptychart, toplevel=False, show_numedges=True - ) - self._left_matrix.pack(side="bottom", padx=5, pady=5, expand=1, fill="both") - self._left_matrix.add_callback("select", self.select_edge) - self._left_matrix.add_callback("select_cell", self.select_cell) - self._left_matrix.inactivate() - - # The operator. - self._op_label = Label( - frame, text=" ", width=3, background="#c0c0c0", font=opfont - ) - self._op_label.pack(side="left", padx=5, pady=5) - - # The right matrix. - cv2_frame = Frame(frame, border=3, relief="groove") - cv2_frame.pack(side="left", padx=8, pady=7, expand=1, fill="both") - self._right_selector = MutableOptionMenu( - cv2_frame, list(self._charts.keys()), command=self._select_right - ) - self._right_selector.pack(side="top", pady=5, fill="x") - self._right_matrix = ChartMatrixView( - cv2_frame, self._emptychart, toplevel=False, show_numedges=True - ) - self._right_matrix.pack(side="bottom", padx=5, pady=5, expand=1, fill="both") - self._right_matrix.add_callback("select", self.select_edge) - self._right_matrix.add_callback("select_cell", self.select_cell) - self._right_matrix.inactivate() - - # The equals sign - Label(frame, text="=", width=3, background="#c0c0c0", font=eqfont).pack( - side="left", padx=5, pady=5 - ) - - # The output matrix. - out_frame = Frame(frame, border=3, relief="groove") - out_frame.pack(side="left", padx=8, pady=7, expand=1, fill="both") - self._out_label = Label(out_frame, text="Output") - self._out_label.pack(side="top", pady=9) - self._out_matrix = ChartMatrixView( - out_frame, self._emptychart, toplevel=False, show_numedges=True - ) - self._out_matrix.pack(side="bottom", padx=5, pady=5, expand=1, fill="both") - self._out_matrix.add_callback("select", self.select_edge) - self._out_matrix.add_callback("select_cell", self.select_cell) - self._out_matrix.inactivate() - - def _init_buttons(self, root): - buttons = Frame(root) - buttons.pack(side="bottom", pady=5, fill="x", expand=0) - Button(buttons, text="Intersection", command=self._intersection).pack( - side="left" - ) - Button(buttons, text="Union", command=self._union).pack(side="left") - Button(buttons, text="Difference", command=self._difference).pack(side="left") - Frame(buttons, width=20).pack(side="left") - Button(buttons, text="Swap Charts", command=self._swapcharts).pack(side="left") - - Button(buttons, text="Detach Output", command=self._detach_out).pack( - side="right" - ) - - def _init_bindings(self, root): - # root.bind('', self.save_chart) - root.bind("", self.load_chart_dialog) - # root.bind('', self.reset) - - # //////////////////////////////////////////////////////////// - # Input Handling - # //////////////////////////////////////////////////////////// - - def _select_left(self, name): - self._left_name = name - self._left_chart = self._charts[name] - self._left_matrix.set_chart(self._left_chart) - if name == "None": - self._left_matrix.inactivate() - self._apply_op() - - def _select_right(self, name): - self._right_name = name - self._right_chart = self._charts[name] - self._right_matrix.set_chart(self._right_chart) - if name == "None": - self._right_matrix.inactivate() - self._apply_op() - - def _apply_op(self): - if 
self._operator == "-": - self._difference() - elif self._operator == "or": - self._union() - elif self._operator == "and": - self._intersection() - - # //////////////////////////////////////////////////////////// - # File - # //////////////////////////////////////////////////////////// - CHART_FILE_TYPES = [("Pickle file", ".pickle"), ("All files", "*")] - - def save_chart_dialog(self, *args): - filename = asksaveasfilename( - filetypes=self.CHART_FILE_TYPES, defaultextension=".pickle" - ) - if not filename: - return - try: - with open(filename, "wb") as outfile: - pickle.dump(self._out_chart, outfile) - except Exception as e: - showerror("Error Saving Chart", f"Unable to open file: {filename!r}\n{e}") - - def load_chart_dialog(self, *args): - filename = askopenfilename( - filetypes=self.CHART_FILE_TYPES, defaultextension=".pickle" - ) - if not filename: - return - try: - self.load_chart(filename) - except Exception as e: - showerror("Error Loading Chart", f"Unable to open file: {filename!r}\n{e}") - - def load_chart(self, filename): - with open(filename, "rb") as infile: - chart = pickle.load(infile) - name = os.path.basename(filename) - if name.endswith(".pickle"): - name = name[:-7] - if name.endswith(".chart"): - name = name[:-6] - self._charts[name] = chart - self._left_selector.add(name) - self._right_selector.add(name) - - # If either left_matrix or right_matrix is empty, then - # display the new chart. - if self._left_chart is self._emptychart: - self._left_selector.set(name) - elif self._right_chart is self._emptychart: - self._right_selector.set(name) - - def _update_chartviews(self): - self._left_matrix.update() - self._right_matrix.update() - self._out_matrix.update() - - # //////////////////////////////////////////////////////////// - # Selection - # //////////////////////////////////////////////////////////// - - def select_edge(self, edge): - if edge in self._left_chart: - self._left_matrix.markonly_edge(edge) - else: - self._left_matrix.unmark_edge() - if edge in self._right_chart: - self._right_matrix.markonly_edge(edge) - else: - self._right_matrix.unmark_edge() - if edge in self._out_chart: - self._out_matrix.markonly_edge(edge) - else: - self._out_matrix.unmark_edge() - - def select_cell(self, i, j): - self._left_matrix.select_cell(i, j) - self._right_matrix.select_cell(i, j) - self._out_matrix.select_cell(i, j) - - # //////////////////////////////////////////////////////////// - # Operations - # //////////////////////////////////////////////////////////// - - def _difference(self): - if not self._checkcompat(): - return - - out_chart = Chart(self._left_chart.tokens()) - for edge in self._left_chart: - if edge not in self._right_chart: - out_chart.insert(edge, []) - - self._update("-", out_chart) - - def _intersection(self): - if not self._checkcompat(): - return - - out_chart = Chart(self._left_chart.tokens()) - for edge in self._left_chart: - if edge in self._right_chart: - out_chart.insert(edge, []) - - self._update("and", out_chart) - - def _union(self): - if not self._checkcompat(): - return - - out_chart = Chart(self._left_chart.tokens()) - for edge in self._left_chart: - out_chart.insert(edge, []) - for edge in self._right_chart: - out_chart.insert(edge, []) - - self._update("or", out_chart) - - def _swapcharts(self): - left, right = self._left_name, self._right_name - self._left_selector.set(right) - self._right_selector.set(left) - - def _checkcompat(self): - if ( - self._left_chart.tokens() != self._right_chart.tokens() - or self._left_chart.property_names() 
!= self._right_chart.property_names() - or self._left_chart == self._emptychart - or self._right_chart == self._emptychart - ): - # Clear & inactivate the output chart. - self._out_chart = self._emptychart - self._out_matrix.set_chart(self._out_chart) - self._out_matrix.inactivate() - self._out_label["text"] = "Output" - # Issue some other warning? - return False - else: - return True - - def _update(self, operator, out_chart): - self._operator = operator - self._op_label["text"] = self._OPSYMBOL[operator] - self._out_chart = out_chart - self._out_matrix.set_chart(out_chart) - self._out_label["text"] = "{} {} {}".format( - self._left_name, - self._operator, - self._right_name, - ) - - def _clear_out_chart(self): - self._out_chart = self._emptychart - self._out_matrix.set_chart(self._out_chart) - self._op_label["text"] = " " - self._out_matrix.inactivate() - - def _detach_out(self): - ChartMatrixView(self._root, self._out_chart, title=self._out_label["text"]) - - -####################################################################### -# Chart View -####################################################################### - - -class ChartView: - """ - A component for viewing charts. This is used by ``ChartParserApp`` to - allow students to interactively experiment with various chart - parsing techniques. It is also used by ``Chart.draw()``. - - :ivar _chart: The chart that we are giving a view of. This chart - may be modified; after it is modified, you should call - ``update``. - :ivar _sentence: The list of tokens that the chart spans. - - :ivar _root: The root window. - :ivar _chart_canvas: The canvas we're using to display the chart - itself. - :ivar _tree_canvas: The canvas we're using to display the tree - that each edge spans. May be None, if we're not displaying - trees. - :ivar _sentence_canvas: The canvas we're using to display the sentence - text. May be None, if we're not displaying the sentence text. - :ivar _edgetags: A dictionary mapping from edges to the tags of - the canvas elements (lines, etc) used to display that edge. - The values of this dictionary have the form - ``(linetag, rhstag1, dottag, rhstag2, lhstag)``. - :ivar _treetags: A list of all the tags that make up the tree; - used to erase the tree (without erasing the loclines). - :ivar _chart_height: The height of the chart canvas. - :ivar _sentence_height: The height of the sentence canvas. - :ivar _tree_height: The height of the tree - - :ivar _text_height: The height of a text string (in the normal - font). - - :ivar _edgelevels: A list of edges at each level of the chart (the - top level is the 0th element). This list is used to remember - where edges should be drawn; and to make sure that no edges - are overlapping on the chart view. - - :ivar _unitsize: Pixel size of one unit (from the location). This - is determined by the span of the chart's location, and the - width of the chart display canvas. - - :ivar _fontsize: The current font size - - :ivar _marks: A dictionary from edges to marks. Marks are - strings, specifying colors (e.g. 'green'). - """ - - _LEAF_SPACING = 10 - _MARGIN = 10 - _TREE_LEVEL_SIZE = 12 - _CHART_LEVEL_SIZE = 40 - - def __init__(self, chart, root=None, **kw): - """ - Construct a new ``Chart`` display. - """ - # Process keyword args. - draw_tree = kw.get("draw_tree", 0) - draw_sentence = kw.get("draw_sentence", 1) - self._fontsize = kw.get("fontsize", -12) - - # The chart! 
- self._chart = chart - - # Callback functions - self._callbacks = {} - - # Keep track of drawn edges - self._edgelevels = [] - self._edgetags = {} - - # Keep track of which edges are marked. - self._marks = {} - - # These are used to keep track of the set of tree tokens - # currently displayed in the tree canvas. - self._treetoks = [] - self._treetoks_edge = None - self._treetoks_index = 0 - - # Keep track of the tags used to draw the tree - self._tree_tags = [] - - # Put multiple edges on each level? - self._compact = 0 - - # If they didn't provide a main window, then set one up. - if root is None: - top = Tk() - top.title("Chart View") - - def destroy1(e, top=top): - top.destroy() - - def destroy2(top=top): - top.destroy() - - top.bind("q", destroy1) - b = Button(top, text="Done", command=destroy2) - b.pack(side="bottom") - self._root = top - else: - self._root = root - - # Create some fonts. - self._init_fonts(root) - - # Create the chart canvas. - (self._chart_sb, self._chart_canvas) = self._sb_canvas(self._root) - self._chart_canvas["height"] = 300 - self._chart_canvas["closeenough"] = 15 - - # Create the sentence canvas. - if draw_sentence: - cframe = Frame(self._root, relief="sunk", border=2) - cframe.pack(fill="both", side="bottom") - self._sentence_canvas = Canvas(cframe, height=50) - self._sentence_canvas["background"] = "#e0e0e0" - self._sentence_canvas.pack(fill="both") - # self._sentence_canvas['height'] = self._sentence_height - else: - self._sentence_canvas = None - - # Create the tree canvas. - if draw_tree: - (sb, canvas) = self._sb_canvas(self._root, "n", "x") - (self._tree_sb, self._tree_canvas) = (sb, canvas) - self._tree_canvas["height"] = 200 - else: - self._tree_canvas = None - - # Do some analysis to figure out how big the window should be - self._analyze() - self.draw() - self._resize() - self._grow() - - # Set up the configure callback, which will be called whenever - # the window is resized. - self._chart_canvas.bind("", self._configure) - - def _init_fonts(self, root): - self._boldfont = Font(family="helvetica", weight="bold", size=self._fontsize) - self._font = Font(family="helvetica", size=self._fontsize) - # See: - self._sysfont = Font(font=Button()["font"]) - root.option_add("*Font", self._sysfont) - - def _sb_canvas(self, root, expand="y", fill="both", side="bottom"): - """ - Helper for __init__: construct a canvas with a scrollbar. - """ - cframe = Frame(root, relief="sunk", border=2) - cframe.pack(fill=fill, expand=expand, side=side) - canvas = Canvas(cframe, background="#e0e0e0") - - # Give the canvas a scrollbar. - sb = Scrollbar(cframe, orient="vertical") - sb.pack(side="right", fill="y") - canvas.pack(side="left", fill=fill, expand="yes") - - # Connect the scrollbars to the canvas. - sb["command"] = canvas.yview - canvas["yscrollcommand"] = sb.set - - return (sb, canvas) - - def scroll_up(self, *e): - self._chart_canvas.yview("scroll", -1, "units") - - def scroll_down(self, *e): - self._chart_canvas.yview("scroll", 1, "units") - - def page_up(self, *e): - self._chart_canvas.yview("scroll", -1, "pages") - - def page_down(self, *e): - self._chart_canvas.yview("scroll", 1, "pages") - - def _grow(self): - """ - Grow the window, if necessary - """ - # Grow, if need-be - N = self._chart.num_leaves() - width = max( - int(self._chart_canvas["width"]), N * self._unitsize + ChartView._MARGIN * 2 - ) - - # It won't resize without the second (height) line, but I - # don't understand why not. 
- self._chart_canvas.configure(width=width) - self._chart_canvas.configure(height=self._chart_canvas["height"]) - - self._unitsize = (width - 2 * ChartView._MARGIN) / N - - # Reset the height for the sentence window. - if self._sentence_canvas is not None: - self._sentence_canvas["height"] = self._sentence_height - - def set_font_size(self, size): - self._font.configure(size=-abs(size)) - self._boldfont.configure(size=-abs(size)) - self._sysfont.configure(size=-abs(size)) - self._analyze() - self._grow() - self.draw() - - def get_font_size(self): - return abs(self._fontsize) - - def _configure(self, e): - """ - The configure callback. This is called whenever the window is - resized. It is also called when the window is first mapped. - It figures out the unit size, and redraws the contents of each - canvas. - """ - N = self._chart.num_leaves() - self._unitsize = (e.width - 2 * ChartView._MARGIN) / N - self.draw() - - def update(self, chart=None): - """ - Draw any edges that have not been drawn. This is typically - called when a after modifies the canvas that a CanvasView is - displaying. ``update`` will cause any edges that have been - added to the chart to be drawn. - - If update is given a ``chart`` argument, then it will replace - the current chart with the given chart. - """ - if chart is not None: - self._chart = chart - self._edgelevels = [] - self._marks = {} - self._analyze() - self._grow() - self.draw() - self.erase_tree() - self._resize() - else: - for edge in self._chart: - if edge not in self._edgetags: - self._add_edge(edge) - self._resize() - - def _edge_conflict(self, edge, lvl): - """ - Return True if the given edge overlaps with any edge on the given - level. This is used by _add_edge to figure out what level a - new edge should be added to. - """ - (s1, e1) = edge.span() - for otheredge in self._edgelevels[lvl]: - (s2, e2) = otheredge.span() - if (s1 <= s2 < e1) or (s2 <= s1 < e2) or (s1 == s2 == e1 == e2): - return True - return False - - def _analyze_edge(self, edge): - """ - Given a new edge, recalculate: - - - _text_height - - _unitsize (if the edge text is too big for the current - _unitsize, then increase _unitsize) - """ - c = self._chart_canvas - - if isinstance(edge, TreeEdge): - lhs = edge.lhs() - rhselts = [] - for elt in edge.rhs(): - if isinstance(elt, Nonterminal): - rhselts.append(str(elt.symbol())) - else: - rhselts.append(repr(elt)) - rhs = " ".join(rhselts) - else: - lhs = edge.lhs() - rhs = "" - - for s in (lhs, rhs): - tag = c.create_text( - 0, 0, text=s, font=self._boldfont, anchor="nw", justify="left" - ) - bbox = c.bbox(tag) - c.delete(tag) - width = bbox[2] # + ChartView._LEAF_SPACING - edgelen = max(edge.length(), 1) - self._unitsize = max(self._unitsize, width / edgelen) - self._text_height = max(self._text_height, bbox[3] - bbox[1]) - - def _add_edge(self, edge, minlvl=0): - """ - Add a single edge to the ChartView: - - - Call analyze_edge to recalculate display parameters - - Find an available level - - Call _draw_edge - """ - # Do NOT show leaf edges in the chart. - if isinstance(edge, LeafEdge): - return - - if edge in self._edgetags: - return - self._analyze_edge(edge) - self._grow() - - if not self._compact: - self._edgelevels.append([edge]) - lvl = len(self._edgelevels) - 1 - self._draw_edge(edge, lvl) - self._resize() - return - - # Figure out what level to draw the edge on. - lvl = 0 - while True: - # If this level doesn't exist yet, create it. 
- while lvl >= len(self._edgelevels): - self._edgelevels.append([]) - self._resize() - - # Check if we can fit the edge in this level. - if lvl >= minlvl and not self._edge_conflict(edge, lvl): - # Go ahead and draw it. - self._edgelevels[lvl].append(edge) - break - - # Try the next level. - lvl += 1 - - self._draw_edge(edge, lvl) - - def view_edge(self, edge): - level = None - for i in range(len(self._edgelevels)): - if edge in self._edgelevels[i]: - level = i - break - if level is None: - return - # Try to view the new edge.. - y = (level + 1) * self._chart_level_size - dy = self._text_height + 10 - self._chart_canvas.yview("moveto", 1.0) - if self._chart_height != 0: - self._chart_canvas.yview("moveto", (y - dy) / self._chart_height) - - def _draw_edge(self, edge, lvl): - """ - Draw a single edge on the ChartView. - """ - c = self._chart_canvas - - # Draw the arrow. - x1 = edge.start() * self._unitsize + ChartView._MARGIN - x2 = edge.end() * self._unitsize + ChartView._MARGIN - if x2 == x1: - x2 += max(4, self._unitsize / 5) - y = (lvl + 1) * self._chart_level_size - linetag = c.create_line(x1, y, x2, y, arrow="last", width=3) - - # Draw a label for the edge. - if isinstance(edge, TreeEdge): - rhs = [] - for elt in edge.rhs(): - if isinstance(elt, Nonterminal): - rhs.append(str(elt.symbol())) - else: - rhs.append(repr(elt)) - pos = edge.dot() - else: - rhs = [] - pos = 0 - - rhs1 = " ".join(rhs[:pos]) - rhs2 = " ".join(rhs[pos:]) - rhstag1 = c.create_text(x1 + 3, y, text=rhs1, font=self._font, anchor="nw") - dotx = c.bbox(rhstag1)[2] + 6 - doty = (c.bbox(rhstag1)[1] + c.bbox(rhstag1)[3]) / 2 - dottag = c.create_oval(dotx - 2, doty - 2, dotx + 2, doty + 2) - rhstag2 = c.create_text(dotx + 6, y, text=rhs2, font=self._font, anchor="nw") - lhstag = c.create_text( - (x1 + x2) / 2, y, text=str(edge.lhs()), anchor="s", font=self._boldfont - ) - - # Keep track of the edge's tags. - self._edgetags[edge] = (linetag, rhstag1, dottag, rhstag2, lhstag) - - # Register a callback for clicking on the edge. - def cb(event, self=self, edge=edge): - self._fire_callbacks("select", edge) - - c.tag_bind(rhstag1, "", cb) - c.tag_bind(rhstag2, "", cb) - c.tag_bind(linetag, "", cb) - c.tag_bind(dottag, "", cb) - c.tag_bind(lhstag, "", cb) - - self._color_edge(edge) - - def _color_edge(self, edge, linecolor=None, textcolor=None): - """ - Color in an edge with the given colors. - If no colors are specified, use intelligent defaults - (dependent on selection, etc.) 
- """ - if edge not in self._edgetags: - return - c = self._chart_canvas - - if linecolor is not None and textcolor is not None: - if edge in self._marks: - linecolor = self._marks[edge] - tags = self._edgetags[edge] - c.itemconfig(tags[0], fill=linecolor) - c.itemconfig(tags[1], fill=textcolor) - c.itemconfig(tags[2], fill=textcolor, outline=textcolor) - c.itemconfig(tags[3], fill=textcolor) - c.itemconfig(tags[4], fill=textcolor) - return - else: - N = self._chart.num_leaves() - if edge in self._marks: - self._color_edge(self._marks[edge]) - if edge.is_complete() and edge.span() == (0, N): - self._color_edge(edge, "#084", "#042") - elif isinstance(edge, LeafEdge): - self._color_edge(edge, "#48c", "#246") - else: - self._color_edge(edge, "#00f", "#008") - - def mark_edge(self, edge, mark="#0df"): - """ - Mark an edge - """ - self._marks[edge] = mark - self._color_edge(edge) - - def unmark_edge(self, edge=None): - """ - Unmark an edge (or all edges) - """ - if edge is None: - old_marked_edges = list(self._marks.keys()) - self._marks = {} - for edge in old_marked_edges: - self._color_edge(edge) - else: - del self._marks[edge] - self._color_edge(edge) - - def markonly_edge(self, edge, mark="#0df"): - self.unmark_edge() - self.mark_edge(edge, mark) - - def _analyze(self): - """ - Analyze the sentence string, to figure out how big a unit needs - to be, How big the tree should be, etc. - """ - # Figure out the text height and the unit size. - unitsize = 70 # min unitsize - text_height = 0 - c = self._chart_canvas - - # Check against all tokens - for leaf in self._chart.leaves(): - tag = c.create_text( - 0, 0, text=repr(leaf), font=self._font, anchor="nw", justify="left" - ) - bbox = c.bbox(tag) - c.delete(tag) - width = bbox[2] + ChartView._LEAF_SPACING - unitsize = max(width, unitsize) - text_height = max(text_height, bbox[3] - bbox[1]) - - self._unitsize = unitsize - self._text_height = text_height - self._sentence_height = self._text_height + 2 * ChartView._MARGIN - - # Check against edges. - for edge in self._chart.edges(): - self._analyze_edge(edge) - - # Size of chart levels - self._chart_level_size = self._text_height * 2 - - # Default tree size.. - self._tree_height = 3 * (ChartView._TREE_LEVEL_SIZE + self._text_height) - - # Resize the scrollregions. - self._resize() - - def _resize(self): - """ - Update the scroll-regions for each canvas. This ensures that - everything is within a scroll-region, so the user can use the - scrollbars to view the entire display. This does *not* - resize the window. - """ - c = self._chart_canvas - - # Reset the chart scroll region - width = self._chart.num_leaves() * self._unitsize + ChartView._MARGIN * 2 - - levels = len(self._edgelevels) - self._chart_height = (levels + 2) * self._chart_level_size - c["scrollregion"] = (0, 0, width, self._chart_height) - - # Reset the tree scroll region - if self._tree_canvas: - self._tree_canvas["scrollregion"] = (0, 0, width, self._tree_height) - - def _draw_loclines(self): - """ - Draw location lines. These are vertical gridlines used to - show where each location unit is. 
- """ - BOTTOM = 50000 - c1 = self._tree_canvas - c2 = self._sentence_canvas - c3 = self._chart_canvas - margin = ChartView._MARGIN - self._loclines = [] - for i in range(0, self._chart.num_leaves() + 1): - x = i * self._unitsize + margin - - if c1: - t1 = c1.create_line(x, 0, x, BOTTOM) - c1.tag_lower(t1) - if c2: - t2 = c2.create_line(x, 0, x, self._sentence_height) - c2.tag_lower(t2) - t3 = c3.create_line(x, 0, x, BOTTOM) - c3.tag_lower(t3) - t4 = c3.create_text(x + 2, 0, text=repr(i), anchor="nw", font=self._font) - c3.tag_lower(t4) - # if i % 4 == 0: - # if c1: c1.itemconfig(t1, width=2, fill='gray60') - # if c2: c2.itemconfig(t2, width=2, fill='gray60') - # c3.itemconfig(t3, width=2, fill='gray60') - if i % 2 == 0: - if c1: - c1.itemconfig(t1, fill="gray60") - if c2: - c2.itemconfig(t2, fill="gray60") - c3.itemconfig(t3, fill="gray60") - else: - if c1: - c1.itemconfig(t1, fill="gray80") - if c2: - c2.itemconfig(t2, fill="gray80") - c3.itemconfig(t3, fill="gray80") - - def _draw_sentence(self): - """Draw the sentence string.""" - if self._chart.num_leaves() == 0: - return - c = self._sentence_canvas - margin = ChartView._MARGIN - y = ChartView._MARGIN - - for i, leaf in enumerate(self._chart.leaves()): - x1 = i * self._unitsize + margin - x2 = x1 + self._unitsize - x = (x1 + x2) / 2 - tag = c.create_text( - x, y, text=repr(leaf), font=self._font, anchor="n", justify="left" - ) - bbox = c.bbox(tag) - rt = c.create_rectangle( - x1 + 2, - bbox[1] - (ChartView._LEAF_SPACING / 2), - x2 - 2, - bbox[3] + (ChartView._LEAF_SPACING / 2), - fill="#f0f0f0", - outline="#f0f0f0", - ) - c.tag_lower(rt) - - def erase_tree(self): - for tag in self._tree_tags: - self._tree_canvas.delete(tag) - self._treetoks = [] - self._treetoks_edge = None - self._treetoks_index = 0 - - def draw_tree(self, edge=None): - if edge is None and self._treetoks_edge is None: - return - if edge is None: - edge = self._treetoks_edge - - # If it's a new edge, then get a new list of treetoks. - if self._treetoks_edge != edge: - self._treetoks = [t for t in self._chart.trees(edge) if isinstance(t, Tree)] - self._treetoks_edge = edge - self._treetoks_index = 0 - - # Make sure there's something to draw. - if len(self._treetoks) == 0: - return - - # Erase the old tree. - for tag in self._tree_tags: - self._tree_canvas.delete(tag) - - # Draw the new tree. - tree = self._treetoks[self._treetoks_index] - self._draw_treetok(tree, edge.start()) - - # Show how many trees are available for the edge. - self._draw_treecycle() - - # Update the scroll region. - w = self._chart.num_leaves() * self._unitsize + 2 * ChartView._MARGIN - h = tree.height() * (ChartView._TREE_LEVEL_SIZE + self._text_height) - self._tree_canvas["scrollregion"] = (0, 0, w, h) - - def cycle_tree(self): - self._treetoks_index = (self._treetoks_index + 1) % len(self._treetoks) - self.draw_tree(self._treetoks_edge) - - def _draw_treecycle(self): - if len(self._treetoks) <= 1: - return - - # Draw the label. - label = "%d Trees" % len(self._treetoks) - c = self._tree_canvas - margin = ChartView._MARGIN - right = self._chart.num_leaves() * self._unitsize + margin - 2 - tag = c.create_text(right, 2, anchor="ne", text=label, font=self._boldfont) - self._tree_tags.append(tag) - _, _, _, y = c.bbox(tag) - - # Draw the triangles. 
- for i in range(len(self._treetoks)): - x = right - 20 * (len(self._treetoks) - i - 1) - if i == self._treetoks_index: - fill = "#084" - else: - fill = "#fff" - tag = c.create_polygon( - x, y + 10, x - 5, y, x - 10, y + 10, fill=fill, outline="black" - ) - self._tree_tags.append(tag) - - # Set up a callback: show the tree if they click on its - # triangle. - def cb(event, self=self, i=i): - self._treetoks_index = i - self.draw_tree() - - c.tag_bind(tag, "", cb) - - def _draw_treetok(self, treetok, index, depth=0): - """ - :param index: The index of the first leaf in the tree. - :return: The index of the first leaf after the tree. - """ - c = self._tree_canvas - margin = ChartView._MARGIN - - # Draw the children - child_xs = [] - for child in treetok: - if isinstance(child, Tree): - child_x, index = self._draw_treetok(child, index, depth + 1) - child_xs.append(child_x) - else: - child_xs.append((2 * index + 1) * self._unitsize / 2 + margin) - index += 1 - - # If we have children, then get the node's x by averaging their - # node x's. Otherwise, make room for ourselves. - if child_xs: - nodex = sum(child_xs) / len(child_xs) - else: - # [XX] breaks for null productions. - nodex = (2 * index + 1) * self._unitsize / 2 + margin - index += 1 - - # Draw the node - nodey = depth * (ChartView._TREE_LEVEL_SIZE + self._text_height) - tag = c.create_text( - nodex, - nodey, - anchor="n", - justify="center", - text=str(treetok.label()), - fill="#042", - font=self._boldfont, - ) - self._tree_tags.append(tag) - - # Draw lines to the children. - childy = nodey + ChartView._TREE_LEVEL_SIZE + self._text_height - for childx, child in zip(child_xs, treetok): - if isinstance(child, Tree) and child: - # A "real" tree token: - tag = c.create_line( - nodex, - nodey + self._text_height, - childx, - childy, - width=2, - fill="#084", - ) - self._tree_tags.append(tag) - if isinstance(child, Tree) and not child: - # An unexpanded tree token: - tag = c.create_line( - nodex, - nodey + self._text_height, - childx, - childy, - width=2, - fill="#048", - dash="2 3", - ) - self._tree_tags.append(tag) - if not isinstance(child, Tree): - # A leaf: - tag = c.create_line( - nodex, - nodey + self._text_height, - childx, - 10000, - width=2, - fill="#084", - ) - self._tree_tags.append(tag) - - return nodex, index - - def draw(self): - """ - Draw everything (from scratch). - """ - if self._tree_canvas: - self._tree_canvas.delete("all") - self.draw_tree() - - if self._sentence_canvas: - self._sentence_canvas.delete("all") - self._draw_sentence() - - self._chart_canvas.delete("all") - self._edgetags = {} - - # Redraw any edges we erased. - for lvl in range(len(self._edgelevels)): - for edge in self._edgelevels[lvl]: - self._draw_edge(edge, lvl) - - for edge in self._chart: - self._add_edge(edge) - - self._draw_loclines() - - def add_callback(self, event, func): - self._callbacks.setdefault(event, {})[func] = 1 - - def remove_callback(self, event, func=None): - if func is None: - del self._callbacks[event] - else: - try: - del self._callbacks[event][func] - except: - pass - - def _fire_callbacks(self, event, *args): - if event not in self._callbacks: - return - for cb_func in list(self._callbacks[event].keys()): - cb_func(*args) - - -####################################################################### -# Edge Rules -####################################################################### -# These version of the chart rules only apply to a specific edge. -# This lets the user select an edge, and then apply a rule. 
- - -class EdgeRule: - """ - To create an edge rule, make an empty base class that uses - EdgeRule as the first base class, and the basic rule as the - second base class. (Order matters!) - """ - - def __init__(self, edge): - super = self.__class__.__bases__[1] - self._edge = edge - self.NUM_EDGES = super.NUM_EDGES - 1 - - def apply(self, chart, grammar, *edges): - super = self.__class__.__bases__[1] - edges += (self._edge,) - yield from super.apply(self, chart, grammar, *edges) - - def __str__(self): - super = self.__class__.__bases__[1] - return super.__str__(self) - - -class TopDownPredictEdgeRule(EdgeRule, TopDownPredictRule): - pass - - -class BottomUpEdgeRule(EdgeRule, BottomUpPredictRule): - pass - - -class BottomUpLeftCornerEdgeRule(EdgeRule, BottomUpPredictCombineRule): - pass - - -class FundamentalEdgeRule(EdgeRule, SingleEdgeFundamentalRule): - pass - - -####################################################################### -# Chart Parser Application -####################################################################### - - -class ChartParserApp: - def __init__(self, grammar, tokens, title="Chart Parser Application"): - # Initialize the parser - self._init_parser(grammar, tokens) - - self._root = None - try: - # Create the root window. - self._root = Tk() - self._root.title(title) - self._root.bind("", self.destroy) - - # Set up some frames. - frame3 = Frame(self._root) - frame2 = Frame(self._root) - frame1 = Frame(self._root) - frame3.pack(side="bottom", fill="none") - frame2.pack(side="bottom", fill="x") - frame1.pack(side="bottom", fill="both", expand=1) - - self._init_fonts(self._root) - self._init_animation() - self._init_chartview(frame1) - self._init_rulelabel(frame2) - self._init_buttons(frame3) - self._init_menubar() - - self._matrix = None - self._results = None - - # Set up keyboard bindings. - self._init_bindings() - - except: - print("Error creating Tree View") - self.destroy() - raise - - def destroy(self, *args): - if self._root is None: - return - self._root.destroy() - self._root = None - - def mainloop(self, *args, **kwargs): - """ - Enter the Tkinter mainloop. This function must be called if - this demo is created from a non-interactive program (e.g. - from a secript); otherwise, the demo will close as soon as - the script completes. - """ - if in_idle(): - return - self._root.mainloop(*args, **kwargs) - - # //////////////////////////////////////////////////////////// - # Initialization Helpers - # //////////////////////////////////////////////////////////// - - def _init_parser(self, grammar, tokens): - self._grammar = grammar - self._tokens = tokens - self._reset_parser() - - def _reset_parser(self): - self._cp = SteppingChartParser(self._grammar) - self._cp.initialize(self._tokens) - self._chart = self._cp.chart() - - # Insert LeafEdges before the parsing starts. - for _new_edge in LeafInitRule().apply(self._chart, self._grammar): - pass - - # The step iterator -- use this to generate new edges - self._cpstep = self._cp.step() - - # The currently selected edge - self._selection = None - - def _init_fonts(self, root): - # See: - self._sysfont = Font(font=Button()["font"]) - root.option_add("*Font", self._sysfont) - - # TWhat's our font size (default=same as sysfont) - self._size = IntVar(root) - self._size.set(self._sysfont.cget("size")) - - self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get()) - self._font = Font(family="helvetica", size=self._size.get()) - - def _init_animation(self): - # Are we stepping? 
(default=yes) - self._step = IntVar(self._root) - self._step.set(1) - - # What's our animation speed (default=fast) - self._animate = IntVar(self._root) - self._animate.set(3) # Default speed = fast - - # Are we currently animating? - self._animating = 0 - - def _init_chartview(self, parent): - self._cv = ChartView(self._chart, parent, draw_tree=1, draw_sentence=1) - self._cv.add_callback("select", self._click_cv_edge) - - def _init_rulelabel(self, parent): - ruletxt = "Last edge generated by:" - - self._rulelabel1 = Label(parent, text=ruletxt, font=self._boldfont) - self._rulelabel2 = Label( - parent, width=40, relief="groove", anchor="w", font=self._boldfont - ) - self._rulelabel1.pack(side="left") - self._rulelabel2.pack(side="left") - step = Checkbutton(parent, variable=self._step, text="Step") - step.pack(side="right") - - def _init_buttons(self, parent): - frame1 = Frame(parent) - frame2 = Frame(parent) - frame1.pack(side="bottom", fill="x") - frame2.pack(side="top", fill="none") - - Button( - frame1, - text="Reset\nParser", - background="#90c0d0", - foreground="black", - command=self.reset, - ).pack(side="right") - # Button(frame1, text='Pause', - # background='#90c0d0', foreground='black', - # command=self.pause).pack(side='left') - - Button( - frame1, - text="Top Down\nStrategy", - background="#90c0d0", - foreground="black", - command=self.top_down_strategy, - ).pack(side="left") - Button( - frame1, - text="Bottom Up\nStrategy", - background="#90c0d0", - foreground="black", - command=self.bottom_up_strategy, - ).pack(side="left") - Button( - frame1, - text="Bottom Up\nLeft-Corner Strategy", - background="#90c0d0", - foreground="black", - command=self.bottom_up_leftcorner_strategy, - ).pack(side="left") - - Button( - frame2, - text="Top Down Init\nRule", - background="#90f090", - foreground="black", - command=self.top_down_init, - ).pack(side="left") - Button( - frame2, - text="Top Down Predict\nRule", - background="#90f090", - foreground="black", - command=self.top_down_predict, - ).pack(side="left") - Frame(frame2, width=20).pack(side="left") - - Button( - frame2, - text="Bottom Up Predict\nRule", - background="#90f090", - foreground="black", - command=self.bottom_up, - ).pack(side="left") - Frame(frame2, width=20).pack(side="left") - - Button( - frame2, - text="Bottom Up Left-Corner\nPredict Rule", - background="#90f090", - foreground="black", - command=self.bottom_up_leftcorner, - ).pack(side="left") - Frame(frame2, width=20).pack(side="left") - - Button( - frame2, - text="Fundamental\nRule", - background="#90f090", - foreground="black", - command=self.fundamental, - ).pack(side="left") - - def _init_bindings(self): - self._root.bind("", self._cv.scroll_up) - self._root.bind("", self._cv.scroll_down) - self._root.bind("", self._cv.page_up) - self._root.bind("", self._cv.page_down) - self._root.bind("", self.destroy) - self._root.bind("", self.destroy) - self._root.bind("", self.help) - - self._root.bind("", self.save_chart) - self._root.bind("", self.load_chart) - self._root.bind("", self.reset) - - self._root.bind("t", self.top_down_strategy) - self._root.bind("b", self.bottom_up_strategy) - self._root.bind("c", self.bottom_up_leftcorner_strategy) - self._root.bind("", self._stop_animation) - - self._root.bind("", self.edit_grammar) - self._root.bind("", self.edit_sentence) - - # Animation speed control - self._root.bind("-", lambda e, a=self._animate: a.set(1)) - self._root.bind("=", lambda e, a=self._animate: a.set(2)) - self._root.bind("+", lambda e, a=self._animate: 
a.set(3)) - - # Step control - self._root.bind("s", lambda e, s=self._step: s.set(not s.get())) - - def _init_menubar(self): - menubar = Menu(self._root) - - filemenu = Menu(menubar, tearoff=0) - filemenu.add_command( - label="Save Chart", - underline=0, - command=self.save_chart, - accelerator="Ctrl-s", - ) - filemenu.add_command( - label="Load Chart", - underline=0, - command=self.load_chart, - accelerator="Ctrl-o", - ) - filemenu.add_command( - label="Reset Chart", underline=0, command=self.reset, accelerator="Ctrl-r" - ) - filemenu.add_separator() - filemenu.add_command(label="Save Grammar", command=self.save_grammar) - filemenu.add_command(label="Load Grammar", command=self.load_grammar) - filemenu.add_separator() - filemenu.add_command( - label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x" - ) - menubar.add_cascade(label="File", underline=0, menu=filemenu) - - editmenu = Menu(menubar, tearoff=0) - editmenu.add_command( - label="Edit Grammar", - underline=5, - command=self.edit_grammar, - accelerator="Ctrl-g", - ) - editmenu.add_command( - label="Edit Text", - underline=5, - command=self.edit_sentence, - accelerator="Ctrl-t", - ) - menubar.add_cascade(label="Edit", underline=0, menu=editmenu) - - viewmenu = Menu(menubar, tearoff=0) - viewmenu.add_command( - label="Chart Matrix", underline=6, command=self.view_matrix - ) - viewmenu.add_command(label="Results", underline=0, command=self.view_results) - menubar.add_cascade(label="View", underline=0, menu=viewmenu) - - rulemenu = Menu(menubar, tearoff=0) - rulemenu.add_command( - label="Top Down Strategy", - underline=0, - command=self.top_down_strategy, - accelerator="t", - ) - rulemenu.add_command( - label="Bottom Up Strategy", - underline=0, - command=self.bottom_up_strategy, - accelerator="b", - ) - rulemenu.add_command( - label="Bottom Up Left-Corner Strategy", - underline=0, - command=self.bottom_up_leftcorner_strategy, - accelerator="c", - ) - rulemenu.add_separator() - rulemenu.add_command(label="Bottom Up Rule", command=self.bottom_up) - rulemenu.add_command( - label="Bottom Up Left-Corner Rule", command=self.bottom_up_leftcorner - ) - rulemenu.add_command(label="Top Down Init Rule", command=self.top_down_init) - rulemenu.add_command( - label="Top Down Predict Rule", command=self.top_down_predict - ) - rulemenu.add_command(label="Fundamental Rule", command=self.fundamental) - menubar.add_cascade(label="Apply", underline=0, menu=rulemenu) - - animatemenu = Menu(menubar, tearoff=0) - animatemenu.add_checkbutton( - label="Step", underline=0, variable=self._step, accelerator="s" - ) - animatemenu.add_separator() - animatemenu.add_radiobutton( - label="No Animation", underline=0, variable=self._animate, value=0 - ) - animatemenu.add_radiobutton( - label="Slow Animation", - underline=0, - variable=self._animate, - value=1, - accelerator="-", - ) - animatemenu.add_radiobutton( - label="Normal Animation", - underline=0, - variable=self._animate, - value=2, - accelerator="=", - ) - animatemenu.add_radiobutton( - label="Fast Animation", - underline=0, - variable=self._animate, - value=3, - accelerator="+", - ) - menubar.add_cascade(label="Animate", underline=1, menu=animatemenu) - - zoommenu = Menu(menubar, tearoff=0) - zoommenu.add_radiobutton( - label="Tiny", - variable=self._size, - underline=0, - value=10, - command=self.resize, - ) - zoommenu.add_radiobutton( - label="Small", - variable=self._size, - underline=0, - value=12, - command=self.resize, - ) - zoommenu.add_radiobutton( - label="Medium", - 
variable=self._size, - underline=0, - value=14, - command=self.resize, - ) - zoommenu.add_radiobutton( - label="Large", - variable=self._size, - underline=0, - value=18, - command=self.resize, - ) - zoommenu.add_radiobutton( - label="Huge", - variable=self._size, - underline=0, - value=24, - command=self.resize, - ) - menubar.add_cascade(label="Zoom", underline=0, menu=zoommenu) - - helpmenu = Menu(menubar, tearoff=0) - helpmenu.add_command(label="About", underline=0, command=self.about) - helpmenu.add_command( - label="Instructions", underline=0, command=self.help, accelerator="F1" - ) - menubar.add_cascade(label="Help", underline=0, menu=helpmenu) - - self._root.config(menu=menubar) - - # //////////////////////////////////////////////////////////// - # Selection Handling - # //////////////////////////////////////////////////////////// - - def _click_cv_edge(self, edge): - if edge != self._selection: - # Clicking on a new edge selects it. - self._select_edge(edge) - else: - # Repeated clicks on one edge cycle its trees. - self._cv.cycle_tree() - # [XX] this can get confused if animation is running - # faster than the callbacks... - - def _select_matrix_edge(self, edge): - self._select_edge(edge) - self._cv.view_edge(edge) - - def _select_edge(self, edge): - self._selection = edge - # Update the chart view. - self._cv.markonly_edge(edge, "#f00") - self._cv.draw_tree(edge) - # Update the matrix view. - if self._matrix: - self._matrix.markonly_edge(edge) - if self._matrix: - self._matrix.view_edge(edge) - - def _deselect_edge(self): - self._selection = None - # Update the chart view. - self._cv.unmark_edge() - self._cv.erase_tree() - # Update the matrix view - if self._matrix: - self._matrix.unmark_edge() - - def _show_new_edge(self, edge): - self._display_rule(self._cp.current_chartrule()) - # Update the chart view. - self._cv.update() - self._cv.draw_tree(edge) - self._cv.markonly_edge(edge, "#0df") - self._cv.view_edge(edge) - # Update the matrix view. - if self._matrix: - self._matrix.update() - if self._matrix: - self._matrix.markonly_edge(edge) - if self._matrix: - self._matrix.view_edge(edge) - # Update the results view. - if self._results: - self._results.update(edge) - - # //////////////////////////////////////////////////////////// - # Help/usage - # //////////////////////////////////////////////////////////// - - def help(self, *e): - self._animating = 0 - # The default font's not very legible; try using 'fixed' instead. 
- try: - ShowText( - self._root, - "Help: Chart Parser Application", - (__doc__ or "").strip(), - width=75, - font="fixed", - ) - except: - ShowText( - self._root, - "Help: Chart Parser Application", - (__doc__ or "").strip(), - width=75, - ) - - def about(self, *e): - ABOUT = "NLTK Chart Parser Application\n" + "Written by Edward Loper" - showinfo("About: Chart Parser Application", ABOUT) - - # //////////////////////////////////////////////////////////// - # File Menu - # //////////////////////////////////////////////////////////// - - CHART_FILE_TYPES = [("Pickle file", ".pickle"), ("All files", "*")] - GRAMMAR_FILE_TYPES = [ - ("Plaintext grammar file", ".cfg"), - ("Pickle file", ".pickle"), - ("All files", "*"), - ] - - def load_chart(self, *args): - "Load a chart from a pickle file" - filename = askopenfilename( - filetypes=self.CHART_FILE_TYPES, defaultextension=".pickle" - ) - if not filename: - return - try: - with open(filename, "rb") as infile: - chart = pickle.load(infile) - self._chart = chart - self._cv.update(chart) - if self._matrix: - self._matrix.set_chart(chart) - if self._matrix: - self._matrix.deselect_cell() - if self._results: - self._results.set_chart(chart) - self._cp.set_chart(chart) - except Exception as e: - raise - showerror("Error Loading Chart", "Unable to open file: %r" % filename) - - def save_chart(self, *args): - "Save a chart to a pickle file" - filename = asksaveasfilename( - filetypes=self.CHART_FILE_TYPES, defaultextension=".pickle" - ) - if not filename: - return - try: - with open(filename, "wb") as outfile: - pickle.dump(self._chart, outfile) - except Exception as e: - raise - showerror("Error Saving Chart", "Unable to open file: %r" % filename) - - def load_grammar(self, *args): - "Load a grammar from a pickle file" - filename = askopenfilename( - filetypes=self.GRAMMAR_FILE_TYPES, defaultextension=".cfg" - ) - if not filename: - return - try: - if filename.endswith(".pickle"): - with open(filename, "rb") as infile: - grammar = pickle.load(infile) - else: - with open(filename) as infile: - grammar = CFG.fromstring(infile.read()) - self.set_grammar(grammar) - except Exception as e: - showerror("Error Loading Grammar", "Unable to open file: %r" % filename) - - def save_grammar(self, *args): - filename = asksaveasfilename( - filetypes=self.GRAMMAR_FILE_TYPES, defaultextension=".cfg" - ) - if not filename: - return - try: - if filename.endswith(".pickle"): - with open(filename, "wb") as outfile: - pickle.dump((self._chart, self._tokens), outfile) - else: - with open(filename, "w") as outfile: - prods = self._grammar.productions() - start = [p for p in prods if p.lhs() == self._grammar.start()] - rest = [p for p in prods if p.lhs() != self._grammar.start()] - for prod in start: - outfile.write("%s\n" % prod) - for prod in rest: - outfile.write("%s\n" % prod) - except Exception as e: - showerror("Error Saving Grammar", "Unable to open file: %r" % filename) - - def reset(self, *args): - self._animating = 0 - self._reset_parser() - self._cv.update(self._chart) - if self._matrix: - self._matrix.set_chart(self._chart) - if self._matrix: - self._matrix.deselect_cell() - if self._results: - self._results.set_chart(self._chart) - - # //////////////////////////////////////////////////////////// - # Edit - # //////////////////////////////////////////////////////////// - - def edit_grammar(self, *e): - CFGEditor(self._root, self._grammar, self.set_grammar) - - def set_grammar(self, grammar): - self._grammar = grammar - self._cp.set_grammar(grammar) - if 
self._results: - self._results.set_grammar(grammar) - - def edit_sentence(self, *e): - sentence = " ".join(self._tokens) - title = "Edit Text" - instr = "Enter a new sentence to parse." - EntryDialog(self._root, sentence, instr, self.set_sentence, title) - - def set_sentence(self, sentence): - self._tokens = list(sentence.split()) - self.reset() - - # //////////////////////////////////////////////////////////// - # View Menu - # //////////////////////////////////////////////////////////// - - def view_matrix(self, *e): - if self._matrix is not None: - self._matrix.destroy() - self._matrix = ChartMatrixView(self._root, self._chart) - self._matrix.add_callback("select", self._select_matrix_edge) - - def view_results(self, *e): - if self._results is not None: - self._results.destroy() - self._results = ChartResultsView(self._root, self._chart, self._grammar) - - # //////////////////////////////////////////////////////////// - # Zoom Menu - # //////////////////////////////////////////////////////////// - - def resize(self): - self._animating = 0 - self.set_font_size(self._size.get()) - - def set_font_size(self, size): - self._cv.set_font_size(size) - self._font.configure(size=-abs(size)) - self._boldfont.configure(size=-abs(size)) - self._sysfont.configure(size=-abs(size)) - - def get_font_size(self): - return abs(self._size.get()) - - # //////////////////////////////////////////////////////////// - # Parsing - # //////////////////////////////////////////////////////////// - - def apply_strategy(self, strategy, edge_strategy=None): - # If we're animating, then stop. - if self._animating: - self._animating = 0 - return - - # Clear the rule display & mark. - self._display_rule(None) - # self._cv.unmark_edge() - - if self._step.get(): - selection = self._selection - if (selection is not None) and (edge_strategy is not None): - # Apply the given strategy to the selected edge. - self._cp.set_strategy([edge_strategy(selection)]) - newedge = self._apply_strategy() - - # If it failed, then clear the selection. 
- if newedge is None: - self._cv.unmark_edge() - self._selection = None - else: - self._cp.set_strategy(strategy) - self._apply_strategy() - - else: - self._cp.set_strategy(strategy) - if self._animate.get(): - self._animating = 1 - self._animate_strategy() - else: - for edge in self._cpstep: - if edge is None: - break - self._cv.update() - if self._matrix: - self._matrix.update() - if self._results: - self._results.update() - - def _stop_animation(self, *e): - self._animating = 0 - - def _animate_strategy(self, speed=1): - if self._animating == 0: - return - if self._apply_strategy() is not None: - if self._animate.get() == 0 or self._step.get() == 1: - return - if self._animate.get() == 1: - self._root.after(3000, self._animate_strategy) - elif self._animate.get() == 2: - self._root.after(1000, self._animate_strategy) - else: - self._root.after(20, self._animate_strategy) - - def _apply_strategy(self): - new_edge = next(self._cpstep) - - if new_edge is not None: - self._show_new_edge(new_edge) - return new_edge - - def _display_rule(self, rule): - if rule is None: - self._rulelabel2["text"] = "" - else: - name = str(rule) - self._rulelabel2["text"] = name - size = self._cv.get_font_size() - - # //////////////////////////////////////////////////////////// - # Parsing Strategies - # //////////////////////////////////////////////////////////// - - # Basic rules: - _TD_INIT = [TopDownInitRule()] - _TD_PREDICT = [TopDownPredictRule()] - _BU_RULE = [BottomUpPredictRule()] - _BU_LC_RULE = [BottomUpPredictCombineRule()] - _FUNDAMENTAL = [SingleEdgeFundamentalRule()] - - # Complete strategies: - _TD_STRATEGY = _TD_INIT + _TD_PREDICT + _FUNDAMENTAL - _BU_STRATEGY = _BU_RULE + _FUNDAMENTAL - _BU_LC_STRATEGY = _BU_LC_RULE + _FUNDAMENTAL - - # Button callback functions: - def top_down_init(self, *e): - self.apply_strategy(self._TD_INIT, None) - - def top_down_predict(self, *e): - self.apply_strategy(self._TD_PREDICT, TopDownPredictEdgeRule) - - def bottom_up(self, *e): - self.apply_strategy(self._BU_RULE, BottomUpEdgeRule) - - def bottom_up_leftcorner(self, *e): - self.apply_strategy(self._BU_LC_RULE, BottomUpLeftCornerEdgeRule) - - def fundamental(self, *e): - self.apply_strategy(self._FUNDAMENTAL, FundamentalEdgeRule) - - def bottom_up_strategy(self, *e): - self.apply_strategy(self._BU_STRATEGY, BottomUpEdgeRule) - - def bottom_up_leftcorner_strategy(self, *e): - self.apply_strategy(self._BU_LC_STRATEGY, BottomUpLeftCornerEdgeRule) - - def top_down_strategy(self, *e): - self.apply_strategy(self._TD_STRATEGY, TopDownPredictEdgeRule) - - -def app(): - grammar = CFG.fromstring( - """ - # Grammatical productions. - S -> NP VP - VP -> VP PP | V NP | V - NP -> Det N | NP PP - PP -> P NP - # Lexical productions. 
- NP -> 'John' | 'I' - Det -> 'the' | 'my' | 'a' - N -> 'dog' | 'cookie' | 'table' | 'cake' | 'fork' - V -> 'ate' | 'saw' - P -> 'on' | 'under' | 'with' - """ - ) - - sent = "John ate the cake on the table with a fork" - sent = "John ate the cake on the table" - tokens = list(sent.split()) - - print("grammar= (") - for rule in grammar.productions(): - print((" ", repr(rule) + ",")) - print(")") - print("tokens = %r" % tokens) - print('Calling "ChartParserApp(grammar, tokens)"...') - ChartParserApp(grammar, tokens).mainloop() - - -if __name__ == "__main__": - app() - - # Chart comparer: - # charts = ['/tmp/earley.pickle', - # '/tmp/topdown.pickle', - # '/tmp/bottomup.pickle'] - # ChartComparer(*charts).mainloop() - - # import profile - # profile.run('demo2()', '/tmp/profile.out') - # import pstats - # p = pstats.Stats('/tmp/profile.out') - # p.strip_dirs().sort_stats('time', 'cum').print_stats(60) - # p.strip_dirs().sort_stats('cum', 'time').print_stats(60) - -__all__ = ["app"] diff --git a/pipeline/nltk/app/chunkparser_app.py b/pipeline/nltk/app/chunkparser_app.py deleted file mode 100644 index 54a10a1e7db3dde0f3a18447575130e658ba3c51..0000000000000000000000000000000000000000 --- a/pipeline/nltk/app/chunkparser_app.py +++ /dev/null @@ -1,1500 +0,0 @@ -# Natural Language Toolkit: Regexp Chunk Parser Application -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# URL: -# For license information, see LICENSE.TXT - -""" -A graphical tool for exploring the regular expression based chunk -parser ``nltk.chunk.RegexpChunkParser``. -""" - -# Todo: Add a way to select the development set from the menubar. This -# might just need to be a selection box (conll vs treebank etc) plus -# configuration parameters to select what's being chunked (eg VP vs NP) -# and what part of the data is being used as the development set. - -import random -import re -import textwrap -import time -from tkinter import ( - Button, - Canvas, - Checkbutton, - Frame, - IntVar, - Label, - Menu, - Scrollbar, - Text, - Tk, -) -from tkinter.filedialog import askopenfilename, asksaveasfilename -from tkinter.font import Font - -from nltk.chunk import ChunkScore, RegexpChunkParser -from nltk.chunk.regexp import RegexpChunkRule -from nltk.corpus import conll2000, treebank_chunk -from nltk.draw.util import ShowText -from nltk.tree import Tree -from nltk.util import in_idle - - -class RegexpChunkApp: - """ - A graphical tool for exploring the regular expression based chunk - parser ``nltk.chunk.RegexpChunkParser``. - - See ``HELP`` for instructional text. - """ - - ##///////////////////////////////////////////////////////////////// - ## Help Text - ##///////////////////////////////////////////////////////////////// - - #: A dictionary mapping from part of speech tags to descriptions, - #: which is used in the help text. (This should probably live with - #: the conll and/or treebank corpus instead.) - TAGSET = { - "CC": "Coordinating conjunction", - "PRP$": "Possessive pronoun", - "CD": "Cardinal number", - "RB": "Adverb", - "DT": "Determiner", - "RBR": "Adverb, comparative", - "EX": "Existential there", - "RBS": "Adverb, superlative", - "FW": "Foreign word", - "RP": "Particle", - "JJ": "Adjective", - "TO": "to", - "JJR": "Adjective, comparative", - "UH": "Interjection", - "JJS": "Adjective, superlative", - "VB": "Verb, base form", - "LS": "List item marker", - "VBD": "Verb, past tense", - "MD": "Modal", - "NNS": "Noun, plural", - "NN": "Noun, singular or masps", - "VBN": "Verb, past participle", - "VBZ": "Verb,3rd ps. 
sing. present", - "NNP": "Proper noun, singular", - "NNPS": "Proper noun plural", - "WDT": "wh-determiner", - "PDT": "Predeterminer", - "WP": "wh-pronoun", - "POS": "Possessive ending", - "WP$": "Possessive wh-pronoun", - "PRP": "Personal pronoun", - "WRB": "wh-adverb", - "(": "open parenthesis", - ")": "close parenthesis", - "``": "open quote", - ",": "comma", - "''": "close quote", - ".": "period", - "#": "pound sign (currency marker)", - "$": "dollar sign (currency marker)", - "IN": "Preposition/subord. conjunction", - "SYM": "Symbol (mathematical or scientific)", - "VBG": "Verb, gerund/present participle", - "VBP": "Verb, non-3rd ps. sing. present", - ":": "colon", - } - - #: Contents for the help box. This is a list of tuples, one for - #: each help page, where each tuple has four elements: - #: - A title (displayed as a tab) - #: - A string description of tabstops (see Tkinter.Text for details) - #: - The text contents for the help page. You can use expressions - #: like ... to colorize the text; see ``HELP_AUTOTAG`` - #: for a list of tags you can use for colorizing. - HELP = [ - ( - "Help", - "20", - "Welcome to the regular expression chunk-parser grammar editor. " - "You can use this editor to develop and test chunk parser grammars " - "based on NLTK's RegexpChunkParser class.\n\n" - # Help box. - "Use this box ('Help') to learn more about the editor; click on the " - "tabs for help on specific topics:" - "\n" - "Rules: grammar rule types\n" - "Regexps: regular expression syntax\n" - "Tags: part of speech tags\n\n" - # Grammar. - "Use the upper-left box ('Grammar') to edit your grammar. " - "Each line of your grammar specifies a single 'rule', " - "which performs an action such as creating a chunk or merging " - "two chunks.\n\n" - # Dev set. - "The lower-left box ('Development Set') runs your grammar on the " - "development set, and displays the results. " - "Your grammar's chunks are highlighted, and " - "the correct (gold standard) chunks are " - "underlined. If they " - "match, they are displayed in green; otherwise, " - "they are displayed in red. The box displays a single " - "sentence from the development set at a time; use the scrollbar or " - "the next/previous buttons view additional sentences.\n\n" - # Performance - "The lower-right box ('Evaluation') tracks the performance of " - "your grammar on the development set. The 'precision' axis " - "indicates how many of your grammar's chunks are correct; and " - "the 'recall' axis indicates how many of the gold standard " - "chunks your system generated. Typically, you should try to " - "design a grammar that scores high on both metrics. The " - "exact precision and recall of the current grammar, as well " - "as their harmonic mean (the 'f-score'), are displayed in " - "the status bar at the bottom of the window.", - ), - ( - "Rules", - "10", - "

<h1>{...regexp...}</h1>" - "\nChunk rule: creates new chunks from words matching " - "<regexp>regexp</regexp>.\n\n" - "<h1>}...regexp...{</h1>" - "\nStrip rule: removes words matching <regexp>regexp</regexp> from existing " - "chunks.\n\n" - "<h1>...regexp1...}{...regexp2...</h1>" - "\nSplit rule: splits chunks that match <regexp>regexp1</regexp> followed by " - "<regexp>regexp2</regexp> in two.\n\n" - "<h1>...regexp...{}...regexp...</h1>" - "\nMerge rule: joins consecutive chunks that match <regexp>regexp1</regexp> " - "and <regexp>regexp2</regexp>\n", - ), - ( - "Regexps", - "10 60", - # "Regular Expression Syntax Summary:\n\n" - "<h1>Pattern\t\tMatches...</h1>\n" - "<hangindent>" - "\t<<var>T</var>>\ta word with tag <var>T</var> " - "(where <var>T</var> may be a regexp).\n" - "\t<var>x</var>?\tan optional <var>x</var>\n" - "\t<var>x</var>+\ta sequence of 1 or more <var>x</var>'s\n" - "\t<var>x</var>*\ta sequence of 0 or more <var>x</var>'s\n" - "\t<var>x</var>|<var>y</var>\t<var>x</var> or <var>y</var>\n" - "\t.\tmatches any character\n" - "\t(<var>x</var>)\tTreats <var>x</var> as a group\n" - "\t# <var>x...</var>\tTreats <var>x...</var> " - "(to the end of the line) as a comment\n" - "\t\\<var>C</var>\tmatches character <var>C</var> " - "(useful when <var>C</var> is a special character " - "like + or #)\n" - "</hangindent>" - "\n<h1>Examples:</h1>\n" - "<hangindent>" - "\t<regexp><NN></regexp>\n" - '\t\tMatches <match>"cow/NN"</match>\n' - '\t\tMatches <match>"green/NN"</match>\n' - "\t<regexp><VB.*></regexp>\n" - '\t\tMatches <match>"eating/VBG"</match>\n' - '\t\tMatches <match>"ate/VBD"</match>\n' - "\t<regexp><IN><DT><NN></regexp>\n" - '\t\tMatches <match>"on/IN the/DT car/NN"</match>\n' - "\t<regexp><RB>?<VBD></regexp>\n" - '\t\tMatches <match>"ran/VBD"</match>\n' - '\t\tMatches <match>"slowly/RB ate/VBD"</match>\n' - r"\t<regexp><\#></regexp> # This is a comment...\n" - '\t\tMatches <match>"#/# 100/CD"</match>\n' - "</hangindent>", - ), - ( - "Tags", - "10 60", - "<h1>Part of Speech Tags:</h1>
\n" - + "" - + "<>" - + "\n", # this gets auto-substituted w/ self.TAGSET - ), - ] - - HELP_AUTOTAG = [ - ("red", dict(foreground="#a00")), - ("green", dict(foreground="#080")), - ("highlight", dict(background="#ddd")), - ("underline", dict(underline=True)), - ("h1", dict(underline=True)), - ("indent", dict(lmargin1=20, lmargin2=20)), - ("hangindent", dict(lmargin1=0, lmargin2=60)), - ("var", dict(foreground="#88f")), - ("regexp", dict(foreground="#ba7")), - ("match", dict(foreground="#6a6")), - ] - - ##///////////////////////////////////////////////////////////////// - ## Config Parameters - ##///////////////////////////////////////////////////////////////// - - _EVAL_DELAY = 1 - """If the user has not pressed any key for this amount of time (in - seconds), and the current grammar has not been evaluated, then - the eval demon will evaluate it.""" - - _EVAL_CHUNK = 15 - """The number of sentences that should be evaluated by the eval - demon each time it runs.""" - _EVAL_FREQ = 0.2 - """The frequency (in seconds) at which the eval demon is run""" - _EVAL_DEMON_MIN = 0.02 - """The minimum amount of time that the eval demon should take each time - it runs -- if it takes less than this time, _EVAL_CHUNK will be - modified upwards.""" - _EVAL_DEMON_MAX = 0.04 - """The maximum amount of time that the eval demon should take each time - it runs -- if it takes more than this time, _EVAL_CHUNK will be - modified downwards.""" - - _GRAMMARBOX_PARAMS = dict( - width=40, - height=12, - background="#efe", - highlightbackground="#efe", - highlightthickness=1, - relief="groove", - border=2, - wrap="word", - ) - _HELPBOX_PARAMS = dict( - width=15, - height=15, - background="#efe", - highlightbackground="#efe", - foreground="#555", - highlightthickness=1, - relief="groove", - border=2, - wrap="word", - ) - _DEVSETBOX_PARAMS = dict( - width=70, - height=10, - background="#eef", - highlightbackground="#eef", - highlightthickness=1, - relief="groove", - border=2, - wrap="word", - tabs=(30,), - ) - _STATUS_PARAMS = dict(background="#9bb", relief="groove", border=2) - _FONT_PARAMS = dict(family="helvetica", size=-20) - _FRAME_PARAMS = dict(background="#777", padx=2, pady=2, border=3) - _EVALBOX_PARAMS = dict( - background="#eef", - highlightbackground="#eef", - highlightthickness=1, - relief="groove", - border=2, - width=300, - height=280, - ) - _BUTTON_PARAMS = dict( - background="#777", activebackground="#777", highlightbackground="#777" - ) - _HELPTAB_BG_COLOR = "#aba" - _HELPTAB_FG_COLOR = "#efe" - - _HELPTAB_FG_PARAMS = dict(background="#efe") - _HELPTAB_BG_PARAMS = dict(background="#aba") - _HELPTAB_SPACER = 6 - - def normalize_grammar(self, grammar): - # Strip comments - grammar = re.sub(r"((\\.|[^#])*)(#.*)?", r"\1", grammar) - # Normalize whitespace - grammar = re.sub(" +", " ", grammar) - grammar = re.sub(r"\n\s+", r"\n", grammar) - grammar = grammar.strip() - # [xx] Hack: automatically backslash $! - grammar = re.sub(r"([^\\])\$", r"\1\\$", grammar) - return grammar - - def __init__( - self, - devset_name="conll2000", - devset=None, - grammar="", - chunk_label="NP", - tagset=None, - ): - """ - :param devset_name: The name of the development set; used for - display & for save files. If either the name 'treebank' - or the name 'conll2000' is used, and devset is None, then - devset will be set automatically. - :param devset: A list of chunked sentences - :param grammar: The initial grammar to display. - :param tagset: Dictionary from tags to string descriptions, used - for the help page. 
Defaults to ``self.TAGSET``. - """ - self._chunk_label = chunk_label - - if tagset is None: - tagset = self.TAGSET - self.tagset = tagset - - # Named development sets: - if devset is None: - if devset_name == "conll2000": - devset = conll2000.chunked_sents("train.txt") # [:100] - elif devset == "treebank": - devset = treebank_chunk.chunked_sents() # [:100] - else: - raise ValueError("Unknown development set %s" % devset_name) - - self.chunker = None - """The chunker built from the grammar string""" - - self.grammar = grammar - """The unparsed grammar string""" - - self.normalized_grammar = None - """A normalized version of ``self.grammar``.""" - - self.grammar_changed = 0 - """The last time() that the grammar was changed.""" - - self.devset = devset - """The development set -- a list of chunked sentences.""" - - self.devset_name = devset_name - """The name of the development set (for save files).""" - - self.devset_index = -1 - """The index into the development set of the first instance - that's currently being viewed.""" - - self._last_keypress = 0 - """The time() when a key was most recently pressed""" - - self._history = [] - """A list of (grammar, precision, recall, fscore) tuples for - grammars that the user has already tried.""" - - self._history_index = 0 - """When the user is scrolling through previous grammars, this - is used to keep track of which grammar they're looking at.""" - - self._eval_grammar = None - """The grammar that is being currently evaluated by the eval - demon.""" - - self._eval_normalized_grammar = None - """A normalized copy of ``_eval_grammar``.""" - - self._eval_index = 0 - """The index of the next sentence in the development set that - should be looked at by the eval demon.""" - - self._eval_score = ChunkScore(chunk_label=chunk_label) - """The ``ChunkScore`` object that's used to keep track of the score - of the current grammar on the development set.""" - - # Set up the main window. - top = self.top = Tk() - top.geometry("+50+50") - top.title("Regexp Chunk Parser App") - top.bind("", self.destroy) - - # Variable that restricts how much of the devset we look at. - self._devset_size = IntVar(top) - self._devset_size.set(100) - - # Set up all the tkinter widgets - self._init_fonts(top) - self._init_widgets(top) - self._init_bindings(top) - self._init_menubar(top) - self.grammarbox.focus() - - # If a grammar was given, then display it. 
- if grammar: - self.grammarbox.insert("end", grammar + "\n") - self.grammarbox.mark_set("insert", "1.0") - - # Display the first item in the development set - self.show_devset(0) - self.update() - - def _init_bindings(self, top): - top.bind("", self._devset_next) - top.bind("", self._devset_prev) - top.bind("", self.toggle_show_trace) - top.bind("", self.update) - top.bind("", lambda e: self.save_grammar()) - top.bind("", lambda e: self.load_grammar()) - self.grammarbox.bind("", self.toggle_show_trace) - self.grammarbox.bind("", self._devset_next) - self.grammarbox.bind("", self._devset_prev) - - # Redraw the eval graph when the window size changes - self.evalbox.bind("", self._eval_plot) - - def _init_fonts(self, top): - # TWhat's our font size (default=same as sysfont) - self._size = IntVar(top) - self._size.set(20) - self._font = Font(family="helvetica", size=-self._size.get()) - self._smallfont = Font( - family="helvetica", size=-(int(self._size.get() * 14 // 20)) - ) - - def _init_menubar(self, parent): - menubar = Menu(parent) - - filemenu = Menu(menubar, tearoff=0) - filemenu.add_command(label="Reset Application", underline=0, command=self.reset) - filemenu.add_command( - label="Save Current Grammar", - underline=0, - accelerator="Ctrl-s", - command=self.save_grammar, - ) - filemenu.add_command( - label="Load Grammar", - underline=0, - accelerator="Ctrl-o", - command=self.load_grammar, - ) - - filemenu.add_command( - label="Save Grammar History", underline=13, command=self.save_history - ) - - filemenu.add_command( - label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q" - ) - menubar.add_cascade(label="File", underline=0, menu=filemenu) - - viewmenu = Menu(menubar, tearoff=0) - viewmenu.add_radiobutton( - label="Tiny", - variable=self._size, - underline=0, - value=10, - command=self.resize, - ) - viewmenu.add_radiobutton( - label="Small", - variable=self._size, - underline=0, - value=16, - command=self.resize, - ) - viewmenu.add_radiobutton( - label="Medium", - variable=self._size, - underline=0, - value=20, - command=self.resize, - ) - viewmenu.add_radiobutton( - label="Large", - variable=self._size, - underline=0, - value=24, - command=self.resize, - ) - viewmenu.add_radiobutton( - label="Huge", - variable=self._size, - underline=0, - value=34, - command=self.resize, - ) - menubar.add_cascade(label="View", underline=0, menu=viewmenu) - - devsetmenu = Menu(menubar, tearoff=0) - devsetmenu.add_radiobutton( - label="50 sentences", - variable=self._devset_size, - value=50, - command=self.set_devset_size, - ) - devsetmenu.add_radiobutton( - label="100 sentences", - variable=self._devset_size, - value=100, - command=self.set_devset_size, - ) - devsetmenu.add_radiobutton( - label="200 sentences", - variable=self._devset_size, - value=200, - command=self.set_devset_size, - ) - devsetmenu.add_radiobutton( - label="500 sentences", - variable=self._devset_size, - value=500, - command=self.set_devset_size, - ) - menubar.add_cascade(label="Development-Set", underline=0, menu=devsetmenu) - - helpmenu = Menu(menubar, tearoff=0) - helpmenu.add_command(label="About", underline=0, command=self.about) - menubar.add_cascade(label="Help", underline=0, menu=helpmenu) - - parent.config(menu=menubar) - - def toggle_show_trace(self, *e): - if self._showing_trace: - self.show_devset() - else: - self.show_trace() - return "break" - - _SCALE_N = 5 # center on the last 5 examples. 
- _DRAW_LINES = False - - def _eval_plot(self, *e, **config): - width = config.get("width", self.evalbox.winfo_width()) - height = config.get("height", self.evalbox.winfo_height()) - - # Clear the canvas - self.evalbox.delete("all") - - # Draw the precision & recall labels. - tag = self.evalbox.create_text( - 10, height // 2 - 10, justify="left", anchor="w", text="Precision" - ) - left, right = self.evalbox.bbox(tag)[2] + 5, width - 10 - tag = self.evalbox.create_text( - left + (width - left) // 2, - height - 10, - anchor="s", - text="Recall", - justify="center", - ) - top, bot = 10, self.evalbox.bbox(tag)[1] - 10 - - # Draw masks for clipping the plot. - bg = self._EVALBOX_PARAMS["background"] - self.evalbox.lower( - self.evalbox.create_rectangle(0, 0, left - 1, 5000, fill=bg, outline=bg) - ) - self.evalbox.lower( - self.evalbox.create_rectangle(0, bot + 1, 5000, 5000, fill=bg, outline=bg) - ) - - # Calculate the plot's scale. - if self._autoscale.get() and len(self._history) > 1: - max_precision = max_recall = 0 - min_precision = min_recall = 1 - for i in range(1, min(len(self._history), self._SCALE_N + 1)): - grammar, precision, recall, fmeasure = self._history[-i] - min_precision = min(precision, min_precision) - min_recall = min(recall, min_recall) - max_precision = max(precision, max_precision) - max_recall = max(recall, max_recall) - # if max_precision-min_precision > max_recall-min_recall: - # min_recall -= (max_precision-min_precision)/2 - # max_recall += (max_precision-min_precision)/2 - # else: - # min_precision -= (max_recall-min_recall)/2 - # max_precision += (max_recall-min_recall)/2 - # if min_recall < 0: - # max_recall -= min_recall - # min_recall = 0 - # if min_precision < 0: - # max_precision -= min_precision - # min_precision = 0 - min_precision = max(min_precision - 0.01, 0) - min_recall = max(min_recall - 0.01, 0) - max_precision = min(max_precision + 0.01, 1) - max_recall = min(max_recall + 0.01, 1) - else: - min_precision = min_recall = 0 - max_precision = max_recall = 1 - - # Draw the axis lines & grid lines - for i in range(11): - x = left + (right - left) * ( - (i / 10.0 - min_recall) / (max_recall - min_recall) - ) - y = bot - (bot - top) * ( - (i / 10.0 - min_precision) / (max_precision - min_precision) - ) - if left < x < right: - self.evalbox.create_line(x, top, x, bot, fill="#888") - if top < y < bot: - self.evalbox.create_line(left, y, right, y, fill="#888") - self.evalbox.create_line(left, top, left, bot) - self.evalbox.create_line(left, bot, right, bot) - - # Display the plot's scale - self.evalbox.create_text( - left - 3, - bot, - justify="right", - anchor="se", - text="%d%%" % (100 * min_precision), - ) - self.evalbox.create_text( - left - 3, - top, - justify="right", - anchor="ne", - text="%d%%" % (100 * max_precision), - ) - self.evalbox.create_text( - left, - bot + 3, - justify="center", - anchor="nw", - text="%d%%" % (100 * min_recall), - ) - self.evalbox.create_text( - right, - bot + 3, - justify="center", - anchor="ne", - text="%d%%" % (100 * max_recall), - ) - - # Display the scores. 
- prev_x = prev_y = None - for i, (_, precision, recall, fscore) in enumerate(self._history): - x = left + (right - left) * ( - (recall - min_recall) / (max_recall - min_recall) - ) - y = bot - (bot - top) * ( - (precision - min_precision) / (max_precision - min_precision) - ) - if i == self._history_index: - self.evalbox.create_oval( - x - 2, y - 2, x + 2, y + 2, fill="#0f0", outline="#000" - ) - self.status["text"] = ( - "Precision: %.2f%%\t" % (precision * 100) - + "Recall: %.2f%%\t" % (recall * 100) - + "F-score: %.2f%%" % (fscore * 100) - ) - else: - self.evalbox.lower( - self.evalbox.create_oval( - x - 2, y - 2, x + 2, y + 2, fill="#afa", outline="#8c8" - ) - ) - if prev_x is not None and self._eval_lines.get(): - self.evalbox.lower( - self.evalbox.create_line(prev_x, prev_y, x, y, fill="#8c8") - ) - prev_x, prev_y = x, y - - _eval_demon_running = False - - def _eval_demon(self): - if self.top is None: - return - if self.chunker is None: - self._eval_demon_running = False - return - - # Note our starting time. - t0 = time.time() - - # If are still typing, then wait for them to finish. - if ( - time.time() - self._last_keypress < self._EVAL_DELAY - and self.normalized_grammar != self._eval_normalized_grammar - ): - self._eval_demon_running = True - return self.top.after(int(self._EVAL_FREQ * 1000), self._eval_demon) - - # If the grammar changed, restart the evaluation. - if self.normalized_grammar != self._eval_normalized_grammar: - # Check if we've seen this grammar already. If so, then - # just use the old evaluation values. - for (g, p, r, f) in self._history: - if self.normalized_grammar == self.normalize_grammar(g): - self._history.append((g, p, r, f)) - self._history_index = len(self._history) - 1 - self._eval_plot() - self._eval_demon_running = False - self._eval_normalized_grammar = None - return - self._eval_index = 0 - self._eval_score = ChunkScore(chunk_label=self._chunk_label) - self._eval_grammar = self.grammar - self._eval_normalized_grammar = self.normalized_grammar - - # If the grammar is empty, the don't bother evaluating it, or - # recording it in history -- the score will just be 0. - if self.normalized_grammar.strip() == "": - # self._eval_index = self._devset_size.get() - self._eval_demon_running = False - return - - # Score the next set of examples - for gold in self.devset[ - self._eval_index : min( - self._eval_index + self._EVAL_CHUNK, self._devset_size.get() - ) - ]: - guess = self._chunkparse(gold.leaves()) - self._eval_score.score(gold, guess) - - # update our index in the devset. - self._eval_index += self._EVAL_CHUNK - - # Check if we're done - if self._eval_index >= self._devset_size.get(): - self._history.append( - ( - self._eval_grammar, - self._eval_score.precision(), - self._eval_score.recall(), - self._eval_score.f_measure(), - ) - ) - self._history_index = len(self._history) - 1 - self._eval_plot() - self._eval_demon_running = False - self._eval_normalized_grammar = None - else: - progress = 100 * self._eval_index / self._devset_size.get() - self.status["text"] = "Evaluating on Development Set (%d%%)" % progress - self._eval_demon_running = True - self._adaptively_modify_eval_chunk(time.time() - t0) - self.top.after(int(self._EVAL_FREQ * 1000), self._eval_demon) - - def _adaptively_modify_eval_chunk(self, t): - """ - Modify _EVAL_CHUNK to try to keep the amount of time that the - eval demon takes between _EVAL_DEMON_MIN and _EVAL_DEMON_MAX. - - :param t: The amount of time that the eval demon took. 
- """ - if t > self._EVAL_DEMON_MAX and self._EVAL_CHUNK > 5: - self._EVAL_CHUNK = min( - self._EVAL_CHUNK - 1, - max( - int(self._EVAL_CHUNK * (self._EVAL_DEMON_MAX / t)), - self._EVAL_CHUNK - 10, - ), - ) - elif t < self._EVAL_DEMON_MIN: - self._EVAL_CHUNK = max( - self._EVAL_CHUNK + 1, - min( - int(self._EVAL_CHUNK * (self._EVAL_DEMON_MIN / t)), - self._EVAL_CHUNK + 10, - ), - ) - - def _init_widgets(self, top): - frame0 = Frame(top, **self._FRAME_PARAMS) - frame0.grid_columnconfigure(0, weight=4) - frame0.grid_columnconfigure(3, weight=2) - frame0.grid_rowconfigure(1, weight=1) - frame0.grid_rowconfigure(5, weight=1) - - # The grammar - self.grammarbox = Text(frame0, font=self._font, **self._GRAMMARBOX_PARAMS) - self.grammarlabel = Label( - frame0, - font=self._font, - text="Grammar:", - highlightcolor="black", - background=self._GRAMMARBOX_PARAMS["background"], - ) - self.grammarlabel.grid(column=0, row=0, sticky="SW") - self.grammarbox.grid(column=0, row=1, sticky="NEWS") - - # Scroll bar for grammar - grammar_scrollbar = Scrollbar(frame0, command=self.grammarbox.yview) - grammar_scrollbar.grid(column=1, row=1, sticky="NWS") - self.grammarbox.config(yscrollcommand=grammar_scrollbar.set) - - # grammar buttons - bg = self._FRAME_PARAMS["background"] - frame3 = Frame(frame0, background=bg) - frame3.grid(column=0, row=2, sticky="EW") - Button( - frame3, - text="Prev Grammar", - command=self._history_prev, - **self._BUTTON_PARAMS, - ).pack(side="left") - Button( - frame3, - text="Next Grammar", - command=self._history_next, - **self._BUTTON_PARAMS, - ).pack(side="left") - - # Help box - self.helpbox = Text(frame0, font=self._smallfont, **self._HELPBOX_PARAMS) - self.helpbox.grid(column=3, row=1, sticky="NEWS") - self.helptabs = {} - bg = self._FRAME_PARAMS["background"] - helptab_frame = Frame(frame0, background=bg) - helptab_frame.grid(column=3, row=0, sticky="SW") - for i, (tab, tabstops, text) in enumerate(self.HELP): - label = Label(helptab_frame, text=tab, font=self._smallfont) - label.grid(column=i * 2, row=0, sticky="S") - # help_frame.grid_columnconfigure(i, weight=1) - # label.pack(side='left') - label.bind("", lambda e, tab=tab: self.show_help(tab)) - self.helptabs[tab] = label - Frame( - helptab_frame, height=1, width=self._HELPTAB_SPACER, background=bg - ).grid(column=i * 2 + 1, row=0) - self.helptabs[self.HELP[0][0]].configure(font=self._font) - self.helpbox.tag_config("elide", elide=True) - for (tag, params) in self.HELP_AUTOTAG: - self.helpbox.tag_config("tag-%s" % tag, **params) - self.show_help(self.HELP[0][0]) - - # Scroll bar for helpbox - help_scrollbar = Scrollbar(frame0, command=self.helpbox.yview) - self.helpbox.config(yscrollcommand=help_scrollbar.set) - help_scrollbar.grid(column=4, row=1, sticky="NWS") - - # The dev set - frame4 = Frame(frame0, background=self._FRAME_PARAMS["background"]) - self.devsetbox = Text(frame4, font=self._font, **self._DEVSETBOX_PARAMS) - self.devsetbox.pack(expand=True, fill="both") - self.devsetlabel = Label( - frame0, - font=self._font, - text="Development Set:", - justify="right", - background=self._DEVSETBOX_PARAMS["background"], - ) - self.devsetlabel.grid(column=0, row=4, sticky="SW") - frame4.grid(column=0, row=5, sticky="NEWS") - - # dev set scrollbars - self.devset_scroll = Scrollbar(frame0, command=self._devset_scroll) - self.devset_scroll.grid(column=1, row=5, sticky="NWS") - self.devset_xscroll = Scrollbar( - frame4, command=self.devsetbox.xview, orient="horiz" - ) - self.devsetbox["xscrollcommand"] = self.devset_xscroll.set 
- self.devset_xscroll.pack(side="bottom", fill="x") - - # dev set buttons - bg = self._FRAME_PARAMS["background"] - frame1 = Frame(frame0, background=bg) - frame1.grid(column=0, row=7, sticky="EW") - Button( - frame1, - text="Prev Example (Ctrl-p)", - command=self._devset_prev, - **self._BUTTON_PARAMS, - ).pack(side="left") - Button( - frame1, - text="Next Example (Ctrl-n)", - command=self._devset_next, - **self._BUTTON_PARAMS, - ).pack(side="left") - self.devset_button = Button( - frame1, - text="Show example", - command=self.show_devset, - state="disabled", - **self._BUTTON_PARAMS, - ) - self.devset_button.pack(side="right") - self.trace_button = Button( - frame1, text="Show trace", command=self.show_trace, **self._BUTTON_PARAMS - ) - self.trace_button.pack(side="right") - - # evaluation box - self.evalbox = Canvas(frame0, **self._EVALBOX_PARAMS) - label = Label( - frame0, - font=self._font, - text="Evaluation:", - justify="right", - background=self._EVALBOX_PARAMS["background"], - ) - label.grid(column=3, row=4, sticky="SW") - self.evalbox.grid(column=3, row=5, sticky="NEWS", columnspan=2) - - # evaluation box buttons - bg = self._FRAME_PARAMS["background"] - frame2 = Frame(frame0, background=bg) - frame2.grid(column=3, row=7, sticky="EW") - self._autoscale = IntVar(self.top) - self._autoscale.set(False) - Checkbutton( - frame2, - variable=self._autoscale, - command=self._eval_plot, - text="Zoom", - **self._BUTTON_PARAMS, - ).pack(side="left") - self._eval_lines = IntVar(self.top) - self._eval_lines.set(False) - Checkbutton( - frame2, - variable=self._eval_lines, - command=self._eval_plot, - text="Lines", - **self._BUTTON_PARAMS, - ).pack(side="left") - Button(frame2, text="History", **self._BUTTON_PARAMS).pack(side="right") - - # The status label - self.status = Label(frame0, font=self._font, **self._STATUS_PARAMS) - self.status.grid(column=0, row=9, sticky="NEW", padx=3, pady=2, columnspan=5) - - # Help box & devset box can't be edited. - self.helpbox["state"] = "disabled" - self.devsetbox["state"] = "disabled" - - # Spacers - bg = self._FRAME_PARAMS["background"] - Frame(frame0, height=10, width=0, background=bg).grid(column=0, row=3) - Frame(frame0, height=0, width=10, background=bg).grid(column=2, row=0) - Frame(frame0, height=6, width=0, background=bg).grid(column=0, row=8) - - # pack the frame. 
- frame0.pack(fill="both", expand=True) - - # Set up colors for the devset box - self.devsetbox.tag_config("true-pos", background="#afa", underline="True") - self.devsetbox.tag_config("false-neg", underline="True", foreground="#800") - self.devsetbox.tag_config("false-pos", background="#faa") - self.devsetbox.tag_config("trace", foreground="#666", wrap="none") - self.devsetbox.tag_config("wrapindent", lmargin2=30, wrap="none") - self.devsetbox.tag_config("error", foreground="#800") - - # And for the grammarbox - self.grammarbox.tag_config("error", background="#fec") - self.grammarbox.tag_config("comment", foreground="#840") - self.grammarbox.tag_config("angle", foreground="#00f") - self.grammarbox.tag_config("brace", foreground="#0a0") - self.grammarbox.tag_config("hangindent", lmargin1=0, lmargin2=40) - - _showing_trace = False - - def show_trace(self, *e): - self._showing_trace = True - self.trace_button["state"] = "disabled" - self.devset_button["state"] = "normal" - - self.devsetbox["state"] = "normal" - # self.devsetbox['wrap'] = 'none' - self.devsetbox.delete("1.0", "end") - self.devsetlabel["text"] = "Development Set (%d/%d)" % ( - (self.devset_index + 1, self._devset_size.get()) - ) - - if self.chunker is None: - self.devsetbox.insert("1.0", "Trace: waiting for a valid grammar.") - self.devsetbox.tag_add("error", "1.0", "end") - return # can't do anything more - - gold_tree = self.devset[self.devset_index] - rules = self.chunker.rules() - - # Calculate the tag sequence - tagseq = "\t" - charnum = [1] - for wordnum, (word, pos) in enumerate(gold_tree.leaves()): - tagseq += "%s " % pos - charnum.append(len(tagseq)) - self.charnum = { - (i, j): charnum[j] - for i in range(len(rules) + 1) - for j in range(len(charnum)) - } - self.linenum = {i: i * 2 + 2 for i in range(len(rules) + 1)} - - for i in range(len(rules) + 1): - if i == 0: - self.devsetbox.insert("end", "Start:\n") - self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c") - else: - self.devsetbox.insert("end", "Apply %s:\n" % rules[i - 1]) - self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c") - # Display the tag sequence. - self.devsetbox.insert("end", tagseq + "\n") - self.devsetbox.tag_add("wrapindent", "end -2c linestart", "end -2c") - # Run a partial parser, and extract gold & test chunks - chunker = RegexpChunkParser(rules[:i]) - test_tree = self._chunkparse(gold_tree.leaves()) - gold_chunks = self._chunks(gold_tree) - test_chunks = self._chunks(test_tree) - # Compare them. - for chunk in gold_chunks.intersection(test_chunks): - self._color_chunk(i, chunk, "true-pos") - for chunk in gold_chunks - test_chunks: - self._color_chunk(i, chunk, "false-neg") - for chunk in test_chunks - gold_chunks: - self._color_chunk(i, chunk, "false-pos") - self.devsetbox.insert("end", "Finished.\n") - self.devsetbox.tag_add("trace", "end -2c linestart", "end -2c") - - # This is a hack, because the x-scrollbar isn't updating its - # position right -- I'm not sure what the underlying cause is - # though. 
(This is on OS X w/ python 2.5) - self.top.after(100, self.devset_xscroll.set, 0, 0.3) - - def show_help(self, tab): - self.helpbox["state"] = "normal" - self.helpbox.delete("1.0", "end") - for (name, tabstops, text) in self.HELP: - if name == tab: - text = text.replace( - "<>", - "\n".join( - "\t%s\t%s" % item - for item in sorted( - list(self.tagset.items()), - key=lambda t_w: re.match(r"\w+", t_w[0]) - and (0, t_w[0]) - or (1, t_w[0]), - ) - ), - ) - - self.helptabs[name].config(**self._HELPTAB_FG_PARAMS) - self.helpbox.config(tabs=tabstops) - self.helpbox.insert("1.0", text + "\n" * 20) - C = "1.0 + %d chars" - for (tag, params) in self.HELP_AUTOTAG: - pattern = f"(?s)(<{tag}>)(.*?)()" - for m in re.finditer(pattern, text): - self.helpbox.tag_add("elide", C % m.start(1), C % m.end(1)) - self.helpbox.tag_add( - "tag-%s" % tag, C % m.start(2), C % m.end(2) - ) - self.helpbox.tag_add("elide", C % m.start(3), C % m.end(3)) - else: - self.helptabs[name].config(**self._HELPTAB_BG_PARAMS) - self.helpbox["state"] = "disabled" - - def _history_prev(self, *e): - self._view_history(self._history_index - 1) - return "break" - - def _history_next(self, *e): - self._view_history(self._history_index + 1) - return "break" - - def _view_history(self, index): - # Bounds & sanity checking: - index = max(0, min(len(self._history) - 1, index)) - if not self._history: - return - # Already viewing the requested history item? - if index == self._history_index: - return - # Show the requested grammar. It will get added to _history - # only if they edit it (causing self.update() to get run.) - self.grammarbox["state"] = "normal" - self.grammarbox.delete("1.0", "end") - self.grammarbox.insert("end", self._history[index][0]) - self.grammarbox.mark_set("insert", "1.0") - self._history_index = index - self._syntax_highlight_grammar(self._history[index][0]) - # Record the normalized grammar & regenerate the chunker. - self.normalized_grammar = self.normalize_grammar(self._history[index][0]) - if self.normalized_grammar: - rules = [ - RegexpChunkRule.fromstring(line) - for line in self.normalized_grammar.split("\n") - ] - else: - rules = [] - self.chunker = RegexpChunkParser(rules) - # Show the score. - self._eval_plot() - # Update the devset box - self._highlight_devset() - if self._showing_trace: - self.show_trace() - # Update the grammar label - if self._history_index < len(self._history) - 1: - self.grammarlabel["text"] = "Grammar {}/{}:".format( - self._history_index + 1, - len(self._history), - ) - else: - self.grammarlabel["text"] = "Grammar:" - - def _devset_next(self, *e): - self._devset_scroll("scroll", 1, "page") - return "break" - - def _devset_prev(self, *e): - self._devset_scroll("scroll", -1, "page") - return "break" - - def destroy(self, *e): - if self.top is None: - return - self.top.destroy() - self.top = None - - def _devset_scroll(self, command, *args): - N = 1 # size of a page -- one sentence. 
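# An illustrative, self-contained sketch (toy grammar and sentence, not data from the
# application) of what show_trace above does conceptually: apply longer and longer
# prefixes of the rule list with RegexpChunkParser(rules[:i]) and inspect the
# intermediate chunkings after each rule.
from nltk.chunk.regexp import RegexpChunkParser, RegexpChunkRule

rules = [
    RegexpChunkRule.fromstring("{<DT><NN.*>}"),    # determiner + noun
    RegexpChunkRule.fromstring("{<JJ>*<NN.*>}"),   # remaining adjective/noun groups
]
tagged = [("the", "DT"), ("little", "JJ"), ("cat", "NN"),
          ("sat", "VBD"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]

for i in range(len(rules) + 1):
    partial = RegexpChunkParser(rules[:i], chunk_label="NP")
    print("After %d rule(s):" % i, partial.parse(tagged))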
- showing_trace = self._showing_trace - if command == "scroll" and args[1].startswith("unit"): - self.show_devset(self.devset_index + int(args[0])) - elif command == "scroll" and args[1].startswith("page"): - self.show_devset(self.devset_index + N * int(args[0])) - elif command == "moveto": - self.show_devset(int(float(args[0]) * self._devset_size.get())) - else: - assert 0, f"bad scroll command {command} {args}" - if showing_trace: - self.show_trace() - - def show_devset(self, index=None): - if index is None: - index = self.devset_index - - # Bounds checking - index = min(max(0, index), self._devset_size.get() - 1) - - if index == self.devset_index and not self._showing_trace: - return - self.devset_index = index - - self._showing_trace = False - self.trace_button["state"] = "normal" - self.devset_button["state"] = "disabled" - - # Clear the text box. - self.devsetbox["state"] = "normal" - self.devsetbox["wrap"] = "word" - self.devsetbox.delete("1.0", "end") - self.devsetlabel["text"] = "Development Set (%d/%d)" % ( - (self.devset_index + 1, self._devset_size.get()) - ) - - # Add the sentences - sample = self.devset[self.devset_index : self.devset_index + 1] - self.charnum = {} - self.linenum = {0: 1} - for sentnum, sent in enumerate(sample): - linestr = "" - for wordnum, (word, pos) in enumerate(sent.leaves()): - self.charnum[sentnum, wordnum] = len(linestr) - linestr += f"{word}/{pos} " - self.charnum[sentnum, wordnum + 1] = len(linestr) - self.devsetbox.insert("end", linestr[:-1] + "\n\n") - - # Highlight chunks in the dev set - if self.chunker is not None: - self._highlight_devset() - self.devsetbox["state"] = "disabled" - - # Update the scrollbar - first = self.devset_index / self._devset_size.get() - last = (self.devset_index + 2) / self._devset_size.get() - self.devset_scroll.set(first, last) - - def _chunks(self, tree): - chunks = set() - wordnum = 0 - for child in tree: - if isinstance(child, Tree): - if child.label() == self._chunk_label: - chunks.add((wordnum, wordnum + len(child))) - wordnum += len(child) - else: - wordnum += 1 - return chunks - - def _syntax_highlight_grammar(self, grammar): - if self.top is None: - return - self.grammarbox.tag_remove("comment", "1.0", "end") - self.grammarbox.tag_remove("angle", "1.0", "end") - self.grammarbox.tag_remove("brace", "1.0", "end") - self.grammarbox.tag_add("hangindent", "1.0", "end") - for lineno, line in enumerate(grammar.split("\n")): - if not line.strip(): - continue - m = re.match(r"(\\.|[^#])*(#.*)?", line) - comment_start = None - if m.group(2): - comment_start = m.start(2) - s = "%d.%d" % (lineno + 1, m.start(2)) - e = "%d.%d" % (lineno + 1, m.end(2)) - self.grammarbox.tag_add("comment", s, e) - for m in re.finditer("[<>{}]", line): - if comment_start is not None and m.start() >= comment_start: - break - s = "%d.%d" % (lineno + 1, m.start()) - e = "%d.%d" % (lineno + 1, m.end()) - if m.group() in "<>": - self.grammarbox.tag_add("angle", s, e) - else: - self.grammarbox.tag_add("brace", s, e) - - def _grammarcheck(self, grammar): - if self.top is None: - return - self.grammarbox.tag_remove("error", "1.0", "end") - self._grammarcheck_errs = [] - for lineno, line in enumerate(grammar.split("\n")): - line = re.sub(r"((\\.|[^#])*)(#.*)?", r"\1", line) - line = line.strip() - if line: - try: - RegexpChunkRule.fromstring(line) - except ValueError as e: - self.grammarbox.tag_add( - "error", "%s.0" % (lineno + 1), "%s.0 lineend" % (lineno + 1) - ) - self.status["text"] = "" - - def update(self, *event): - # Record when update was 
called (for grammarcheck) - if event: - self._last_keypress = time.time() - - # Read the grammar from the Text box. - self.grammar = grammar = self.grammarbox.get("1.0", "end") - - # If the grammar hasn't changed, do nothing: - normalized_grammar = self.normalize_grammar(grammar) - if normalized_grammar == self.normalized_grammar: - return - else: - self.normalized_grammar = normalized_grammar - - # If the grammar has changed, and we're looking at history, - # then stop looking at history. - if self._history_index < len(self._history) - 1: - self.grammarlabel["text"] = "Grammar:" - - self._syntax_highlight_grammar(grammar) - - # The grammar has changed; try parsing it. If it doesn't - # parse, do nothing. (flag error location?) - try: - # Note: the normalized grammar has no blank lines. - if normalized_grammar: - rules = [ - RegexpChunkRule.fromstring(line) - for line in normalized_grammar.split("\n") - ] - else: - rules = [] - except ValueError as e: - # Use the un-normalized grammar for error highlighting. - self._grammarcheck(grammar) - self.chunker = None - return - - self.chunker = RegexpChunkParser(rules) - self.grammarbox.tag_remove("error", "1.0", "end") - self.grammar_changed = time.time() - # Display the results - if self._showing_trace: - self.show_trace() - else: - self._highlight_devset() - # Start the eval demon - if not self._eval_demon_running: - self._eval_demon() - - def _highlight_devset(self, sample=None): - if sample is None: - sample = self.devset[self.devset_index : self.devset_index + 1] - - self.devsetbox.tag_remove("true-pos", "1.0", "end") - self.devsetbox.tag_remove("false-neg", "1.0", "end") - self.devsetbox.tag_remove("false-pos", "1.0", "end") - - # Run the grammar on the test cases. - for sentnum, gold_tree in enumerate(sample): - # Run the chunk parser - test_tree = self._chunkparse(gold_tree.leaves()) - # Extract gold & test chunks - gold_chunks = self._chunks(gold_tree) - test_chunks = self._chunks(test_tree) - # Compare them. - for chunk in gold_chunks.intersection(test_chunks): - self._color_chunk(sentnum, chunk, "true-pos") - for chunk in gold_chunks - test_chunks: - self._color_chunk(sentnum, chunk, "false-neg") - for chunk in test_chunks - gold_chunks: - self._color_chunk(sentnum, chunk, "false-pos") - - def _chunkparse(self, words): - try: - return self.chunker.parse(words) - except (ValueError, IndexError) as e: - # There's an error somewhere in the grammar, but we're not sure - # exactly where, so just mark the whole grammar as bad. - # E.g., this is caused by: "({})" - self.grammarbox.tag_add("error", "1.0", "end") - # Treat it as tagging nothing: - return words - - def _color_chunk(self, sentnum, chunk, tag): - start, end = chunk - self.devsetbox.tag_add( - tag, - f"{self.linenum[sentnum]}.{self.charnum[sentnum, start]}", - f"{self.linenum[sentnum]}.{self.charnum[sentnum, end] - 1}", - ) - - def reset(self): - # Clear various variables - self.chunker = None - self.grammar = None - self.normalized_grammar = None - self.grammar_changed = 0 - self._history = [] - self._history_index = 0 - # Update the on-screen display. 
- self.grammarbox.delete("1.0", "end") - self.show_devset(0) - self.update() - # self._eval_plot() - - SAVE_GRAMMAR_TEMPLATE = ( - "# Regexp Chunk Parsing Grammar\n" - "# Saved %(date)s\n" - "#\n" - "# Development set: %(devset)s\n" - "# Precision: %(precision)s\n" - "# Recall: %(recall)s\n" - "# F-score: %(fscore)s\n\n" - "%(grammar)s\n" - ) - - def save_grammar(self, filename=None): - if not filename: - ftypes = [("Chunk Gramamr", ".chunk"), ("All files", "*")] - filename = asksaveasfilename(filetypes=ftypes, defaultextension=".chunk") - if not filename: - return - if self._history and self.normalized_grammar == self.normalize_grammar( - self._history[-1][0] - ): - precision, recall, fscore = ( - "%.2f%%" % (100 * v) for v in self._history[-1][1:] - ) - elif self.chunker is None: - precision = recall = fscore = "Grammar not well formed" - else: - precision = recall = fscore = "Not finished evaluation yet" - - with open(filename, "w") as outfile: - outfile.write( - self.SAVE_GRAMMAR_TEMPLATE - % dict( - date=time.ctime(), - devset=self.devset_name, - precision=precision, - recall=recall, - fscore=fscore, - grammar=self.grammar.strip(), - ) - ) - - def load_grammar(self, filename=None): - if not filename: - ftypes = [("Chunk Gramamr", ".chunk"), ("All files", "*")] - filename = askopenfilename(filetypes=ftypes, defaultextension=".chunk") - if not filename: - return - self.grammarbox.delete("1.0", "end") - self.update() - with open(filename) as infile: - grammar = infile.read() - grammar = re.sub( - r"^\# Regexp Chunk Parsing Grammar[\s\S]*" "F-score:.*\n", "", grammar - ).lstrip() - self.grammarbox.insert("1.0", grammar) - self.update() - - def save_history(self, filename=None): - if not filename: - ftypes = [("Chunk Gramamr History", ".txt"), ("All files", "*")] - filename = asksaveasfilename(filetypes=ftypes, defaultextension=".txt") - if not filename: - return - - with open(filename, "w") as outfile: - outfile.write("# Regexp Chunk Parsing Grammar History\n") - outfile.write("# Saved %s\n" % time.ctime()) - outfile.write("# Development set: %s\n" % self.devset_name) - for i, (g, p, r, f) in enumerate(self._history): - hdr = ( - "Grammar %d/%d (precision=%.2f%%, recall=%.2f%%, " - "fscore=%.2f%%)" - % (i + 1, len(self._history), p * 100, r * 100, f * 100) - ) - outfile.write("\n%s\n" % hdr) - outfile.write("".join(" %s\n" % line for line in g.strip().split())) - - if not ( - self._history - and self.normalized_grammar - == self.normalize_grammar(self._history[-1][0]) - ): - if self.chunker is None: - outfile.write("\nCurrent Grammar (not well-formed)\n") - else: - outfile.write("\nCurrent Grammar (not evaluated)\n") - outfile.write( - "".join(" %s\n" % line for line in self.grammar.strip().split()) - ) - - def about(self, *e): - ABOUT = "NLTK RegExp Chunk Parser Application\n" + "Written by Edward Loper" - TITLE = "About: Regular Expression Chunk Parser Application" - try: - from tkinter.messagebox import Message - - Message(message=ABOUT, title=TITLE).show() - except: - ShowText(self.top, TITLE, ABOUT) - - def set_devset_size(self, size=None): - if size is not None: - self._devset_size.set(size) - self._devset_size.set(min(len(self.devset), self._devset_size.get())) - self.show_devset(1) - self.show_devset(0) - # what about history? Evaluated at diff dev set sizes! 
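# A condensed, standalone sketch of the core loop this application wraps a GUI around:
# turn the grammar text into RegexpChunkRule objects, build a RegexpChunkParser, and
# compare gold chunk spans against predicted ones the same way _chunks() and
# _highlight_devset() above do.  The toy grammar and gold tree are illustrative.
from nltk.chunk.regexp import RegexpChunkParser, RegexpChunkRule
from nltk.tree import Tree

grammar = "{<DT>?<JJ>*<NN.*>}"                      # one rule per non-blank line
rules = [RegexpChunkRule.fromstring(line)
         for line in grammar.split("\n") if line.strip()]
chunker = RegexpChunkParser(rules, chunk_label="NP")

def chunk_spans(tree, label="NP"):
    """Return the set of (start, end) word spans covered by `label` chunks."""
    spans, i = set(), 0
    for child in tree:
        if isinstance(child, Tree):
            if child.label() == label:
                spans.add((i, i + len(child)))
            i += len(child)
        else:
            i += 1
    return spans

gold = Tree("S", [Tree("NP", [("the", "DT"), ("cat", "NN")]), ("sat", "VBD"),
                  ("on", "IN"), Tree("NP", [("the", "DT"), ("mat", "NN")])])
guess = chunker.parse(gold.leaves())

print("true pos: ", chunk_spans(gold) & chunk_spans(guess))
print("false neg:", chunk_spans(gold) - chunk_spans(guess))
print("false pos:", chunk_spans(guess) - chunk_spans(gold))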
- - def resize(self, size=None): - if size is not None: - self._size.set(size) - size = self._size.get() - self._font.configure(size=-(abs(size))) - self._smallfont.configure(size=min(-10, -(abs(size)) * 14 // 20)) - - def mainloop(self, *args, **kwargs): - """ - Enter the Tkinter mainloop. This function must be called if - this demo is created from a non-interactive program (e.g. - from a secript); otherwise, the demo will close as soon as - the script completes. - """ - if in_idle(): - return - self.top.mainloop(*args, **kwargs) - - -def app(): - RegexpChunkApp().mainloop() - - -if __name__ == "__main__": - app() - -__all__ = ["app"] diff --git a/pipeline/nltk/app/collocations_app.py b/pipeline/nltk/app/collocations_app.py deleted file mode 100644 index 19c661368fd9e96d1a4bf1a47ebfbd07a4bb3d80..0000000000000000000000000000000000000000 --- a/pipeline/nltk/app/collocations_app.py +++ /dev/null @@ -1,438 +0,0 @@ -# Natural Language Toolkit: Collocations Application -# Much of the GUI code is imported from concordance.py; We intend to merge these tools together -# Copyright (C) 2001-2023 NLTK Project -# Author: Sumukh Ghodke -# URL: -# For license information, see LICENSE.TXT -# - - -import queue as q -import threading -from tkinter import ( - END, - LEFT, - SUNKEN, - Button, - Frame, - IntVar, - Label, - Menu, - OptionMenu, - Scrollbar, - StringVar, - Text, - Tk, -) -from tkinter.font import Font - -from nltk.corpus import ( - alpino, - brown, - cess_cat, - cess_esp, - floresta, - indian, - mac_morpho, - machado, - nps_chat, - sinica_treebank, - treebank, -) -from nltk.probability import FreqDist -from nltk.util import in_idle - -CORPUS_LOADED_EVENT = "<>" -ERROR_LOADING_CORPUS_EVENT = "<>" -POLL_INTERVAL = 100 - -_DEFAULT = "English: Brown Corpus (Humor)" -_CORPORA = { - "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(), - "English: Brown Corpus": lambda: brown.words(), - "English: Brown Corpus (Press)": lambda: brown.words( - categories=["news", "editorial", "reviews"] - ), - "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"), - "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"), - "English: Brown Corpus (Science Fiction)": lambda: brown.words( - categories="science_fiction" - ), - "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"), - "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"), - "English: NPS Chat Corpus": lambda: nps_chat.words(), - "English: Wall Street Journal Corpus": lambda: treebank.words(), - "Chinese: Sinica Corpus": lambda: sinica_treebank.words(), - "Dutch: Alpino Corpus": lambda: alpino.words(), - "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"), - "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(), - "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(), - "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(), - "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(), -} - - -class CollocationsView: - _BACKGROUND_COLOUR = "#FFF" # white - - def __init__(self): - self.queue = q.Queue() - self.model = CollocationsModel(self.queue) - self.top = Tk() - self._init_top(self.top) - self._init_menubar() - self._init_widgets(self.top) - self.load_corpus(self.model.DEFAULT_CORPUS) - self.after = self.top.after(POLL_INTERVAL, self._poll) - - def _init_top(self, top): - top.geometry("550x650+50+50") - top.title("NLTK Collocations List") - top.bind("", self.destroy) - top.protocol("WM_DELETE_WINDOW", self.destroy) - 
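# An illustrative, stripped-down sketch (not code from this module) of the concurrency
# pattern CollocationsView relies on: a worker thread does the slow corpus loading and
# posts a sentinel onto a queue.Queue, while the Tk side polls that queue with
# top.after(...) (see the _poll method later in this class) so every widget update stays
# on the main thread.
import queue
import threading
import tkinter as tk

POLL_INTERVAL = 100   # milliseconds, as above
events = queue.Queue()

def worker():
    # ... slow work such as loading a corpus would happen here ...
    events.put("CORPUS_LOADED")

def poll(root):
    try:
        event = events.get(block=False)
    except queue.Empty:
        pass
    else:
        print("main thread handles:", event)
    root.after(POLL_INTERVAL, poll, root)

root = tk.Tk()
threading.Thread(target=worker, daemon=True).start()
root.after(POLL_INTERVAL, poll, root)
# root.mainloop()   # left commented so the sketch does not block if run non-interactively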
top.minsize(550, 650) - - def _init_widgets(self, parent): - self.main_frame = Frame( - parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1) - ) - self._init_corpus_select(self.main_frame) - self._init_results_box(self.main_frame) - self._init_paging(self.main_frame) - self._init_status(self.main_frame) - self.main_frame.pack(fill="both", expand=True) - - def _init_corpus_select(self, parent): - innerframe = Frame(parent, background=self._BACKGROUND_COLOUR) - self.var = StringVar(innerframe) - self.var.set(self.model.DEFAULT_CORPUS) - Label( - innerframe, - justify=LEFT, - text=" Corpus: ", - background=self._BACKGROUND_COLOUR, - padx=2, - pady=1, - border=0, - ).pack(side="left") - - other_corpora = list(self.model.CORPORA.keys()).remove( - self.model.DEFAULT_CORPUS - ) - om = OptionMenu( - innerframe, - self.var, - self.model.DEFAULT_CORPUS, - command=self.corpus_selected, - *self.model.non_default_corpora() - ) - om["borderwidth"] = 0 - om["highlightthickness"] = 1 - om.pack(side="left") - innerframe.pack(side="top", fill="x", anchor="n") - - def _init_status(self, parent): - self.status = Label( - parent, - justify=LEFT, - relief=SUNKEN, - background=self._BACKGROUND_COLOUR, - border=0, - padx=1, - pady=0, - ) - self.status.pack(side="top", anchor="sw") - - def _init_menubar(self): - self._result_size = IntVar(self.top) - menubar = Menu(self.top) - - filemenu = Menu(menubar, tearoff=0, borderwidth=0) - filemenu.add_command( - label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q" - ) - menubar.add_cascade(label="File", underline=0, menu=filemenu) - - editmenu = Menu(menubar, tearoff=0) - rescntmenu = Menu(editmenu, tearoff=0) - rescntmenu.add_radiobutton( - label="20", - variable=self._result_size, - underline=0, - value=20, - command=self.set_result_size, - ) - rescntmenu.add_radiobutton( - label="50", - variable=self._result_size, - underline=0, - value=50, - command=self.set_result_size, - ) - rescntmenu.add_radiobutton( - label="100", - variable=self._result_size, - underline=0, - value=100, - command=self.set_result_size, - ) - rescntmenu.invoke(1) - editmenu.add_cascade(label="Result Count", underline=0, menu=rescntmenu) - - menubar.add_cascade(label="Edit", underline=0, menu=editmenu) - self.top.config(menu=menubar) - - def set_result_size(self, **kwargs): - self.model.result_count = self._result_size.get() - - def _init_results_box(self, parent): - innerframe = Frame(parent) - i1 = Frame(innerframe) - i2 = Frame(innerframe) - vscrollbar = Scrollbar(i1, borderwidth=1) - hscrollbar = Scrollbar(i2, borderwidth=1, orient="horiz") - self.results_box = Text( - i1, - font=Font(family="courier", size="16"), - state="disabled", - borderwidth=1, - yscrollcommand=vscrollbar.set, - xscrollcommand=hscrollbar.set, - wrap="none", - width="40", - height="20", - exportselection=1, - ) - self.results_box.pack(side="left", fill="both", expand=True) - vscrollbar.pack(side="left", fill="y", anchor="e") - vscrollbar.config(command=self.results_box.yview) - hscrollbar.pack(side="left", fill="x", expand=True, anchor="w") - hscrollbar.config(command=self.results_box.xview) - # there is no other way of avoiding the overlap of scrollbars while using pack layout manager!!! 
- Label(i2, text=" ", background=self._BACKGROUND_COLOUR).pack( - side="left", anchor="e" - ) - i1.pack(side="top", fill="both", expand=True, anchor="n") - i2.pack(side="bottom", fill="x", anchor="s") - innerframe.pack(side="top", fill="both", expand=True) - - def _init_paging(self, parent): - innerframe = Frame(parent, background=self._BACKGROUND_COLOUR) - self.prev = prev = Button( - innerframe, - text="Previous", - command=self.previous, - width="10", - borderwidth=1, - highlightthickness=1, - state="disabled", - ) - prev.pack(side="left", anchor="center") - self.next = next = Button( - innerframe, - text="Next", - command=self.__next__, - width="10", - borderwidth=1, - highlightthickness=1, - state="disabled", - ) - next.pack(side="right", anchor="center") - innerframe.pack(side="top", fill="y") - self.reset_current_page() - - def reset_current_page(self): - self.current_page = -1 - - def _poll(self): - try: - event = self.queue.get(block=False) - except q.Empty: - pass - else: - if event == CORPUS_LOADED_EVENT: - self.handle_corpus_loaded(event) - elif event == ERROR_LOADING_CORPUS_EVENT: - self.handle_error_loading_corpus(event) - self.after = self.top.after(POLL_INTERVAL, self._poll) - - def handle_error_loading_corpus(self, event): - self.status["text"] = "Error in loading " + self.var.get() - self.unfreeze_editable() - self.clear_results_box() - self.freeze_editable() - self.reset_current_page() - - def handle_corpus_loaded(self, event): - self.status["text"] = self.var.get() + " is loaded" - self.unfreeze_editable() - self.clear_results_box() - self.reset_current_page() - # self.next() - collocations = self.model.next(self.current_page + 1) - self.write_results(collocations) - self.current_page += 1 - - def corpus_selected(self, *args): - new_selection = self.var.get() - self.load_corpus(new_selection) - - def previous(self): - self.freeze_editable() - collocations = self.model.prev(self.current_page - 1) - self.current_page = self.current_page - 1 - self.clear_results_box() - self.write_results(collocations) - self.unfreeze_editable() - - def __next__(self): - self.freeze_editable() - collocations = self.model.next(self.current_page + 1) - self.clear_results_box() - self.write_results(collocations) - self.current_page += 1 - self.unfreeze_editable() - - def load_corpus(self, selection): - if self.model.selected_corpus != selection: - self.status["text"] = "Loading " + selection + "..." 
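# A small self-contained sketch of the scoring used by CollocationsModel.LoadCorpus.run
# later in this file: adjacent-word bigrams are ranked by
# freq(w1, w2) ** 3 / (freq(w1) * freq(w2)) after dropping words of length <= 2.
# The toy word list is illustrative.
from operator import itemgetter
from nltk.probability import FreqDist

words = "the cat sat on the mat and the cat ate the fish".split()
text = [w for w in words if len(w) > 2]                       # same length filter as below
fd = FreqDist(tuple(text[i:i + 2]) for i in range(len(text) - 1))
vocab = FreqDist(text)
scored = [((w1, w2), fd[(w1, w2)] ** 3 / (vocab[w1] * vocab[w2])) for (w1, w2) in fd]
scored.sort(key=itemgetter(1), reverse=True)
print(scored[:3])    # highest-scoring bigrams first, e.g. ('the', 'cat')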
- self.freeze_editable() - self.model.load_corpus(selection) - - def freeze_editable(self): - self.prev["state"] = "disabled" - self.next["state"] = "disabled" - - def clear_results_box(self): - self.results_box["state"] = "normal" - self.results_box.delete("1.0", END) - self.results_box["state"] = "disabled" - - def fire_event(self, event): - # Firing an event so that rendering of widgets happen in the mainloop thread - self.top.event_generate(event, when="tail") - - def destroy(self, *e): - if self.top is None: - return - self.top.after_cancel(self.after) - self.top.destroy() - self.top = None - - def mainloop(self, *args, **kwargs): - if in_idle(): - return - self.top.mainloop(*args, **kwargs) - - def unfreeze_editable(self): - self.set_paging_button_states() - - def set_paging_button_states(self): - if self.current_page == -1 or self.current_page == 0: - self.prev["state"] = "disabled" - else: - self.prev["state"] = "normal" - if self.model.is_last_page(self.current_page): - self.next["state"] = "disabled" - else: - self.next["state"] = "normal" - - def write_results(self, results): - self.results_box["state"] = "normal" - row = 1 - for each in results: - self.results_box.insert(str(row) + ".0", each[0] + " " + each[1] + "\n") - row += 1 - self.results_box["state"] = "disabled" - - -class CollocationsModel: - def __init__(self, queue): - self.result_count = None - self.selected_corpus = None - self.collocations = None - self.CORPORA = _CORPORA - self.DEFAULT_CORPUS = _DEFAULT - self.queue = queue - self.reset_results() - - def reset_results(self): - self.result_pages = [] - self.results_returned = 0 - - def load_corpus(self, name): - self.selected_corpus = name - self.collocations = None - runner_thread = self.LoadCorpus(name, self) - runner_thread.start() - self.reset_results() - - def non_default_corpora(self): - copy = [] - copy.extend(list(self.CORPORA.keys())) - copy.remove(self.DEFAULT_CORPUS) - copy.sort() - return copy - - def is_last_page(self, number): - if number < len(self.result_pages): - return False - return self.results_returned + ( - number - len(self.result_pages) - ) * self.result_count >= len(self.collocations) - - def next(self, page): - if (len(self.result_pages) - 1) < page: - for i in range(page - (len(self.result_pages) - 1)): - self.result_pages.append( - self.collocations[ - self.results_returned : self.results_returned - + self.result_count - ] - ) - self.results_returned += self.result_count - return self.result_pages[page] - - def prev(self, page): - if page == -1: - return [] - return self.result_pages[page] - - class LoadCorpus(threading.Thread): - def __init__(self, name, model): - threading.Thread.__init__(self) - self.model, self.name = model, name - - def run(self): - try: - words = self.model.CORPORA[self.name]() - from operator import itemgetter - - text = [w for w in words if len(w) > 2] - fd = FreqDist(tuple(text[i : i + 2]) for i in range(len(text) - 1)) - vocab = FreqDist(text) - scored = [ - ((w1, w2), fd[(w1, w2)] ** 3 / (vocab[w1] * vocab[w2])) - for w1, w2 in fd - ] - scored.sort(key=itemgetter(1), reverse=True) - self.model.collocations = list(map(itemgetter(0), scored)) - self.model.queue.put(CORPUS_LOADED_EVENT) - except Exception as e: - print(e) - self.model.queue.put(ERROR_LOADING_CORPUS_EVENT) - - -# def collocations(): -# colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations[:num]] - - -def app(): - c = CollocationsView() - c.mainloop() - - -if __name__ == "__main__": - app() - -__all__ = ["app"] diff --git 
a/pipeline/nltk/app/concordance_app.py b/pipeline/nltk/app/concordance_app.py deleted file mode 100644 index 8bd9a991a0a969f87bf03986a915a0af18cd9b5f..0000000000000000000000000000000000000000 --- a/pipeline/nltk/app/concordance_app.py +++ /dev/null @@ -1,709 +0,0 @@ -# Natural Language Toolkit: Concordance Application -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Sumukh Ghodke -# URL: -# For license information, see LICENSE.TXT - -import queue as q -import re -import threading -from tkinter import ( - END, - LEFT, - SUNKEN, - Button, - Entry, - Frame, - IntVar, - Label, - Menu, - OptionMenu, - Scrollbar, - StringVar, - Text, - Tk, -) -from tkinter.font import Font - -from nltk.corpus import ( - alpino, - brown, - cess_cat, - cess_esp, - floresta, - indian, - mac_morpho, - nps_chat, - sinica_treebank, - treebank, -) -from nltk.draw.util import ShowText -from nltk.util import in_idle - -WORD_OR_TAG = "[^/ ]+" -BOUNDARY = r"\b" - -CORPUS_LOADED_EVENT = "<>" -SEARCH_TERMINATED_EVENT = "<>" -SEARCH_ERROR_EVENT = "<>" -ERROR_LOADING_CORPUS_EVENT = "<>" - -POLL_INTERVAL = 50 - -# NB All corpora must be specified in a lambda expression so as not to be -# loaded when the module is imported. - -_DEFAULT = "English: Brown Corpus (Humor, simplified)" -_CORPORA = { - "Catalan: CESS-CAT Corpus (simplified)": lambda: cess_cat.tagged_sents( - tagset="universal" - ), - "English: Brown Corpus": lambda: brown.tagged_sents(), - "English: Brown Corpus (simplified)": lambda: brown.tagged_sents( - tagset="universal" - ), - "English: Brown Corpus (Press, simplified)": lambda: brown.tagged_sents( - categories=["news", "editorial", "reviews"], tagset="universal" - ), - "English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents( - categories="religion", tagset="universal" - ), - "English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents( - categories="learned", tagset="universal" - ), - "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents( - categories="science_fiction", tagset="universal" - ), - "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents( - categories="romance", tagset="universal" - ), - "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents( - categories="humor", tagset="universal" - ), - "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(), - "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts( - tagset="universal" - ), - "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(), - "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents( - tagset="universal" - ), - "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(), - "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents( - tagset="universal" - ), - "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(), - "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents( - tagset="universal" - ), - "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"), - "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents( - files="hindi.pos", tagset="universal" - ), - "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(), - "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents( - tagset="universal" - ), - "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(), - "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents( 
- tagset="universal" - ), - "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents( - tagset="universal" - ), -} - - -class ConcordanceSearchView: - _BACKGROUND_COLOUR = "#FFF" # white - - # Colour of highlighted results - _HIGHLIGHT_WORD_COLOUR = "#F00" # red - _HIGHLIGHT_WORD_TAG = "HL_WRD_TAG" - - _HIGHLIGHT_LABEL_COLOUR = "#C0C0C0" # dark grey - _HIGHLIGHT_LABEL_TAG = "HL_LBL_TAG" - - # Percentage of text left of the scrollbar position - _FRACTION_LEFT_TEXT = 0.30 - - def __init__(self): - self.queue = q.Queue() - self.model = ConcordanceSearchModel(self.queue) - self.top = Tk() - self._init_top(self.top) - self._init_menubar() - self._init_widgets(self.top) - self.load_corpus(self.model.DEFAULT_CORPUS) - self.after = self.top.after(POLL_INTERVAL, self._poll) - - def _init_top(self, top): - top.geometry("950x680+50+50") - top.title("NLTK Concordance Search") - top.bind("", self.destroy) - top.protocol("WM_DELETE_WINDOW", self.destroy) - top.minsize(950, 680) - - def _init_widgets(self, parent): - self.main_frame = Frame( - parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1) - ) - self._init_corpus_select(self.main_frame) - self._init_query_box(self.main_frame) - self._init_results_box(self.main_frame) - self._init_paging(self.main_frame) - self._init_status(self.main_frame) - self.main_frame.pack(fill="both", expand=True) - - def _init_menubar(self): - self._result_size = IntVar(self.top) - self._cntx_bf_len = IntVar(self.top) - self._cntx_af_len = IntVar(self.top) - menubar = Menu(self.top) - - filemenu = Menu(menubar, tearoff=0, borderwidth=0) - filemenu.add_command( - label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-q" - ) - menubar.add_cascade(label="File", underline=0, menu=filemenu) - - editmenu = Menu(menubar, tearoff=0) - rescntmenu = Menu(editmenu, tearoff=0) - rescntmenu.add_radiobutton( - label="20", - variable=self._result_size, - underline=0, - value=20, - command=self.set_result_size, - ) - rescntmenu.add_radiobutton( - label="50", - variable=self._result_size, - underline=0, - value=50, - command=self.set_result_size, - ) - rescntmenu.add_radiobutton( - label="100", - variable=self._result_size, - underline=0, - value=100, - command=self.set_result_size, - ) - rescntmenu.invoke(1) - editmenu.add_cascade(label="Result Count", underline=0, menu=rescntmenu) - - cntxmenu = Menu(editmenu, tearoff=0) - cntxbfmenu = Menu(cntxmenu, tearoff=0) - cntxbfmenu.add_radiobutton( - label="60 characters", - variable=self._cntx_bf_len, - underline=0, - value=60, - command=self.set_cntx_bf_len, - ) - cntxbfmenu.add_radiobutton( - label="80 characters", - variable=self._cntx_bf_len, - underline=0, - value=80, - command=self.set_cntx_bf_len, - ) - cntxbfmenu.add_radiobutton( - label="100 characters", - variable=self._cntx_bf_len, - underline=0, - value=100, - command=self.set_cntx_bf_len, - ) - cntxbfmenu.invoke(1) - cntxmenu.add_cascade(label="Before", underline=0, menu=cntxbfmenu) - - cntxafmenu = Menu(cntxmenu, tearoff=0) - cntxafmenu.add_radiobutton( - label="70 characters", - variable=self._cntx_af_len, - underline=0, - value=70, - command=self.set_cntx_af_len, - ) - cntxafmenu.add_radiobutton( - label="90 characters", - variable=self._cntx_af_len, - underline=0, - value=90, - command=self.set_cntx_af_len, - ) - cntxafmenu.add_radiobutton( - label="110 characters", - variable=self._cntx_af_len, - underline=0, - value=110, - command=self.set_cntx_af_len, - ) - cntxafmenu.invoke(1) - cntxmenu.add_cascade(label="After", underline=0, 
menu=cntxafmenu) - - editmenu.add_cascade(label="Context", underline=0, menu=cntxmenu) - - menubar.add_cascade(label="Edit", underline=0, menu=editmenu) - - self.top.config(menu=menubar) - - def set_result_size(self, **kwargs): - self.model.result_count = self._result_size.get() - - def set_cntx_af_len(self, **kwargs): - self._char_after = self._cntx_af_len.get() - - def set_cntx_bf_len(self, **kwargs): - self._char_before = self._cntx_bf_len.get() - - def _init_corpus_select(self, parent): - innerframe = Frame(parent, background=self._BACKGROUND_COLOUR) - self.var = StringVar(innerframe) - self.var.set(self.model.DEFAULT_CORPUS) - Label( - innerframe, - justify=LEFT, - text=" Corpus: ", - background=self._BACKGROUND_COLOUR, - padx=2, - pady=1, - border=0, - ).pack(side="left") - - other_corpora = list(self.model.CORPORA.keys()).remove( - self.model.DEFAULT_CORPUS - ) - om = OptionMenu( - innerframe, - self.var, - self.model.DEFAULT_CORPUS, - command=self.corpus_selected, - *self.model.non_default_corpora() - ) - om["borderwidth"] = 0 - om["highlightthickness"] = 1 - om.pack(side="left") - innerframe.pack(side="top", fill="x", anchor="n") - - def _init_status(self, parent): - self.status = Label( - parent, - justify=LEFT, - relief=SUNKEN, - background=self._BACKGROUND_COLOUR, - border=0, - padx=1, - pady=0, - ) - self.status.pack(side="top", anchor="sw") - - def _init_query_box(self, parent): - innerframe = Frame(parent, background=self._BACKGROUND_COLOUR) - another = Frame(innerframe, background=self._BACKGROUND_COLOUR) - self.query_box = Entry(another, width=60) - self.query_box.pack(side="left", fill="x", pady=25, anchor="center") - self.search_button = Button( - another, - text="Search", - command=self.search, - borderwidth=1, - highlightthickness=1, - ) - self.search_button.pack(side="left", fill="x", pady=25, anchor="center") - self.query_box.bind("", self.search_enter_keypress_handler) - another.pack() - innerframe.pack(side="top", fill="x", anchor="n") - - def search_enter_keypress_handler(self, *event): - self.search() - - def _init_results_box(self, parent): - innerframe = Frame(parent) - i1 = Frame(innerframe) - i2 = Frame(innerframe) - vscrollbar = Scrollbar(i1, borderwidth=1) - hscrollbar = Scrollbar(i2, borderwidth=1, orient="horiz") - self.results_box = Text( - i1, - font=Font(family="courier", size="16"), - state="disabled", - borderwidth=1, - yscrollcommand=vscrollbar.set, - xscrollcommand=hscrollbar.set, - wrap="none", - width="40", - height="20", - exportselection=1, - ) - self.results_box.pack(side="left", fill="both", expand=True) - self.results_box.tag_config( - self._HIGHLIGHT_WORD_TAG, foreground=self._HIGHLIGHT_WORD_COLOUR - ) - self.results_box.tag_config( - self._HIGHLIGHT_LABEL_TAG, foreground=self._HIGHLIGHT_LABEL_COLOUR - ) - vscrollbar.pack(side="left", fill="y", anchor="e") - vscrollbar.config(command=self.results_box.yview) - hscrollbar.pack(side="left", fill="x", expand=True, anchor="w") - hscrollbar.config(command=self.results_box.xview) - # there is no other way of avoiding the overlap of scrollbars while using pack layout manager!!! 
- Label(i2, text=" ", background=self._BACKGROUND_COLOUR).pack( - side="left", anchor="e" - ) - i1.pack(side="top", fill="both", expand=True, anchor="n") - i2.pack(side="bottom", fill="x", anchor="s") - innerframe.pack(side="top", fill="both", expand=True) - - def _init_paging(self, parent): - innerframe = Frame(parent, background=self._BACKGROUND_COLOUR) - self.prev = prev = Button( - innerframe, - text="Previous", - command=self.previous, - width="10", - borderwidth=1, - highlightthickness=1, - state="disabled", - ) - prev.pack(side="left", anchor="center") - self.next = next = Button( - innerframe, - text="Next", - command=self.__next__, - width="10", - borderwidth=1, - highlightthickness=1, - state="disabled", - ) - next.pack(side="right", anchor="center") - innerframe.pack(side="top", fill="y") - self.current_page = 0 - - def previous(self): - self.clear_results_box() - self.freeze_editable() - self.model.prev(self.current_page - 1) - - def __next__(self): - self.clear_results_box() - self.freeze_editable() - self.model.next(self.current_page + 1) - - def about(self, *e): - ABOUT = "NLTK Concordance Search Demo\n" - TITLE = "About: NLTK Concordance Search Demo" - try: - from tkinter.messagebox import Message - - Message(message=ABOUT, title=TITLE, parent=self.main_frame).show() - except: - ShowText(self.top, TITLE, ABOUT) - - def _bind_event_handlers(self): - self.top.bind(CORPUS_LOADED_EVENT, self.handle_corpus_loaded) - self.top.bind(SEARCH_TERMINATED_EVENT, self.handle_search_terminated) - self.top.bind(SEARCH_ERROR_EVENT, self.handle_search_error) - self.top.bind(ERROR_LOADING_CORPUS_EVENT, self.handle_error_loading_corpus) - - def _poll(self): - try: - event = self.queue.get(block=False) - except q.Empty: - pass - else: - if event == CORPUS_LOADED_EVENT: - self.handle_corpus_loaded(event) - elif event == SEARCH_TERMINATED_EVENT: - self.handle_search_terminated(event) - elif event == SEARCH_ERROR_EVENT: - self.handle_search_error(event) - elif event == ERROR_LOADING_CORPUS_EVENT: - self.handle_error_loading_corpus(event) - self.after = self.top.after(POLL_INTERVAL, self._poll) - - def handle_error_loading_corpus(self, event): - self.status["text"] = "Error in loading " + self.var.get() - self.unfreeze_editable() - self.clear_all() - self.freeze_editable() - - def handle_corpus_loaded(self, event): - self.status["text"] = self.var.get() + " is loaded" - self.unfreeze_editable() - self.clear_all() - self.query_box.focus_set() - - def handle_search_terminated(self, event): - # todo: refactor the model such that it is less state sensitive - results = self.model.get_results() - self.write_results(results) - self.status["text"] = "" - if len(results) == 0: - self.status["text"] = "No results found for " + self.model.query - else: - self.current_page = self.model.last_requested_page - self.unfreeze_editable() - self.results_box.xview_moveto(self._FRACTION_LEFT_TEXT) - - def handle_search_error(self, event): - self.status["text"] = "Error in query " + self.model.query - self.unfreeze_editable() - - def corpus_selected(self, *args): - new_selection = self.var.get() - self.load_corpus(new_selection) - - def load_corpus(self, selection): - if self.model.selected_corpus != selection: - self.status["text"] = "Loading " + selection + "..." 
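# A condensed, standalone version of the query translation implemented in
# ConcordanceSearchModel.SearchCorpus.processed_query later in this file.  Sentences are
# stored as "word/tag word/tag ..." strings; an all-caps query term is treated as a tag,
# a term containing "/" as a literal word/tag pair, and anything else as a word with any
# tag.  The sample sentence is illustrative.
import re

WORD_OR_TAG = "[^/ ]+"
BOUNDARY = r"\b"

def processed_query(query):
    parts = []
    for term in query.split():
        term = re.sub(r"\.", r"[^/ ]", term)      # "." matches any char except "/" or space
        if re.match("[A-Z]+$", term):             # bare tag, e.g. "NN"
            parts.append(BOUNDARY + WORD_OR_TAG + "/" + term + BOUNDARY)
        elif "/" in term:                         # explicit word/tag pair
            parts.append(BOUNDARY + term + BOUNDARY)
        else:                                     # bare word, any tag
            parts.append(BOUNDARY + term + "/" + WORD_OR_TAG + BOUNDARY)
    return " ".join(parts)

sent = " ".join(w + "/" + t for (w, t) in
                [("the", "DT"), ("cat", "NN"), ("sat", "VBD")])
print(processed_query("NN sat"))                          # \b[^/ ]+/NN\b \bsat/[^/ ]+\b
print(bool(re.search(processed_query("NN sat"), sent)))   # True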
- self.freeze_editable() - self.model.load_corpus(selection) - - def search(self): - self.current_page = 0 - self.clear_results_box() - self.model.reset_results() - query = self.query_box.get() - if len(query.strip()) == 0: - return - self.status["text"] = "Searching for " + query - self.freeze_editable() - self.model.search(query, self.current_page + 1) - - def write_results(self, results): - self.results_box["state"] = "normal" - row = 1 - for each in results: - sent, pos1, pos2 = each[0].strip(), each[1], each[2] - if len(sent) != 0: - if pos1 < self._char_before: - sent, pos1, pos2 = self.pad(sent, pos1, pos2) - sentence = sent[pos1 - self._char_before : pos1 + self._char_after] - if not row == len(results): - sentence += "\n" - self.results_box.insert(str(row) + ".0", sentence) - word_markers, label_markers = self.words_and_labels(sent, pos1, pos2) - for marker in word_markers: - self.results_box.tag_add( - self._HIGHLIGHT_WORD_TAG, - str(row) + "." + str(marker[0]), - str(row) + "." + str(marker[1]), - ) - for marker in label_markers: - self.results_box.tag_add( - self._HIGHLIGHT_LABEL_TAG, - str(row) + "." + str(marker[0]), - str(row) + "." + str(marker[1]), - ) - row += 1 - self.results_box["state"] = "disabled" - - def words_and_labels(self, sentence, pos1, pos2): - search_exp = sentence[pos1:pos2] - words, labels = [], [] - labeled_words = search_exp.split(" ") - index = 0 - for each in labeled_words: - if each == "": - index += 1 - else: - word, label = each.split("/") - words.append( - (self._char_before + index, self._char_before + index + len(word)) - ) - index += len(word) + 1 - labels.append( - (self._char_before + index, self._char_before + index + len(label)) - ) - index += len(label) - index += 1 - return words, labels - - def pad(self, sent, hstart, hend): - if hstart >= self._char_before: - return sent, hstart, hend - d = self._char_before - hstart - sent = "".join([" "] * d) + sent - return sent, hstart + d, hend + d - - def destroy(self, *e): - if self.top is None: - return - self.top.after_cancel(self.after) - self.top.destroy() - self.top = None - - def clear_all(self): - self.query_box.delete(0, END) - self.model.reset_query() - self.clear_results_box() - - def clear_results_box(self): - self.results_box["state"] = "normal" - self.results_box.delete("1.0", END) - self.results_box["state"] = "disabled" - - def freeze_editable(self): - self.query_box["state"] = "disabled" - self.search_button["state"] = "disabled" - self.prev["state"] = "disabled" - self.next["state"] = "disabled" - - def unfreeze_editable(self): - self.query_box["state"] = "normal" - self.search_button["state"] = "normal" - self.set_paging_button_states() - - def set_paging_button_states(self): - if self.current_page == 0 or self.current_page == 1: - self.prev["state"] = "disabled" - else: - self.prev["state"] = "normal" - if self.model.has_more_pages(self.current_page): - self.next["state"] = "normal" - else: - self.next["state"] = "disabled" - - def fire_event(self, event): - # Firing an event so that rendering of widgets happen in the mainloop thread - self.top.event_generate(event, when="tail") - - def mainloop(self, *args, **kwargs): - if in_idle(): - return - self.top.mainloop(*args, **kwargs) - - -class ConcordanceSearchModel: - def __init__(self, queue): - self.queue = queue - self.CORPORA = _CORPORA - self.DEFAULT_CORPUS = _DEFAULT - self.selected_corpus = None - self.reset_query() - self.reset_results() - self.result_count = None - self.last_sent_searched = 0 - - def 
non_default_corpora(self): - copy = [] - copy.extend(list(self.CORPORA.keys())) - copy.remove(self.DEFAULT_CORPUS) - copy.sort() - return copy - - def load_corpus(self, name): - self.selected_corpus = name - self.tagged_sents = [] - runner_thread = self.LoadCorpus(name, self) - runner_thread.start() - - def search(self, query, page): - self.query = query - self.last_requested_page = page - self.SearchCorpus(self, page, self.result_count).start() - - def next(self, page): - self.last_requested_page = page - if len(self.results) < page: - self.search(self.query, page) - else: - self.queue.put(SEARCH_TERMINATED_EVENT) - - def prev(self, page): - self.last_requested_page = page - self.queue.put(SEARCH_TERMINATED_EVENT) - - def reset_results(self): - self.last_sent_searched = 0 - self.results = [] - self.last_page = None - - def reset_query(self): - self.query = None - - def set_results(self, page, resultset): - self.results.insert(page - 1, resultset) - - def get_results(self): - return self.results[self.last_requested_page - 1] - - def has_more_pages(self, page): - if self.results == [] or self.results[0] == []: - return False - if self.last_page is None: - return True - return page < self.last_page - - class LoadCorpus(threading.Thread): - def __init__(self, name, model): - threading.Thread.__init__(self) - self.model, self.name = model, name - - def run(self): - try: - ts = self.model.CORPORA[self.name]() - self.model.tagged_sents = [ - " ".join(w + "/" + t for (w, t) in sent) for sent in ts - ] - self.model.queue.put(CORPUS_LOADED_EVENT) - except Exception as e: - print(e) - self.model.queue.put(ERROR_LOADING_CORPUS_EVENT) - - class SearchCorpus(threading.Thread): - def __init__(self, model, page, count): - self.model, self.count, self.page = model, count, page - threading.Thread.__init__(self) - - def run(self): - q = self.processed_query() - sent_pos, i, sent_count = [], 0, 0 - for sent in self.model.tagged_sents[self.model.last_sent_searched :]: - try: - m = re.search(q, sent) - except re.error: - self.model.reset_results() - self.model.queue.put(SEARCH_ERROR_EVENT) - return - if m: - sent_pos.append((sent, m.start(), m.end())) - i += 1 - if i > self.count: - self.model.last_sent_searched += sent_count - 1 - break - sent_count += 1 - if self.count >= len(sent_pos): - self.model.last_sent_searched += sent_count - 1 - self.model.last_page = self.page - self.model.set_results(self.page, sent_pos) - else: - self.model.set_results(self.page, sent_pos[:-1]) - self.model.queue.put(SEARCH_TERMINATED_EVENT) - - def processed_query(self): - new = [] - for term in self.model.query.split(): - term = re.sub(r"\.", r"[^/ ]", term) - if re.match("[A-Z]+$", term): - new.append(BOUNDARY + WORD_OR_TAG + "/" + term + BOUNDARY) - elif "/" in term: - new.append(BOUNDARY + term + BOUNDARY) - else: - new.append(BOUNDARY + term + "/" + WORD_OR_TAG + BOUNDARY) - return " ".join(new) - - -def app(): - d = ConcordanceSearchView() - d.mainloop() - - -if __name__ == "__main__": - app() - -__all__ = ["app"] diff --git a/pipeline/nltk/app/nemo_app.py b/pipeline/nltk/app/nemo_app.py deleted file mode 100644 index df0ceb1be59e40bb48289f4f1411653789ca7a17..0000000000000000000000000000000000000000 --- a/pipeline/nltk/app/nemo_app.py +++ /dev/null @@ -1,163 +0,0 @@ -# Finding (and Replacing) Nemo, Version 1.1, Aristide Grange 2006/06/06 -# https://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/496783 - -""" -Finding (and Replacing) Nemo - -Instant Regular Expressions -Created by Aristide Grange -""" -import itertools 
-import re -from tkinter import SEL_FIRST, SEL_LAST, Frame, Label, PhotoImage, Scrollbar, Text, Tk - -windowTitle = "Finding (and Replacing) Nemo" -initialFind = r"n(.*?)e(.*?)m(.*?)o" -initialRepl = r"M\1A\2K\3I" -initialText = """\ -Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. -Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. -Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. -Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. -""" -images = { - "FIND": "R0lGODlhMAAiAPcAMf/////37//35//n1v97Off///f/9/f37/fexvfOvfeEQvd7QvdrQvdrKfdaKfdSMfdSIe/v9+/v7+/v5+/n3u/e1u/Wxu/Gre+1lO+tnO+thO+Ua+97Y+97Oe97Me9rOe9rMe9jOe9jMe9jIe9aMefe5+fe3ufezuece+eEWudzQudaIedSIedKMedKIedCKedCId7e1t7Wzt7Oxt7Gvd69vd69rd61pd6ljN6UjN6Ue96EY95zY95rUt5rQt5jMd5SId5KIdbn59be3tbGztbGvda1rdaEa9Z7a9Z7WtZzQtZzOdZzMdZjMdZaQtZSOdZSMdZKMdZCKdZCGNY5Ic7W1s7Oxs7Gtc69xs69tc69rc6tpc6llM6clM6cjM6Ue86EY85zWs5rSs5SKc5KKc5KGMa1tcatrcalvcalnMaUpcZ7c8ZzMcZrUsZrOcZrMcZaQsZSOcZSMcZKMcZCKcZCGMYxIcYxGL3Gxr21tb21rb2lpb2crb2cjL2UnL2UlL2UhL2Ec717Wr17Ur1zWr1rMb1jUr1KMb1KIb1CIb0xGLWlrbWlpbWcnLWEe7V7c7VzY7VzUrVSKbVKMbVCMbVCIbU5KbUxIbUxEK2lta2lpa2clK2UjK2MnK2MlK2Ea617e61za61rY61rMa1jSq1aUq1aSq1SQq1KKa0xEKWlnKWcnKWUnKWUhKWMjKWEa6Vza6VrWqVjMaVaUqVaKaVSMaVCMaU5KaUxIaUxGJyclJyMe5yElJyEhJx7e5x7c5xrOZxaQpxSOZxKQpw5IZSMhJSEjJR7c5Rre5RrY5RrUpRSQpRSKZRCOZRCKZQxKZQxIYyEhIx7hIxza4xzY4xrc4xjUoxaa4xaUoxSSoxKQoxCMYw5GIR7c4Rzc4Rre4RjY4RjWoRaa4RSWoRSUoRSMYRKQoRCOYQ5KYQxIXtra3taY3taSntKOXtCMXtCKXNCMXM5MXMxIWtSUmtKSmtKQmtCOWs5MWs5KWs5IWNCKWMxIVIxKUIQCDkhGAAAACH+AS4ALAAAAAAwACIAAAj/AAEIHEiwoMGDCBMqXMiwoUOHMqxIeEiRoZVp7cpZ29WrF4WKIAd208dGAQEVbiTVChUjZMU9+pYQmPmBZpxgvVw+nDdKwQICNVcIXQEkTgKdDdUJ+/nggVAXK1xI3TEA6UIr2uJ8iBqka1cXXTlkqGoVYRZ7iLyqBSs0iiEtZQVKiDGxBI1u3NR6lUpGDKg8MSgEQCphU7Z22vhg0dILXRCpYLuSCcYJT4wqXASBQaBzU7klHxC127OHD7ZDJFpERqRt0x5OnwQpmZmCLEhrbgg4WIHO1RY+nbQ9WRGEDJlmnXwJ+9FBgXMCIzYMVijBBgYMFxIMqJBMSc0Ht7qh/+Gjpte2rnYsYeNlasWIBgQ6yCewIoPCCp/cyP/wgUGbXVu0QcADZNBDnh98gHMLGXYQUw02w61QU3wdbNWDbQVVIIhMMwFF1DaZiPLBAy7E04kafrjSizaK3LFNNc0AAYRQDsAHHQlJ2IDQJ2zE1+EKDjiAijShkECCC8Qgw4cr7ZgyzC2WaHPNLWWoNeNWPiRAw0QFWQFMhz8C+QQ20yAiVSrY+MGOJCsccsst2GCzoHFxxEGGC+8hgs0MB2kyCpgzrUDCbs1Es41UdtATHFFkWELMOtsoQsYcgvRRQw5RSDgGOjZMR1AvPQIq6KCo9AKOJWDd48owQlHR4DXEKP9iyRrK+DNNBTu4RwIPFeTAGUG7hAomkA84gEg1m6ADljy9PBKGGJY4ig0xlsTBRSn98FOFDUC8pwQOPkgHbCGAzhTkA850s0c7j6Hjix9+gBIrMXLeAccWXUCyiRBcBEECdEJ98KtAqtBCYQc/OvDENnl4gYpUxISCIjjzylkGGV9okYUVNogRhAOBuuAEhjG08wOgDYzAgA5bCjIoCe5uwUk80RKTTSppPREGGGCIISOQ9AXBg6cC6WIywvCpoMHAocRBwhP4bHLFLujYkV42xNxBRhAyGrc113EgYtRBerDDDHMoDCyQEL5sE083EkgwQyBhxGFHMM206DUixGxmE0wssbQjCQ4JCaFKFwgQTVAVVhQUwAVPIFJKrHfYYRwi6OCDzzuIJIFhXAD0EccPsYRiSyqKSDpFcWSMIcZRoBMkQyA2BGZDIKSYcggih8TRRg4VxM5QABVYYLxgwiev/PLMCxQQADs=", - "find": 
"R0lGODlhMAAiAPQAMf////f39+/v7+fn597e3tbW1s7OzsbGxr29vbW1ta2traWlpZycnJSUlIyMjISEhHt7e3Nzc2tra2NjY1paWlJSUkpKSkJCQjk5OSkpKRgYGAAAAAAAAAAAAAAAAAAAACH+AS4ALAAAAAAwACIAAAX/ICCOZGmeaKquY2AGLiuvMCAUBuHWc48Kh0iFInEYCb4kSQCxPBiMxkMigRQEgJiSFVBYHNGG0RiZOHjblWAiiY4fkDhEYoBp06dAWfyAQyKAgAwDaHgnB0RwgYASgQ0IhDuGJDAIFhMRVFSLEX8QCJJ4AQM5AgQHTZqqjBAOCQQEkWkCDRMUFQsICQ4Vm5maEwwHOAsPDTpKMAsUDlO4CssTcb+2DAp8YGCyNFoCEsZwFQ3QDRTTVBRS0g1QbgsCd5QAAwgIBwYFAwStzQ8UEdCKVchky0yVBw7YuXkAKt4IAg74vXHVagqFBRgXSCAyYWAVCH0SNhDTitCJfSL5/4RbAPKPhQYYjVCYYAvCP0BxEDaD8CheAAHNwqh8MMGPSwgLeJWhwHSjqkYI+xg4MMCEgQjtRvZ7UAYCpghMF7CxONOWJkYR+rCpY4JlVpVxKDwYWEactKW9mhYRtqCTgwgWEMArERSK1j5q//6T8KXonFsShpiJkAECgQYVjykooCVA0JGHEWNiYCHThTFeb3UkoiCCBgwGEKQ1kuAJlhFwhA71h5SukwUM5qqeCSGBgicEWkfNiWSERtBad4JNIBaQBaQah1ToyGZBAnsIuIJs1qnqiAIVjIE2gnAB1T5x0icgzXT79ipgMOOEH6HBbREBMJCeGEY08IoLAkzB1YYFwjxwSUGSNULQJnNUwRYlCcyEkALIxECAP9cNMMABYpRhy3ZsSLDaR70oUAiABGCkAxowCGCAAfDYIQACXoElGRsdXWDBdg2Y90IWktDYGYAB9PWHP0PMdFZaF07SQgAFNDAMAQg0QA1UC8xoZQl22JGFPgWkOUCOL1pZQyhjxinnnCWEAAA7", - "REPL": "R0lGODlhMAAjAPcAMf/////3//+lOf+UKf+MEPf///f39/f35/fv7/ecQvecOfecKfeUIfeUGPeUEPeUCPeMAO/37+/v9+/v3u/n3u/n1u+9jO+9c++1hO+ta++tY++tWu+tUu+tSu+lUu+lQu+lMe+UMe+UKe+UGO+UEO+UAO+MCOfv5+fvxufn7+fn5+fnzue9lOe9c+e1jOe1e+e1c+e1a+etWuetUuelQuecOeeUUueUCN7e597e3t7e1t7ezt7evd7Wzt7Oxt7Ovd7Otd7Opd7OnN7Gtd7Gpd69lN61hN6ta96lStbextberdbW3tbWztbWxtbOvdbOrda1hNalUtaECM7W1s7Ozs7Oxs7Otc7Gxs7Gvc69tc69rc69pc61jM6lc8bWlMbOvcbGxsbGpca9tca9pca1nMaMAL3OhL3Gtb21vb21tb2tpb2tnL2tlLW9tbW9pbW9e7W1pbWtjLWcKa21nK2tra2tnK2tlK2lpa2llK2ljK2le6WlnKWljKWUe6WUc6WUY5y1QpyclJycjJychJyUc5yMY5StY5SUe5SMhJSMe5SMc5SMWpSEa5SESoyUe4yMhIyEY4SlKYScWoSMe4SEe4SEa4R7c4R7Y3uMY3uEe3t7e3t7c3tza3tzY3trKXtjIXOcAHOUMXOEY3Nzc3NzWnNrSmulCGuUMWuMGGtzWmtrY2taMWtaGGOUOWOMAGNzUmNjWmNjSmNaUmNaQmNaOWNaIWNSCFqcAFpjUlpSMVpSIVpSEFpKKVKMAFJSUlJSSlJSMVJKMVJKGFJKAFI5CEqUAEqEAEpzQkpKIUpCQkpCGEpCAEo5EEoxAEJjOUJCOUJCAEI5IUIxADl7ADlaITlCOTkxMTkxKTkxEDkhADFzADFrGDE5OTExADEpEClrCCkxKSkpKSkpISkpACkhCCkhACkYACFzACFrACEhCCEYGBhjEBhjABghABgYCBgYABgQEBgQABAQABAIAAhjAAhSAAhKAAgIEAgICABaAABCAAAhAAAQAAAIAAAAAAAAACH+AS4ALAAAAAAwACMAAAj/AAEIHEiwoMGDCBMqXMiwocOHAA4cgEixIIIJO3JMmAjADIqKFU/8MHIkg5EgYXx4iaTkI0iHE6wE2TCggYILQayEAgXIy8uGCKz8sDCAQAMRG3iEcXULlJkJPwli3OFjh9UdYYLE6NBhA04UXHoVA2XoTZgfPKBWlOBDphAWOdfMcfMDLloeO3hIMjbWVCQ5Fn6E2UFxgpsgFjYIEBADrZU6luqEEfqjTqpt54z1uuWqTIcgWAk7PECGzIUQDRosDmxlUrVJkwQJkqVuX71v06YZcyUlROAdbnLAJKPFyAYFAhoMwFlnEh0rWkpz8raPHm7dqKKc/KFFkBUrVn1M/ziBcEIeLUEQI8/AYk0i9Be4sqjsrN66c9/OnbobhpR3HkIUoZ0WVnBE0AGLFKKFD0HAFUQe77HQgQI1hRBDEHMcY0899bBzihZuCPILJD8EccEGGzwAQhFaUHHQH82sUkgeNHISDBk8WCCCcsqFUEQWmOyzjz3sUGNNOO5Y48YOEgowAAQhnBScQV00k82V47jzjy9CXZBcjziFoco//4CDiSOyhPMPLkJZkEBqJmRQxA9uZGEQD8Ncmc044/zzDF2IZQBCCDYE8QMZz/iiCSx0neHGI7BIhhhNn+1gxRpokEcQAp7seWU7/PwTyxqG/iCEEVzQmUombnDRxRExzP9nBR2PCKLFD3UJwcMPa/SRqUGNWJmNOVn+M44ukMRB4KGcWDNLVhuUMEIJAlzwA3DJBHMJIXm4sQYhqyxCRQQGLSIsn1qac2UzysQSyzX/hLMGD0F0IMCODYAQBA9W/PKPOcRiw0wzwxTiokF9dLMnuv/Mo+fCZF7jBr0xbDDCACWEYKgb1vzjDp/jZNOMLX0IZxAKq2TZTjtaOjwOsXyG+s8sZJTIQsUdIGHoJPf8w487QI/TDSt5mGwQFZxc406o8HiDJchk/ltLHpSlJwSvz5DpTjvmuGNOM57koelBOaAhiCaaPBLL0wwbm003peRBnBZqJMJL1ECz/HXYYx/NdAIOOVCxQyLorswymU93o0wuwfAiTDNR/xz0MLXU0XdCE+UwSTRZAq2lsSATu+4wkGvt+TjNzPLrQyegAUku2Hij5cd8LhxyM8QIg4w18HgcdC6BTBFSDmfQqsovttveDcG7lFLHI75cE841sARCxeWsnxC4G9HADPK6ywzDCRqBo0EHHWhMgT1IJzziNci1N7PMKnSYfML96/90AiJKey/0KtbLX1QK0rrNnQ541xugQ7SHhkXBghN0SKACWRc4KlAhBwKcIOYymJCAAAA7", - "repl": 
"R0lGODlhMAAjAPQAMf////f39+/v7+fn597e3tbW1s7OzsbGxr29vbW1ta2traWlpZycnJSUlIyMjISEhHt7e3Nzc2tra2NjY1paWlJSUkpKSkJCQjk5OTExMSkpKSEhIRgYGBAQEAgICAAAACH+AS4ALAAAAAAwACMAAAX/ICCOZGmeaKqubOu+gCDANBkIQ1EMQhAghFptYEAkEgjEwXBo7ISvweGgWCwUysPjwTgEoCafTySYIhYMxgLBjEQgCULvCw0QdAZdoVhUIJUFChISEAxYeQM1N1OMTAp+UwZ5eA4TEhFbDWYFdC4ECVMJjwl5BwsQa0umEhUVlhESDgqlBp0rAn5nVpBMDxeZDRQbHBgWFBSWDgtLBnFjKwRYCI9VqQsPs0YKEcMXFq0UEalFDWx4BAO2IwPjppAKDkrTWKYUGd7fEJJFEZpM00cOzCgh4EE8SaoWxKNixQooBRMyZMBwAYIRBhUgLDGS4MoBJeoANMhAgQsaCRZm/5lqaCUJhA4cNHjDoKEDBlJUHqkBlYBTiQUZNGjYMMxDhY3VWk6R4MEDBoMUak5AqoYBqANIBo4wcGGDUKIeLlzVZmWJggsVIkwAZaQSA3kdZzlKkIiEAAlDvW5oOkEBs488JTw44oeUIwdvVTFTUK7uiAAPgubt8GFDhQepqETAQCFU1UMGzlqAgFhUsAcCS0AO6lUDhw8xNRSbENGDhgWSHjWUe6ACbKITizmopZoBa6KvOwj9uuHDhwxyj3xekgDDhw5EvWKo0IB4iQLCOCC/njc7ZQ8UeGvza+ABZZgcxJNc4FO1gc0cOsCUrHevc8tdIMTIAhc4F198G2Qwwd8CBIQUAwEINABBBJUwR9R5wElgVRLwWODBBx4cGB8GEzDQIAo33CGJA8gh+JoH/clUgQU0YvDhdfmJdwEFC6Sjgg8yEPAABsPkh2F22cl2AQbn6QdTghTQ5eAJAQyQAAQV0MSBB9gRVZ4GE1mw5JZOAmiAVi1UWcAZDrDyZXYTeaOhA/bIVuIBPtKQ4h7ViYekUPdcEAEbzTzCRp5CADmAAwj+ORGPBcgwAAHo9ABGCYtm0ChwFHShlRiXhmHlkAcCiOeUodqQw5W0oXLAiamy4MOkjOyAaqxUymApDCEAADs=", -} -colors = ["#FF7B39", "#80F121"] -emphColors = ["#DAFC33", "#F42548"] -fieldParams = { - "height": 3, - "width": 70, - "font": ("monaco", 14), - "highlightthickness": 0, - "borderwidth": 0, - "background": "white", -} -textParams = { - "bg": "#F7E0D4", - "fg": "#2321F1", - "highlightthickness": 0, - "width": 1, - "height": 10, - "font": ("verdana", 16), - "wrap": "word", -} - - -class Zone: - def __init__(self, image, initialField, initialText): - frm = Frame(root) - frm.config(background="white") - self.image = PhotoImage(format="gif", data=images[image.upper()]) - self.imageDimmed = PhotoImage(format="gif", data=images[image]) - self.img = Label(frm) - self.img.config(borderwidth=0) - self.img.pack(side="left") - self.fld = Text(frm, **fieldParams) - self.initScrollText(frm, self.fld, initialField) - frm = Frame(root) - self.txt = Text(frm, **textParams) - self.initScrollText(frm, self.txt, initialText) - for i in range(2): - self.txt.tag_config(colors[i], background=colors[i]) - self.txt.tag_config("emph" + colors[i], foreground=emphColors[i]) - - def initScrollText(self, frm, txt, contents): - scl = Scrollbar(frm) - scl.config(command=txt.yview) - scl.pack(side="right", fill="y") - txt.pack(side="left", expand=True, fill="x") - txt.config(yscrollcommand=scl.set) - txt.insert("1.0", contents) - frm.pack(fill="x") - Frame(height=2, bd=1, relief="ridge").pack(fill="x") - - def refresh(self): - self.colorCycle = itertools.cycle(colors) - try: - self.substitute() - self.img.config(image=self.image) - except re.error: - self.img.config(image=self.imageDimmed) - - -class FindZone(Zone): - def addTags(self, m): - color = next(self.colorCycle) - self.txt.tag_add(color, "1.0+%sc" % m.start(), "1.0+%sc" % m.end()) - try: - self.txt.tag_add( - "emph" + color, "1.0+%sc" % m.start("emph"), "1.0+%sc" % m.end("emph") - ) - except: - pass - - def substitute(self, *args): - for color in colors: - self.txt.tag_remove(color, "1.0", "end") - self.txt.tag_remove("emph" + color, "1.0", "end") - self.rex = re.compile("") # default value in case of malformed regexp - self.rex = re.compile(self.fld.get("1.0", "end")[:-1], re.MULTILINE) - try: - re.compile("(?P%s)" % self.fld.get(SEL_FIRST, SEL_LAST)) - self.rexSel = re.compile( - "%s(?P%s)%s" - % ( - self.fld.get("1.0", SEL_FIRST), - self.fld.get(SEL_FIRST, SEL_LAST), - self.fld.get(SEL_LAST, "end")[:-1], - ), - 
re.MULTILINE, - ) - except: - self.rexSel = self.rex - self.rexSel.sub(self.addTags, self.txt.get("1.0", "end")) - - -class ReplaceZone(Zone): - def addTags(self, m): - s = sz.rex.sub(self.repl, m.group()) - self.txt.delete( - "1.0+%sc" % (m.start() + self.diff), "1.0+%sc" % (m.end() + self.diff) - ) - self.txt.insert("1.0+%sc" % (m.start() + self.diff), s, next(self.colorCycle)) - self.diff += len(s) - (m.end() - m.start()) - - def substitute(self): - self.txt.delete("1.0", "end") - self.txt.insert("1.0", sz.txt.get("1.0", "end")[:-1]) - self.diff = 0 - self.repl = rex0.sub(r"\\g<\1>", self.fld.get("1.0", "end")[:-1]) - sz.rex.sub(self.addTags, sz.txt.get("1.0", "end")[:-1]) - - -def launchRefresh(_): - sz.fld.after_idle(sz.refresh) - rz.fld.after_idle(rz.refresh) - - -def app(): - global root, sz, rz, rex0 - root = Tk() - root.resizable(height=False, width=True) - root.title(windowTitle) - root.minsize(width=250, height=0) - sz = FindZone("find", initialFind, initialText) - sz.fld.bind("", launchRefresh) - sz.fld.bind("", launchRefresh) - sz.fld.bind("", launchRefresh) - sz.rexSel = re.compile("") - rz = ReplaceZone("repl", initialRepl, "") - rex0 = re.compile(r"(?", launchRefresh) - launchRefresh(None) - root.mainloop() - - -if __name__ == "__main__": - app() - -__all__ = ["app"] diff --git a/pipeline/nltk/app/rdparser_app.py b/pipeline/nltk/app/rdparser_app.py deleted file mode 100644 index 16de5a442659171763da4b4d19e9f56ef9db6277..0000000000000000000000000000000000000000 --- a/pipeline/nltk/app/rdparser_app.py +++ /dev/null @@ -1,1052 +0,0 @@ -# Natural Language Toolkit: Recursive Descent Parser Application -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# URL: -# For license information, see LICENSE.TXT - -""" -A graphical tool for exploring the recursive descent parser. - -The recursive descent parser maintains a tree, which records the -structure of the portion of the text that has been parsed. It uses -CFG productions to expand the fringe of the tree, and matches its -leaves against the text. Initially, the tree contains the start -symbol ("S"). It is shown in the main canvas, to the right of the -list of available expansions. - -The parser builds up a tree structure for the text using three -operations: - - - "expand" uses a CFG production to add children to a node on the - fringe of the tree. - - "match" compares a leaf in the tree to a text token. - - "backtrack" returns the tree to its state before the most recent - expand or match operation. - -The parser maintains a list of tree locations called a "frontier" to -remember which nodes have not yet been expanded and which leaves have -not yet been matched against the text. The leftmost frontier node is -shown in green, and the other frontier nodes are shown in blue. The -parser always performs expand and match operations on the leftmost -element of the frontier. - -You can control the parser's operation by using the "expand," "match," -and "backtrack" buttons; or you can use the "step" button to let the -parser automatically decide which operation to apply. The parser uses -the following rules to decide which operation to apply: - - - If the leftmost frontier element is a token, try matching it. - - If the leftmost frontier element is a node, try expanding it with - the first untried expansion. - - Otherwise, backtrack. - -The "expand" button applies the untried expansion whose CFG production -is listed earliest in the grammar. 
To manually choose which expansion -to apply, click on a CFG production from the list of available -expansions, on the left side of the main window. - -The "autostep" button will let the parser continue applying -applications to the tree until it reaches a complete parse. You can -cancel an autostep in progress at any time by clicking on the -"autostep" button again. - -Keyboard Shortcuts:: - [Space]\t Perform the next expand, match, or backtrack operation - [a]\t Step through operations until the next complete parse - [e]\t Perform an expand operation - [m]\t Perform a match operation - [b]\t Perform a backtrack operation - [Delete]\t Reset the parser - [g]\t Show/hide available expansions list - [h]\t Help - [Ctrl-p]\t Print - [q]\t Quit -""" - -from tkinter import Button, Frame, IntVar, Label, Listbox, Menu, Scrollbar, Tk -from tkinter.font import Font - -from nltk.draw import CFGEditor, TreeSegmentWidget, tree_to_treesegment -from nltk.draw.util import CanvasFrame, EntryDialog, ShowText, TextWidget -from nltk.parse import SteppingRecursiveDescentParser -from nltk.tree import Tree -from nltk.util import in_idle - - -class RecursiveDescentApp: - """ - A graphical tool for exploring the recursive descent parser. The tool - displays the parser's tree and the remaining text, and allows the - user to control the parser's operation. In particular, the user - can expand subtrees on the frontier, match tokens on the frontier - against the text, and backtrack. A "step" button simply steps - through the parsing process, performing the operations that - ``RecursiveDescentParser`` would use. - """ - - def __init__(self, grammar, sent, trace=0): - self._sent = sent - self._parser = SteppingRecursiveDescentParser(grammar, trace) - - # Set up the main window. - self._top = Tk() - self._top.title("Recursive Descent Parser Application") - - # Set up key bindings. - self._init_bindings() - - # Initialize the fonts. - self._init_fonts(self._top) - - # Animations. animating_lock is a lock to prevent the demo - # from performing new operations while it's animating. - self._animation_frames = IntVar(self._top) - self._animation_frames.set(5) - self._animating_lock = 0 - self._autostep = 0 - - # The user can hide the grammar. - self._show_grammar = IntVar(self._top) - self._show_grammar.set(1) - - # Create the basic frames. - self._init_menubar(self._top) - self._init_buttons(self._top) - self._init_feedback(self._top) - self._init_grammar(self._top) - self._init_canvas(self._top) - - # Initialize the parser. - self._parser.initialize(self._sent) - - # Resize callback - self._canvas.bind("", self._configure) - - ######################################### - ## Initialization Helpers - ######################################### - - def _init_fonts(self, root): - # See: - self._sysfont = Font(font=Button()["font"]) - root.option_add("*Font", self._sysfont) - - # TWhat's our font size (default=same as sysfont) - self._size = IntVar(root) - self._size.set(self._sysfont.cget("size")) - - self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get()) - self._font = Font(family="helvetica", size=self._size.get()) - if self._size.get() < 0: - big = self._size.get() - 2 - else: - big = self._size.get() + 2 - self._bigfont = Font(family="helvetica", weight="bold", size=big) - - def _init_grammar(self, parent): - # Grammar view. 
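The module docstring above describes the expand/match/backtrack strategy that this GUI animates. A minimal non-GUI sketch of the same strategy, assuming NLTK's plain nltk.parse.RecursiveDescentParser (the non-stepping counterpart of the SteppingRecursiveDescentParser wrapped by this app) and the toy grammar from this module's own app() demo:

from nltk.grammar import CFG
from nltk.parse import RecursiveDescentParser

# Toy grammar mirroring the one built in app() at the bottom of this module.
grammar = CFG.fromstring(
    """
    S -> NP VP
    NP -> Det N PP | Det N | 'I'
    VP -> V NP PP | V NP | V
    PP -> P NP
    Det -> 'the' | 'a'
    N -> 'man' | 'park' | 'dog' | 'telescope'
    V -> 'ate' | 'saw'
    P -> 'in' | 'under' | 'with'
    """
)

parser = RecursiveDescentParser(grammar)
# Expect two parses: "in the park" can attach to the NP or to the VP.
for tree in parser.parse("the dog saw a man in the park".split()):
    print(tree)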
- self._prodframe = listframe = Frame(parent) - self._prodframe.pack(fill="both", side="left", padx=2) - self._prodlist_label = Label( - self._prodframe, font=self._boldfont, text="Available Expansions" - ) - self._prodlist_label.pack() - self._prodlist = Listbox( - self._prodframe, - selectmode="single", - relief="groove", - background="white", - foreground="#909090", - font=self._font, - selectforeground="#004040", - selectbackground="#c0f0c0", - ) - - self._prodlist.pack(side="right", fill="both", expand=1) - - self._productions = list(self._parser.grammar().productions()) - for production in self._productions: - self._prodlist.insert("end", (" %s" % production)) - self._prodlist.config(height=min(len(self._productions), 25)) - - # Add a scrollbar if there are more than 25 productions. - if len(self._productions) > 25: - listscroll = Scrollbar(self._prodframe, orient="vertical") - self._prodlist.config(yscrollcommand=listscroll.set) - listscroll.config(command=self._prodlist.yview) - listscroll.pack(side="left", fill="y") - - # If they select a production, apply it. - self._prodlist.bind("<>", self._prodlist_select) - - def _init_bindings(self): - # Key bindings are a good thing. - self._top.bind("", self.destroy) - self._top.bind("", self.destroy) - self._top.bind("", self.destroy) - self._top.bind("e", self.expand) - # self._top.bind('', self.expand) - # self._top.bind('', self.expand) - self._top.bind("m", self.match) - self._top.bind("", self.match) - self._top.bind("", self.match) - self._top.bind("b", self.backtrack) - self._top.bind("", self.backtrack) - self._top.bind("", self.backtrack) - self._top.bind("", self.backtrack) - self._top.bind("", self.backtrack) - self._top.bind("a", self.autostep) - # self._top.bind('', self.autostep) - self._top.bind("", self.autostep) - self._top.bind("", self.cancel_autostep) - self._top.bind("", self.step) - self._top.bind("", self.reset) - self._top.bind("", self.postscript) - # self._top.bind('', self.help) - # self._top.bind('', self.help) - self._top.bind("", self.help) - self._top.bind("", self.help) - # self._top.bind('', self.toggle_grammar) - # self._top.bind('', self.toggle_grammar) - # self._top.bind('', self.toggle_grammar) - self._top.bind("", self.edit_grammar) - self._top.bind("", self.edit_sentence) - - def _init_buttons(self, parent): - # Set up the frames. - self._buttonframe = buttonframe = Frame(parent) - buttonframe.pack(fill="none", side="bottom", padx=3, pady=2) - Button( - buttonframe, - text="Step", - background="#90c0d0", - foreground="black", - command=self.step, - ).pack(side="left") - Button( - buttonframe, - text="Autostep", - background="#90c0d0", - foreground="black", - command=self.autostep, - ).pack(side="left") - Button( - buttonframe, - text="Expand", - underline=0, - background="#90f090", - foreground="black", - command=self.expand, - ).pack(side="left") - Button( - buttonframe, - text="Match", - underline=0, - background="#90f090", - foreground="black", - command=self.match, - ).pack(side="left") - Button( - buttonframe, - text="Backtrack", - underline=0, - background="#f0a0a0", - foreground="black", - command=self.backtrack, - ).pack(side="left") - # Replace autostep... 
- - # self._autostep_button = Button(buttonframe, text='Autostep', - # underline=0, command=self.autostep) - # self._autostep_button.pack(side='left') - - def _configure(self, event): - self._autostep = 0 - (x1, y1, x2, y2) = self._cframe.scrollregion() - y2 = event.height - 6 - self._canvas["scrollregion"] = "%d %d %d %d" % (x1, y1, x2, y2) - self._redraw() - - def _init_feedback(self, parent): - self._feedbackframe = feedbackframe = Frame(parent) - feedbackframe.pack(fill="x", side="bottom", padx=3, pady=3) - self._lastoper_label = Label( - feedbackframe, text="Last Operation:", font=self._font - ) - self._lastoper_label.pack(side="left") - lastoperframe = Frame(feedbackframe, relief="sunken", border=1) - lastoperframe.pack(fill="x", side="right", expand=1, padx=5) - self._lastoper1 = Label( - lastoperframe, foreground="#007070", background="#f0f0f0", font=self._font - ) - self._lastoper2 = Label( - lastoperframe, - anchor="w", - width=30, - foreground="#004040", - background="#f0f0f0", - font=self._font, - ) - self._lastoper1.pack(side="left") - self._lastoper2.pack(side="left", fill="x", expand=1) - - def _init_canvas(self, parent): - self._cframe = CanvasFrame( - parent, - background="white", - # width=525, height=250, - closeenough=10, - border=2, - relief="sunken", - ) - self._cframe.pack(expand=1, fill="both", side="top", pady=2) - canvas = self._canvas = self._cframe.canvas() - - # Initially, there's no tree or text - self._tree = None - self._textwidgets = [] - self._textline = None - - def _init_menubar(self, parent): - menubar = Menu(parent) - - filemenu = Menu(menubar, tearoff=0) - filemenu.add_command( - label="Reset Parser", underline=0, command=self.reset, accelerator="Del" - ) - filemenu.add_command( - label="Print to Postscript", - underline=0, - command=self.postscript, - accelerator="Ctrl-p", - ) - filemenu.add_command( - label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x" - ) - menubar.add_cascade(label="File", underline=0, menu=filemenu) - - editmenu = Menu(menubar, tearoff=0) - editmenu.add_command( - label="Edit Grammar", - underline=5, - command=self.edit_grammar, - accelerator="Ctrl-g", - ) - editmenu.add_command( - label="Edit Text", - underline=5, - command=self.edit_sentence, - accelerator="Ctrl-t", - ) - menubar.add_cascade(label="Edit", underline=0, menu=editmenu) - - rulemenu = Menu(menubar, tearoff=0) - rulemenu.add_command( - label="Step", underline=1, command=self.step, accelerator="Space" - ) - rulemenu.add_separator() - rulemenu.add_command( - label="Match", underline=0, command=self.match, accelerator="Ctrl-m" - ) - rulemenu.add_command( - label="Expand", underline=0, command=self.expand, accelerator="Ctrl-e" - ) - rulemenu.add_separator() - rulemenu.add_command( - label="Backtrack", underline=0, command=self.backtrack, accelerator="Ctrl-b" - ) - menubar.add_cascade(label="Apply", underline=0, menu=rulemenu) - - viewmenu = Menu(menubar, tearoff=0) - viewmenu.add_checkbutton( - label="Show Grammar", - underline=0, - variable=self._show_grammar, - command=self._toggle_grammar, - ) - viewmenu.add_separator() - viewmenu.add_radiobutton( - label="Tiny", - variable=self._size, - underline=0, - value=10, - command=self.resize, - ) - viewmenu.add_radiobutton( - label="Small", - variable=self._size, - underline=0, - value=12, - command=self.resize, - ) - viewmenu.add_radiobutton( - label="Medium", - variable=self._size, - underline=0, - value=14, - command=self.resize, - ) - viewmenu.add_radiobutton( - label="Large", - variable=self._size, - 
underline=0, - value=18, - command=self.resize, - ) - viewmenu.add_radiobutton( - label="Huge", - variable=self._size, - underline=0, - value=24, - command=self.resize, - ) - menubar.add_cascade(label="View", underline=0, menu=viewmenu) - - animatemenu = Menu(menubar, tearoff=0) - animatemenu.add_radiobutton( - label="No Animation", underline=0, variable=self._animation_frames, value=0 - ) - animatemenu.add_radiobutton( - label="Slow Animation", - underline=0, - variable=self._animation_frames, - value=10, - accelerator="-", - ) - animatemenu.add_radiobutton( - label="Normal Animation", - underline=0, - variable=self._animation_frames, - value=5, - accelerator="=", - ) - animatemenu.add_radiobutton( - label="Fast Animation", - underline=0, - variable=self._animation_frames, - value=2, - accelerator="+", - ) - menubar.add_cascade(label="Animate", underline=1, menu=animatemenu) - - helpmenu = Menu(menubar, tearoff=0) - helpmenu.add_command(label="About", underline=0, command=self.about) - helpmenu.add_command( - label="Instructions", underline=0, command=self.help, accelerator="F1" - ) - menubar.add_cascade(label="Help", underline=0, menu=helpmenu) - - parent.config(menu=menubar) - - ######################################### - ## Helper - ######################################### - - def _get(self, widget, treeloc): - for i in treeloc: - widget = widget.subtrees()[i] - if isinstance(widget, TreeSegmentWidget): - widget = widget.label() - return widget - - ######################################### - ## Main draw procedure - ######################################### - - def _redraw(self): - canvas = self._canvas - - # Delete the old tree, widgets, etc. - if self._tree is not None: - self._cframe.destroy_widget(self._tree) - for twidget in self._textwidgets: - self._cframe.destroy_widget(twidget) - if self._textline is not None: - self._canvas.delete(self._textline) - - # Draw the tree. - helv = ("helvetica", -self._size.get()) - bold = ("helvetica", -self._size.get(), "bold") - attribs = { - "tree_color": "#000000", - "tree_width": 2, - "node_font": bold, - "leaf_font": helv, - } - tree = self._parser.tree() - self._tree = tree_to_treesegment(canvas, tree, **attribs) - self._cframe.add_widget(self._tree, 30, 5) - - # Draw the text. - helv = ("helvetica", -self._size.get()) - bottom = y = self._cframe.scrollregion()[3] - self._textwidgets = [ - TextWidget(canvas, word, font=self._font) for word in self._sent - ] - for twidget in self._textwidgets: - self._cframe.add_widget(twidget, 0, 0) - twidget.move(0, bottom - twidget.bbox()[3] - 5) - y = min(y, twidget.bbox()[1]) - - # Draw a line over the text, to separate it from the tree. - self._textline = canvas.create_line(-5000, y - 5, 5000, y - 5, dash=".") - - # Highlight appropriate nodes. - self._highlight_nodes() - self._highlight_prodlist() - - # Make sure the text lines up. - self._position_text() - - def _redraw_quick(self): - # This should be more-or-less sufficient after an animation. - self._highlight_nodes() - self._highlight_prodlist() - self._position_text() - - def _highlight_nodes(self): - # Highlight the list of nodes to be checked. - bold = ("helvetica", -self._size.get(), "bold") - for treeloc in self._parser.frontier()[:1]: - self._get(self._tree, treeloc)["color"] = "#20a050" - self._get(self._tree, treeloc)["font"] = bold - for treeloc in self._parser.frontier()[1:]: - self._get(self._tree, treeloc)["color"] = "#008080" - - def _highlight_prodlist(self): - # Highlight the productions that can be expanded. 
- # Boy, too bad tkinter doesn't implement Listbox.itemconfig; - # that would be pretty useful here. - self._prodlist.delete(0, "end") - expandable = self._parser.expandable_productions() - untried = self._parser.untried_expandable_productions() - productions = self._productions - for index in range(len(productions)): - if productions[index] in expandable: - if productions[index] in untried: - self._prodlist.insert(index, " %s" % productions[index]) - else: - self._prodlist.insert(index, " %s (TRIED)" % productions[index]) - self._prodlist.selection_set(index) - else: - self._prodlist.insert(index, " %s" % productions[index]) - - def _position_text(self): - # Line up the text widgets that are matched against the tree - numwords = len(self._sent) - num_matched = numwords - len(self._parser.remaining_text()) - leaves = self._tree_leaves()[:num_matched] - xmax = self._tree.bbox()[0] - for i in range(0, len(leaves)): - widget = self._textwidgets[i] - leaf = leaves[i] - widget["color"] = "#006040" - leaf["color"] = "#006040" - widget.move(leaf.bbox()[0] - widget.bbox()[0], 0) - xmax = widget.bbox()[2] + 10 - - # Line up the text widgets that are not matched against the tree. - for i in range(len(leaves), numwords): - widget = self._textwidgets[i] - widget["color"] = "#a0a0a0" - widget.move(xmax - widget.bbox()[0], 0) - xmax = widget.bbox()[2] + 10 - - # If we have a complete parse, make everything green :) - if self._parser.currently_complete(): - for twidget in self._textwidgets: - twidget["color"] = "#00a000" - - # Move the matched leaves down to the text. - for i in range(0, len(leaves)): - widget = self._textwidgets[i] - leaf = leaves[i] - dy = widget.bbox()[1] - leaf.bbox()[3] - 10.0 - dy = max(dy, leaf.parent().label().bbox()[3] - leaf.bbox()[3] + 10) - leaf.move(0, dy) - - def _tree_leaves(self, tree=None): - if tree is None: - tree = self._tree - if isinstance(tree, TreeSegmentWidget): - leaves = [] - for child in tree.subtrees(): - leaves += self._tree_leaves(child) - return leaves - else: - return [tree] - - ######################################### - ## Button Callbacks - ######################################### - - def destroy(self, *e): - self._autostep = 0 - if self._top is None: - return - self._top.destroy() - self._top = None - - def reset(self, *e): - self._autostep = 0 - self._parser.initialize(self._sent) - self._lastoper1["text"] = "Reset Application" - self._lastoper2["text"] = "" - self._redraw() - - def autostep(self, *e): - if self._animation_frames.get() == 0: - self._animation_frames.set(2) - if self._autostep: - self._autostep = 0 - else: - self._autostep = 1 - self._step() - - def cancel_autostep(self, *e): - # self._autostep_button['text'] = 'Autostep' - self._autostep = 0 - - # Make sure to stop auto-stepping if we get any user input. - def step(self, *e): - self._autostep = 0 - self._step() - - def match(self, *e): - self._autostep = 0 - self._match() - - def expand(self, *e): - self._autostep = 0 - self._expand() - - def backtrack(self, *e): - self._autostep = 0 - self._backtrack() - - def _step(self): - if self._animating_lock: - return - - # Try expanding, matching, and backtracking (in that order) - if self._expand(): - pass - elif self._parser.untried_match() and self._match(): - pass - elif self._backtrack(): - pass - else: - self._lastoper1["text"] = "Finished" - self._lastoper2["text"] = "" - self._autostep = 0 - - # Check if we just completed a parse. 
- if self._parser.currently_complete(): - self._autostep = 0 - self._lastoper2["text"] += " [COMPLETE PARSE]" - - def _expand(self, *e): - if self._animating_lock: - return - old_frontier = self._parser.frontier() - rv = self._parser.expand() - if rv is not None: - self._lastoper1["text"] = "Expand:" - self._lastoper2["text"] = rv - self._prodlist.selection_clear(0, "end") - index = self._productions.index(rv) - self._prodlist.selection_set(index) - self._animate_expand(old_frontier[0]) - return True - else: - self._lastoper1["text"] = "Expand:" - self._lastoper2["text"] = "(all expansions tried)" - return False - - def _match(self, *e): - if self._animating_lock: - return - old_frontier = self._parser.frontier() - rv = self._parser.match() - if rv is not None: - self._lastoper1["text"] = "Match:" - self._lastoper2["text"] = rv - self._animate_match(old_frontier[0]) - return True - else: - self._lastoper1["text"] = "Match:" - self._lastoper2["text"] = "(failed)" - return False - - def _backtrack(self, *e): - if self._animating_lock: - return - if self._parser.backtrack(): - elt = self._parser.tree() - for i in self._parser.frontier()[0]: - elt = elt[i] - self._lastoper1["text"] = "Backtrack" - self._lastoper2["text"] = "" - if isinstance(elt, Tree): - self._animate_backtrack(self._parser.frontier()[0]) - else: - self._animate_match_backtrack(self._parser.frontier()[0]) - return True - else: - self._autostep = 0 - self._lastoper1["text"] = "Finished" - self._lastoper2["text"] = "" - return False - - def about(self, *e): - ABOUT = ( - "NLTK Recursive Descent Parser Application\n" + "Written by Edward Loper" - ) - TITLE = "About: Recursive Descent Parser Application" - try: - from tkinter.messagebox import Message - - Message(message=ABOUT, title=TITLE).show() - except: - ShowText(self._top, TITLE, ABOUT) - - def help(self, *e): - self._autostep = 0 - # The default font's not very legible; try using 'fixed' instead. - try: - ShowText( - self._top, - "Help: Recursive Descent Parser Application", - (__doc__ or "").strip(), - width=75, - font="fixed", - ) - except: - ShowText( - self._top, - "Help: Recursive Descent Parser Application", - (__doc__ or "").strip(), - width=75, - ) - - def postscript(self, *e): - self._autostep = 0 - self._cframe.print_to_file() - - def mainloop(self, *args, **kwargs): - """ - Enter the Tkinter mainloop. This function must be called if - this demo is created from a non-interactive program (e.g. - from a secript); otherwise, the demo will close as soon as - the script completes. 
- """ - if in_idle(): - return - self._top.mainloop(*args, **kwargs) - - def resize(self, size=None): - if size is not None: - self._size.set(size) - size = self._size.get() - self._font.configure(size=-(abs(size))) - self._boldfont.configure(size=-(abs(size))) - self._sysfont.configure(size=-(abs(size))) - self._bigfont.configure(size=-(abs(size + 2))) - self._redraw() - - ######################################### - ## Expand Production Selection - ######################################### - - def _toggle_grammar(self, *e): - if self._show_grammar.get(): - self._prodframe.pack( - fill="both", side="left", padx=2, after=self._feedbackframe - ) - self._lastoper1["text"] = "Show Grammar" - else: - self._prodframe.pack_forget() - self._lastoper1["text"] = "Hide Grammar" - self._lastoper2["text"] = "" - - # def toggle_grammar(self, *e): - # self._show_grammar = not self._show_grammar - # if self._show_grammar: - # self._prodframe.pack(fill='both', expand='y', side='left', - # after=self._feedbackframe) - # self._lastoper1['text'] = 'Show Grammar' - # else: - # self._prodframe.pack_forget() - # self._lastoper1['text'] = 'Hide Grammar' - # self._lastoper2['text'] = '' - - def _prodlist_select(self, event): - selection = self._prodlist.curselection() - if len(selection) != 1: - return - index = int(selection[0]) - old_frontier = self._parser.frontier() - production = self._parser.expand(self._productions[index]) - - if production: - self._lastoper1["text"] = "Expand:" - self._lastoper2["text"] = production - self._prodlist.selection_clear(0, "end") - self._prodlist.selection_set(index) - self._animate_expand(old_frontier[0]) - else: - # Reset the production selections. - self._prodlist.selection_clear(0, "end") - for prod in self._parser.expandable_productions(): - index = self._productions.index(prod) - self._prodlist.selection_set(index) - - ######################################### - ## Animation - ######################################### - - def _animate_expand(self, treeloc): - oldwidget = self._get(self._tree, treeloc) - oldtree = oldwidget.parent() - top = not isinstance(oldtree.parent(), TreeSegmentWidget) - - tree = self._parser.tree() - for i in treeloc: - tree = tree[i] - - widget = tree_to_treesegment( - self._canvas, - tree, - node_font=self._boldfont, - leaf_color="white", - tree_width=2, - tree_color="white", - node_color="white", - leaf_font=self._font, - ) - widget.label()["color"] = "#20a050" - - (oldx, oldy) = oldtree.label().bbox()[:2] - (newx, newy) = widget.label().bbox()[:2] - widget.move(oldx - newx, oldy - newy) - - if top: - self._cframe.add_widget(widget, 0, 5) - widget.move(30 - widget.label().bbox()[0], 0) - self._tree = widget - else: - oldtree.parent().replace_child(oldtree, widget) - - # Move the children over so they don't overlap. - # Line the children up in a strange way. - if widget.subtrees(): - dx = ( - oldx - + widget.label().width() / 2 - - widget.subtrees()[0].bbox()[0] / 2 - - widget.subtrees()[0].bbox()[2] / 2 - ) - for subtree in widget.subtrees(): - subtree.move(dx, 0) - - self._makeroom(widget) - - if top: - self._cframe.destroy_widget(oldtree) - else: - oldtree.destroy() - - colors = [ - "gray%d" % (10 * int(10 * x / self._animation_frames.get())) - for x in range(self._animation_frames.get(), 0, -1) - ] - - # Move the text string down, if necessary. 
- dy = widget.bbox()[3] + 30 - self._canvas.coords(self._textline)[1] - if dy > 0: - for twidget in self._textwidgets: - twidget.move(0, dy) - self._canvas.move(self._textline, 0, dy) - - self._animate_expand_frame(widget, colors) - - def _makeroom(self, treeseg): - """ - Make sure that no sibling tree bbox's overlap. - """ - parent = treeseg.parent() - if not isinstance(parent, TreeSegmentWidget): - return - - index = parent.subtrees().index(treeseg) - - # Handle siblings to the right - rsiblings = parent.subtrees()[index + 1 :] - if rsiblings: - dx = treeseg.bbox()[2] - rsiblings[0].bbox()[0] + 10 - for sibling in rsiblings: - sibling.move(dx, 0) - - # Handle siblings to the left - if index > 0: - lsibling = parent.subtrees()[index - 1] - dx = max(0, lsibling.bbox()[2] - treeseg.bbox()[0] + 10) - treeseg.move(dx, 0) - - # Keep working up the tree. - self._makeroom(parent) - - def _animate_expand_frame(self, widget, colors): - if len(colors) > 0: - self._animating_lock = 1 - widget["color"] = colors[0] - for subtree in widget.subtrees(): - if isinstance(subtree, TreeSegmentWidget): - subtree.label()["color"] = colors[0] - else: - subtree["color"] = colors[0] - self._top.after(50, self._animate_expand_frame, widget, colors[1:]) - else: - widget["color"] = "black" - for subtree in widget.subtrees(): - if isinstance(subtree, TreeSegmentWidget): - subtree.label()["color"] = "black" - else: - subtree["color"] = "black" - self._redraw_quick() - widget.label()["color"] = "black" - self._animating_lock = 0 - if self._autostep: - self._step() - - def _animate_backtrack(self, treeloc): - # Flash red first, if we're animating. - if self._animation_frames.get() == 0: - colors = [] - else: - colors = ["#a00000", "#000000", "#a00000"] - colors += [ - "gray%d" % (10 * int(10 * x / (self._animation_frames.get()))) - for x in range(1, self._animation_frames.get() + 1) - ] - - widgets = [self._get(self._tree, treeloc).parent()] - for subtree in widgets[0].subtrees(): - if isinstance(subtree, TreeSegmentWidget): - widgets.append(subtree.label()) - else: - widgets.append(subtree) - - self._animate_backtrack_frame(widgets, colors) - - def _animate_backtrack_frame(self, widgets, colors): - if len(colors) > 0: - self._animating_lock = 1 - for widget in widgets: - widget["color"] = colors[0] - self._top.after(50, self._animate_backtrack_frame, widgets, colors[1:]) - else: - for widget in widgets[0].subtrees(): - widgets[0].remove_child(widget) - widget.destroy() - self._redraw_quick() - self._animating_lock = 0 - if self._autostep: - self._step() - - def _animate_match_backtrack(self, treeloc): - widget = self._get(self._tree, treeloc) - node = widget.parent().label() - dy = (node.bbox()[3] - widget.bbox()[1] + 14) / max( - 1, self._animation_frames.get() - ) - self._animate_match_backtrack_frame(self._animation_frames.get(), widget, dy) - - def _animate_match(self, treeloc): - widget = self._get(self._tree, treeloc) - - dy = (self._textwidgets[0].bbox()[1] - widget.bbox()[3] - 10.0) / max( - 1, self._animation_frames.get() - ) - self._animate_match_frame(self._animation_frames.get(), widget, dy) - - def _animate_match_frame(self, frame, widget, dy): - if frame > 0: - self._animating_lock = 1 - widget.move(0, dy) - self._top.after(10, self._animate_match_frame, frame - 1, widget, dy) - else: - widget["color"] = "#006040" - self._redraw_quick() - self._animating_lock = 0 - if self._autostep: - self._step() - - def _animate_match_backtrack_frame(self, frame, widget, dy): - if frame > 0: - self._animating_lock = 1 
- widget.move(0, dy) - self._top.after( - 10, self._animate_match_backtrack_frame, frame - 1, widget, dy - ) - else: - widget.parent().remove_child(widget) - widget.destroy() - self._animating_lock = 0 - if self._autostep: - self._step() - - def edit_grammar(self, *e): - CFGEditor(self._top, self._parser.grammar(), self.set_grammar) - - def set_grammar(self, grammar): - self._parser.set_grammar(grammar) - self._productions = list(grammar.productions()) - self._prodlist.delete(0, "end") - for production in self._productions: - self._prodlist.insert("end", (" %s" % production)) - - def edit_sentence(self, *e): - sentence = " ".join(self._sent) - title = "Edit Text" - instr = "Enter a new sentence to parse." - EntryDialog(self._top, sentence, instr, self.set_sentence, title) - - def set_sentence(self, sentence): - self._sent = sentence.split() # [XX] use tagged? - self.reset() - - -def app(): - """ - Create a recursive descent parser demo, using a simple grammar and - text. - """ - from nltk.grammar import CFG - - grammar = CFG.fromstring( - """ - # Grammatical productions. - S -> NP VP - NP -> Det N PP | Det N - VP -> V NP PP | V NP | V - PP -> P NP - # Lexical productions. - NP -> 'I' - Det -> 'the' | 'a' - N -> 'man' | 'park' | 'dog' | 'telescope' - V -> 'ate' | 'saw' - P -> 'in' | 'under' | 'with' - """ - ) - - sent = "the dog saw a man in the park".split() - - RecursiveDescentApp(grammar, sent).mainloop() - - -if __name__ == "__main__": - app() - -__all__ = ["app"] diff --git a/pipeline/nltk/app/srparser_app.py b/pipeline/nltk/app/srparser_app.py deleted file mode 100644 index cca5cb2de2149cc573b6d471cd5fef2a57cbbb7d..0000000000000000000000000000000000000000 --- a/pipeline/nltk/app/srparser_app.py +++ /dev/null @@ -1,937 +0,0 @@ -# Natural Language Toolkit: Shift-Reduce Parser Application -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# URL: -# For license information, see LICENSE.TXT - -""" -A graphical tool for exploring the shift-reduce parser. - -The shift-reduce parser maintains a stack, which records the structure -of the portion of the text that has been parsed. The stack is -initially empty. Its contents are shown on the left side of the main -canvas. - -On the right side of the main canvas is the remaining text. This is -the portion of the text which has not yet been considered by the -parser. - -The parser builds up a tree structure for the text using two -operations: - - - "shift" moves the first token from the remaining text to the top - of the stack. In the demo, the top of the stack is its right-hand - side. - - "reduce" uses a grammar production to combine the rightmost stack - elements into a single tree token. - -You can control the parser's operation by using the "shift" and -"reduce" buttons; or you can use the "step" button to let the parser -automatically decide which operation to apply. The parser uses the -following rules to decide which operation to apply: - - - Only shift if no reductions are available. - - If multiple reductions are available, then apply the reduction - whose CFG production is listed earliest in the grammar. - -The "reduce" button applies the reduction whose CFG production is -listed earliest in the grammar. There are two ways to manually choose -which reduction to apply: - - - Click on a CFG production from the list of available reductions, - on the left side of the main window. The reduction based on that - production will be applied to the top of the stack. - - Click on one of the stack elements. 
A popup window will appear, - containing all available reductions. Select one, and it will be - applied to the top of the stack. - -Note that reductions can only be applied to the top of the stack. - -Keyboard Shortcuts:: - [Space]\t Perform the next shift or reduce operation - [s]\t Perform a shift operation - [r]\t Perform a reduction operation - [Ctrl-z]\t Undo most recent operation - [Delete]\t Reset the parser - [g]\t Show/hide available production list - [Ctrl-a]\t Toggle animations - [h]\t Help - [Ctrl-p]\t Print - [q]\t Quit - -""" - -from tkinter import Button, Frame, IntVar, Label, Listbox, Menu, Scrollbar, Tk -from tkinter.font import Font - -from nltk.draw import CFGEditor, TreeSegmentWidget, tree_to_treesegment -from nltk.draw.util import CanvasFrame, EntryDialog, ShowText, TextWidget -from nltk.parse import SteppingShiftReduceParser -from nltk.tree import Tree -from nltk.util import in_idle - -""" -Possible future improvements: - - button/window to change and/or select text. Just pop up a window - with an entry, and let them modify the text; and then retokenize - it? Maybe give a warning if it contains tokens whose types are - not in the grammar. - - button/window to change and/or select grammar. Select from - several alternative grammars? Or actually change the grammar? If - the later, then I'd want to define nltk.draw.cfg, which would be - responsible for that. -""" - - -class ShiftReduceApp: - """ - A graphical tool for exploring the shift-reduce parser. The tool - displays the parser's stack and the remaining text, and allows the - user to control the parser's operation. In particular, the user - can shift tokens onto the stack, and can perform reductions on the - top elements of the stack. A "step" button simply steps through - the parsing process, performing the operations that - ``nltk.parse.ShiftReduceParser`` would use. - """ - - def __init__(self, grammar, sent, trace=0): - self._sent = sent - self._parser = SteppingShiftReduceParser(grammar, trace) - - # Set up the main window. - self._top = Tk() - self._top.title("Shift Reduce Parser Application") - - # Animations. animating_lock is a lock to prevent the demo - # from performing new operations while it's animating. - self._animating_lock = 0 - self._animate = IntVar(self._top) - self._animate.set(10) # = medium - - # The user can hide the grammar. - self._show_grammar = IntVar(self._top) - self._show_grammar.set(1) - - # Initialize fonts. - self._init_fonts(self._top) - - # Set up key bindings. - self._init_bindings() - - # Create the basic frames. - self._init_menubar(self._top) - self._init_buttons(self._top) - self._init_feedback(self._top) - self._init_grammar(self._top) - self._init_canvas(self._top) - - # A popup menu for reducing. - self._reduce_menu = Menu(self._canvas, tearoff=0) - - # Reset the demo, and set the feedback frame to empty. - self.reset() - self._lastoper1["text"] = "" - - ######################################### - ## Initialization Helpers - ######################################### - - def _init_fonts(self, root): - # See: - self._sysfont = Font(font=Button()["font"]) - root.option_add("*Font", self._sysfont) - - # TWhat's our font size (default=same as sysfont) - self._size = IntVar(root) - self._size.set(self._sysfont.cget("size")) - - self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get()) - self._font = Font(family="helvetica", size=self._size.get()) - - def _init_grammar(self, parent): - # Grammar view. 
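The shift/reduce strategy described in the docstring above can likewise be run without the GUI. A minimal sketch, assuming NLTK's plain nltk.parse.ShiftReduceParser (the non-stepping counterpart of the SteppingShiftReduceParser wrapped by this app) and a small unambiguous toy grammar; note that ShiftReduceParser does not backtrack, so with ambiguous grammars it may miss a valid parse:

from nltk.grammar import CFG
from nltk.parse import ShiftReduceParser

# Small unambiguous grammar: the default strategy (shift only when no
# reduction applies; prefer the earliest-listed production) finds the parse.
grammar = CFG.fromstring(
    """
    S -> NP VP
    NP -> Det N
    VP -> V NP
    Det -> 'the'
    N -> 'dog' | 'cat'
    V -> 'saw'
    """
)

parser = ShiftReduceParser(grammar)
for tree in parser.parse("the dog saw the cat".split()):
    print(tree)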
- self._prodframe = listframe = Frame(parent) - self._prodframe.pack(fill="both", side="left", padx=2) - self._prodlist_label = Label( - self._prodframe, font=self._boldfont, text="Available Reductions" - ) - self._prodlist_label.pack() - self._prodlist = Listbox( - self._prodframe, - selectmode="single", - relief="groove", - background="white", - foreground="#909090", - font=self._font, - selectforeground="#004040", - selectbackground="#c0f0c0", - ) - - self._prodlist.pack(side="right", fill="both", expand=1) - - self._productions = list(self._parser.grammar().productions()) - for production in self._productions: - self._prodlist.insert("end", (" %s" % production)) - self._prodlist.config(height=min(len(self._productions), 25)) - - # Add a scrollbar if there are more than 25 productions. - if 1: # len(self._productions) > 25: - listscroll = Scrollbar(self._prodframe, orient="vertical") - self._prodlist.config(yscrollcommand=listscroll.set) - listscroll.config(command=self._prodlist.yview) - listscroll.pack(side="left", fill="y") - - # If they select a production, apply it. - self._prodlist.bind("<>", self._prodlist_select) - - # When they hover over a production, highlight it. - self._hover = -1 - self._prodlist.bind("", self._highlight_hover) - self._prodlist.bind("", self._clear_hover) - - def _init_bindings(self): - # Quit - self._top.bind("", self.destroy) - self._top.bind("", self.destroy) - self._top.bind("", self.destroy) - self._top.bind("", self.destroy) - - # Ops (step, shift, reduce, undo) - self._top.bind("", self.step) - self._top.bind("", self.shift) - self._top.bind("", self.shift) - self._top.bind("", self.shift) - self._top.bind("", self.reduce) - self._top.bind("", self.reduce) - self._top.bind("", self.reduce) - self._top.bind("", self.reset) - self._top.bind("", self.undo) - self._top.bind("", self.undo) - self._top.bind("", self.undo) - self._top.bind("", self.undo) - self._top.bind("", self.undo) - - # Misc - self._top.bind("", self.postscript) - self._top.bind("", self.help) - self._top.bind("", self.help) - self._top.bind("", self.edit_grammar) - self._top.bind("", self.edit_sentence) - - # Animation speed control - self._top.bind("-", lambda e, a=self._animate: a.set(20)) - self._top.bind("=", lambda e, a=self._animate: a.set(10)) - self._top.bind("+", lambda e, a=self._animate: a.set(4)) - - def _init_buttons(self, parent): - # Set up the frames. 
- self._buttonframe = buttonframe = Frame(parent) - buttonframe.pack(fill="none", side="bottom") - Button( - buttonframe, - text="Step", - background="#90c0d0", - foreground="black", - command=self.step, - ).pack(side="left") - Button( - buttonframe, - text="Shift", - underline=0, - background="#90f090", - foreground="black", - command=self.shift, - ).pack(side="left") - Button( - buttonframe, - text="Reduce", - underline=0, - background="#90f090", - foreground="black", - command=self.reduce, - ).pack(side="left") - Button( - buttonframe, - text="Undo", - underline=0, - background="#f0a0a0", - foreground="black", - command=self.undo, - ).pack(side="left") - - def _init_menubar(self, parent): - menubar = Menu(parent) - - filemenu = Menu(menubar, tearoff=0) - filemenu.add_command( - label="Reset Parser", underline=0, command=self.reset, accelerator="Del" - ) - filemenu.add_command( - label="Print to Postscript", - underline=0, - command=self.postscript, - accelerator="Ctrl-p", - ) - filemenu.add_command( - label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x" - ) - menubar.add_cascade(label="File", underline=0, menu=filemenu) - - editmenu = Menu(menubar, tearoff=0) - editmenu.add_command( - label="Edit Grammar", - underline=5, - command=self.edit_grammar, - accelerator="Ctrl-g", - ) - editmenu.add_command( - label="Edit Text", - underline=5, - command=self.edit_sentence, - accelerator="Ctrl-t", - ) - menubar.add_cascade(label="Edit", underline=0, menu=editmenu) - - rulemenu = Menu(menubar, tearoff=0) - rulemenu.add_command( - label="Step", underline=1, command=self.step, accelerator="Space" - ) - rulemenu.add_separator() - rulemenu.add_command( - label="Shift", underline=0, command=self.shift, accelerator="Ctrl-s" - ) - rulemenu.add_command( - label="Reduce", underline=0, command=self.reduce, accelerator="Ctrl-r" - ) - rulemenu.add_separator() - rulemenu.add_command( - label="Undo", underline=0, command=self.undo, accelerator="Ctrl-u" - ) - menubar.add_cascade(label="Apply", underline=0, menu=rulemenu) - - viewmenu = Menu(menubar, tearoff=0) - viewmenu.add_checkbutton( - label="Show Grammar", - underline=0, - variable=self._show_grammar, - command=self._toggle_grammar, - ) - viewmenu.add_separator() - viewmenu.add_radiobutton( - label="Tiny", - variable=self._size, - underline=0, - value=10, - command=self.resize, - ) - viewmenu.add_radiobutton( - label="Small", - variable=self._size, - underline=0, - value=12, - command=self.resize, - ) - viewmenu.add_radiobutton( - label="Medium", - variable=self._size, - underline=0, - value=14, - command=self.resize, - ) - viewmenu.add_radiobutton( - label="Large", - variable=self._size, - underline=0, - value=18, - command=self.resize, - ) - viewmenu.add_radiobutton( - label="Huge", - variable=self._size, - underline=0, - value=24, - command=self.resize, - ) - menubar.add_cascade(label="View", underline=0, menu=viewmenu) - - animatemenu = Menu(menubar, tearoff=0) - animatemenu.add_radiobutton( - label="No Animation", underline=0, variable=self._animate, value=0 - ) - animatemenu.add_radiobutton( - label="Slow Animation", - underline=0, - variable=self._animate, - value=20, - accelerator="-", - ) - animatemenu.add_radiobutton( - label="Normal Animation", - underline=0, - variable=self._animate, - value=10, - accelerator="=", - ) - animatemenu.add_radiobutton( - label="Fast Animation", - underline=0, - variable=self._animate, - value=4, - accelerator="+", - ) - menubar.add_cascade(label="Animate", underline=1, menu=animatemenu) - - 
helpmenu = Menu(menubar, tearoff=0) - helpmenu.add_command(label="About", underline=0, command=self.about) - helpmenu.add_command( - label="Instructions", underline=0, command=self.help, accelerator="F1" - ) - menubar.add_cascade(label="Help", underline=0, menu=helpmenu) - - parent.config(menu=menubar) - - def _init_feedback(self, parent): - self._feedbackframe = feedbackframe = Frame(parent) - feedbackframe.pack(fill="x", side="bottom", padx=3, pady=3) - self._lastoper_label = Label( - feedbackframe, text="Last Operation:", font=self._font - ) - self._lastoper_label.pack(side="left") - lastoperframe = Frame(feedbackframe, relief="sunken", border=1) - lastoperframe.pack(fill="x", side="right", expand=1, padx=5) - self._lastoper1 = Label( - lastoperframe, foreground="#007070", background="#f0f0f0", font=self._font - ) - self._lastoper2 = Label( - lastoperframe, - anchor="w", - width=30, - foreground="#004040", - background="#f0f0f0", - font=self._font, - ) - self._lastoper1.pack(side="left") - self._lastoper2.pack(side="left", fill="x", expand=1) - - def _init_canvas(self, parent): - self._cframe = CanvasFrame( - parent, - background="white", - width=525, - closeenough=10, - border=2, - relief="sunken", - ) - self._cframe.pack(expand=1, fill="both", side="top", pady=2) - canvas = self._canvas = self._cframe.canvas() - - self._stackwidgets = [] - self._rtextwidgets = [] - self._titlebar = canvas.create_rectangle( - 0, 0, 0, 0, fill="#c0f0f0", outline="black" - ) - self._exprline = canvas.create_line(0, 0, 0, 0, dash=".") - self._stacktop = canvas.create_line(0, 0, 0, 0, fill="#408080") - size = self._size.get() + 4 - self._stacklabel = TextWidget( - canvas, "Stack", color="#004040", font=self._boldfont - ) - self._rtextlabel = TextWidget( - canvas, "Remaining Text", color="#004040", font=self._boldfont - ) - self._cframe.add_widget(self._stacklabel) - self._cframe.add_widget(self._rtextlabel) - - ######################################### - ## Main draw procedure - ######################################### - - def _redraw(self): - scrollregion = self._canvas["scrollregion"].split() - (cx1, cy1, cx2, cy2) = (int(c) for c in scrollregion) - - # Delete the old stack & rtext widgets. - for stackwidget in self._stackwidgets: - self._cframe.destroy_widget(stackwidget) - self._stackwidgets = [] - for rtextwidget in self._rtextwidgets: - self._cframe.destroy_widget(rtextwidget) - self._rtextwidgets = [] - - # Position the titlebar & exprline - (x1, y1, x2, y2) = self._stacklabel.bbox() - y = y2 - y1 + 10 - self._canvas.coords(self._titlebar, -5000, 0, 5000, y - 4) - self._canvas.coords(self._exprline, 0, y * 2 - 10, 5000, y * 2 - 10) - - # Position the titlebar labels.. - (x1, y1, x2, y2) = self._stacklabel.bbox() - self._stacklabel.move(5 - x1, 3 - y1) - (x1, y1, x2, y2) = self._rtextlabel.bbox() - self._rtextlabel.move(cx2 - x2 - 5, 3 - y1) - - # Draw the stack. - stackx = 5 - for tok in self._parser.stack(): - if isinstance(tok, Tree): - attribs = { - "tree_color": "#4080a0", - "tree_width": 2, - "node_font": self._boldfont, - "node_color": "#006060", - "leaf_color": "#006060", - "leaf_font": self._font, - } - widget = tree_to_treesegment(self._canvas, tok, **attribs) - widget.label()["color"] = "#000000" - else: - widget = TextWidget(self._canvas, tok, color="#000000", font=self._font) - widget.bind_click(self._popup_reduce) - self._stackwidgets.append(widget) - self._cframe.add_widget(widget, stackx, y) - stackx = widget.bbox()[2] + 10 - - # Draw the remaining text. 
- rtextwidth = 0 - for tok in self._parser.remaining_text(): - widget = TextWidget(self._canvas, tok, color="#000000", font=self._font) - self._rtextwidgets.append(widget) - self._cframe.add_widget(widget, rtextwidth, y) - rtextwidth = widget.bbox()[2] + 4 - - # Allow enough room to shift the next token (for animations) - if len(self._rtextwidgets) > 0: - stackx += self._rtextwidgets[0].width() - - # Move the remaining text to the correct location (keep it - # right-justified, when possible); and move the remaining text - # label, if necessary. - stackx = max(stackx, self._stacklabel.width() + 25) - rlabelwidth = self._rtextlabel.width() + 10 - if stackx >= cx2 - max(rtextwidth, rlabelwidth): - cx2 = stackx + max(rtextwidth, rlabelwidth) - for rtextwidget in self._rtextwidgets: - rtextwidget.move(4 + cx2 - rtextwidth, 0) - self._rtextlabel.move(cx2 - self._rtextlabel.bbox()[2] - 5, 0) - - midx = (stackx + cx2 - max(rtextwidth, rlabelwidth)) / 2 - self._canvas.coords(self._stacktop, midx, 0, midx, 5000) - (x1, y1, x2, y2) = self._stacklabel.bbox() - - # Set up binding to allow them to shift a token by dragging it. - if len(self._rtextwidgets) > 0: - - def drag_shift(widget, midx=midx, self=self): - if widget.bbox()[0] < midx: - self.shift() - else: - self._redraw() - - self._rtextwidgets[0].bind_drag(drag_shift) - self._rtextwidgets[0].bind_click(self.shift) - - # Draw the stack top. - self._highlight_productions() - - def _draw_stack_top(self, widget): - # hack.. - midx = widget.bbox()[2] + 50 - self._canvas.coords(self._stacktop, midx, 0, midx, 5000) - - def _highlight_productions(self): - # Highlight the productions that can be reduced. - self._prodlist.selection_clear(0, "end") - for prod in self._parser.reducible_productions(): - index = self._productions.index(prod) - self._prodlist.selection_set(index) - - ######################################### - ## Button Callbacks - ######################################### - - def destroy(self, *e): - if self._top is None: - return - self._top.destroy() - self._top = None - - def reset(self, *e): - self._parser.initialize(self._sent) - self._lastoper1["text"] = "Reset App" - self._lastoper2["text"] = "" - self._redraw() - - def step(self, *e): - if self.reduce(): - return True - elif self.shift(): - return True - else: - if list(self._parser.parses()): - self._lastoper1["text"] = "Finished:" - self._lastoper2["text"] = "Success" - else: - self._lastoper1["text"] = "Finished:" - self._lastoper2["text"] = "Failure" - - def shift(self, *e): - if self._animating_lock: - return - if self._parser.shift(): - tok = self._parser.stack()[-1] - self._lastoper1["text"] = "Shift:" - self._lastoper2["text"] = "%r" % tok - if self._animate.get(): - self._animate_shift() - else: - self._redraw() - return True - return False - - def reduce(self, *e): - if self._animating_lock: - return - production = self._parser.reduce() - if production: - self._lastoper1["text"] = "Reduce:" - self._lastoper2["text"] = "%s" % production - if self._animate.get(): - self._animate_reduce() - else: - self._redraw() - return production - - def undo(self, *e): - if self._animating_lock: - return - if self._parser.undo(): - self._redraw() - - def postscript(self, *e): - self._cframe.print_to_file() - - def mainloop(self, *args, **kwargs): - """ - Enter the Tkinter mainloop. This function must be called if - this demo is created from a non-interactive program (e.g. - from a secript); otherwise, the demo will close as soon as - the script completes. 
- """ - if in_idle(): - return - self._top.mainloop(*args, **kwargs) - - ######################################### - ## Menubar callbacks - ######################################### - - def resize(self, size=None): - if size is not None: - self._size.set(size) - size = self._size.get() - self._font.configure(size=-(abs(size))) - self._boldfont.configure(size=-(abs(size))) - self._sysfont.configure(size=-(abs(size))) - - # self._stacklabel['font'] = ('helvetica', -size-4, 'bold') - # self._rtextlabel['font'] = ('helvetica', -size-4, 'bold') - # self._lastoper_label['font'] = ('helvetica', -size) - # self._lastoper1['font'] = ('helvetica', -size) - # self._lastoper2['font'] = ('helvetica', -size) - # self._prodlist['font'] = ('helvetica', -size) - # self._prodlist_label['font'] = ('helvetica', -size-2, 'bold') - self._redraw() - - def help(self, *e): - # The default font's not very legible; try using 'fixed' instead. - try: - ShowText( - self._top, - "Help: Shift-Reduce Parser Application", - (__doc__ or "").strip(), - width=75, - font="fixed", - ) - except: - ShowText( - self._top, - "Help: Shift-Reduce Parser Application", - (__doc__ or "").strip(), - width=75, - ) - - def about(self, *e): - ABOUT = "NLTK Shift-Reduce Parser Application\n" + "Written by Edward Loper" - TITLE = "About: Shift-Reduce Parser Application" - try: - from tkinter.messagebox import Message - - Message(message=ABOUT, title=TITLE).show() - except: - ShowText(self._top, TITLE, ABOUT) - - def edit_grammar(self, *e): - CFGEditor(self._top, self._parser.grammar(), self.set_grammar) - - def set_grammar(self, grammar): - self._parser.set_grammar(grammar) - self._productions = list(grammar.productions()) - self._prodlist.delete(0, "end") - for production in self._productions: - self._prodlist.insert("end", (" %s" % production)) - - def edit_sentence(self, *e): - sentence = " ".join(self._sent) - title = "Edit Text" - instr = "Enter a new sentence to parse." - EntryDialog(self._top, sentence, instr, self.set_sentence, title) - - def set_sentence(self, sent): - self._sent = sent.split() # [XX] use tagged? - self.reset() - - ######################################### - ## Reduce Production Selection - ######################################### - - def _toggle_grammar(self, *e): - if self._show_grammar.get(): - self._prodframe.pack( - fill="both", side="left", padx=2, after=self._feedbackframe - ) - self._lastoper1["text"] = "Show Grammar" - else: - self._prodframe.pack_forget() - self._lastoper1["text"] = "Hide Grammar" - self._lastoper2["text"] = "" - - def _prodlist_select(self, event): - selection = self._prodlist.curselection() - if len(selection) != 1: - return - index = int(selection[0]) - production = self._parser.reduce(self._productions[index]) - if production: - self._lastoper1["text"] = "Reduce:" - self._lastoper2["text"] = "%s" % production - if self._animate.get(): - self._animate_reduce() - else: - self._redraw() - else: - # Reset the production selections. - self._prodlist.selection_clear(0, "end") - for prod in self._parser.reducible_productions(): - index = self._productions.index(prod) - self._prodlist.selection_set(index) - - def _popup_reduce(self, widget): - # Remove old commands. 
- productions = self._parser.reducible_productions() - if len(productions) == 0: - return - - self._reduce_menu.delete(0, "end") - for production in productions: - self._reduce_menu.add_command(label=str(production), command=self.reduce) - self._reduce_menu.post( - self._canvas.winfo_pointerx(), self._canvas.winfo_pointery() - ) - - ######################################### - ## Animations - ######################################### - - def _animate_shift(self): - # What widget are we shifting? - widget = self._rtextwidgets[0] - - # Where are we shifting from & to? - right = widget.bbox()[0] - if len(self._stackwidgets) == 0: - left = 5 - else: - left = self._stackwidgets[-1].bbox()[2] + 10 - - # Start animating. - dt = self._animate.get() - dx = (left - right) * 1.0 / dt - self._animate_shift_frame(dt, widget, dx) - - def _animate_shift_frame(self, frame, widget, dx): - if frame > 0: - self._animating_lock = 1 - widget.move(dx, 0) - self._top.after(10, self._animate_shift_frame, frame - 1, widget, dx) - else: - # but: stacktop?? - - # Shift the widget to the stack. - del self._rtextwidgets[0] - self._stackwidgets.append(widget) - self._animating_lock = 0 - - # Display the available productions. - self._draw_stack_top(widget) - self._highlight_productions() - - def _animate_reduce(self): - # What widgets are we shifting? - numwidgets = len(self._parser.stack()[-1]) # number of children - widgets = self._stackwidgets[-numwidgets:] - - # How far are we moving? - if isinstance(widgets[0], TreeSegmentWidget): - ydist = 15 + widgets[0].label().height() - else: - ydist = 15 + widgets[0].height() - - # Start animating. - dt = self._animate.get() - dy = ydist * 2.0 / dt - self._animate_reduce_frame(dt / 2, widgets, dy) - - def _animate_reduce_frame(self, frame, widgets, dy): - if frame > 0: - self._animating_lock = 1 - for widget in widgets: - widget.move(0, dy) - self._top.after(10, self._animate_reduce_frame, frame - 1, widgets, dy) - else: - del self._stackwidgets[-len(widgets) :] - for widget in widgets: - self._cframe.remove_widget(widget) - tok = self._parser.stack()[-1] - if not isinstance(tok, Tree): - raise ValueError() - label = TextWidget( - self._canvas, str(tok.label()), color="#006060", font=self._boldfont - ) - widget = TreeSegmentWidget(self._canvas, label, widgets, width=2) - (x1, y1, x2, y2) = self._stacklabel.bbox() - y = y2 - y1 + 10 - if not self._stackwidgets: - x = 5 - else: - x = self._stackwidgets[-1].bbox()[2] + 10 - self._cframe.add_widget(widget, x, y) - self._stackwidgets.append(widget) - - # Display the available productions. - self._draw_stack_top(widget) - self._highlight_productions() - - # # Delete the old widgets.. - # del self._stackwidgets[-len(widgets):] - # for widget in widgets: - # self._cframe.destroy_widget(widget) - # - # # Make a new one. 
- # tok = self._parser.stack()[-1] - # if isinstance(tok, Tree): - # attribs = {'tree_color': '#4080a0', 'tree_width': 2, - # 'node_font': bold, 'node_color': '#006060', - # 'leaf_color': '#006060', 'leaf_font':self._font} - # widget = tree_to_treesegment(self._canvas, tok.type(), - # **attribs) - # widget.node()['color'] = '#000000' - # else: - # widget = TextWidget(self._canvas, tok.type(), - # color='#000000', font=self._font) - # widget.bind_click(self._popup_reduce) - # (x1, y1, x2, y2) = self._stacklabel.bbox() - # y = y2-y1+10 - # if not self._stackwidgets: x = 5 - # else: x = self._stackwidgets[-1].bbox()[2] + 10 - # self._cframe.add_widget(widget, x, y) - # self._stackwidgets.append(widget) - - # self._redraw() - self._animating_lock = 0 - - ######################################### - ## Hovering. - ######################################### - - def _highlight_hover(self, event): - # What production are we hovering over? - index = self._prodlist.nearest(event.y) - if self._hover == index: - return - - # Clear any previous hover highlighting. - self._clear_hover() - - # If the production corresponds to an available reduction, - # highlight the stack. - selection = [int(s) for s in self._prodlist.curselection()] - if index in selection: - rhslen = len(self._productions[index].rhs()) - for stackwidget in self._stackwidgets[-rhslen:]: - if isinstance(stackwidget, TreeSegmentWidget): - stackwidget.label()["color"] = "#00a000" - else: - stackwidget["color"] = "#00a000" - - # Remember what production we're hovering over. - self._hover = index - - def _clear_hover(self, *event): - # Clear any previous hover highlighting. - if self._hover == -1: - return - self._hover = -1 - for stackwidget in self._stackwidgets: - if isinstance(stackwidget, TreeSegmentWidget): - stackwidget.label()["color"] = "black" - else: - stackwidget["color"] = "black" - - -def app(): - """ - Create a shift reduce parser app, using a simple grammar and - text. 
- """ - - from nltk.grammar import CFG, Nonterminal, Production - - nonterminals = "S VP NP PP P N Name V Det" - (S, VP, NP, PP, P, N, Name, V, Det) = (Nonterminal(s) for s in nonterminals.split()) - - productions = ( - # Syntactic Productions - Production(S, [NP, VP]), - Production(NP, [Det, N]), - Production(NP, [NP, PP]), - Production(VP, [VP, PP]), - Production(VP, [V, NP, PP]), - Production(VP, [V, NP]), - Production(PP, [P, NP]), - # Lexical Productions - Production(NP, ["I"]), - Production(Det, ["the"]), - Production(Det, ["a"]), - Production(N, ["man"]), - Production(V, ["saw"]), - Production(P, ["in"]), - Production(P, ["with"]), - Production(N, ["park"]), - Production(N, ["dog"]), - Production(N, ["statue"]), - Production(Det, ["my"]), - ) - - grammar = CFG(S, productions) - - # tokenize the sentence - sent = "my dog saw a man in the park with a statue".split() - - ShiftReduceApp(grammar, sent).mainloop() - - -if __name__ == "__main__": - app() - -__all__ = ["app"] diff --git a/pipeline/nltk/app/wordfreq_app.py b/pipeline/nltk/app/wordfreq_app.py deleted file mode 100644 index 2846b31216be4611aeabb539782137f2f0decac7..0000000000000000000000000000000000000000 --- a/pipeline/nltk/app/wordfreq_app.py +++ /dev/null @@ -1,36 +0,0 @@ -# Natural Language Toolkit: Wordfreq Application -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Sumukh Ghodke -# URL: -# For license information, see LICENSE.TXT - -from matplotlib import pylab - -from nltk.corpus import gutenberg -from nltk.text import Text - - -def plot_word_freq_dist(text): - fd = text.vocab() - - samples = [item for item, _ in fd.most_common(50)] - values = [fd[sample] for sample in samples] - values = [sum(values[: i + 1]) * 100.0 / fd.N() for i in range(len(values))] - pylab.title(text.name) - pylab.xlabel("Samples") - pylab.ylabel("Cumulative Percentage") - pylab.plot(values) - pylab.xticks(range(len(samples)), [str(s) for s in samples], rotation=90) - pylab.show() - - -def app(): - t1 = Text(gutenberg.words("melville-moby_dick.txt")) - plot_word_freq_dist(t1) - - -if __name__ == "__main__": - app() - -__all__ = ["app"] diff --git a/pipeline/nltk/app/wordnet_app.py b/pipeline/nltk/app/wordnet_app.py deleted file mode 100644 index afed38b947d0ec231fe4d6f2f56614358d98c7b2..0000000000000000000000000000000000000000 --- a/pipeline/nltk/app/wordnet_app.py +++ /dev/null @@ -1,1005 +0,0 @@ -# Natural Language Toolkit: WordNet Browser Application -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Jussi Salmela -# Paul Bone -# URL: -# For license information, see LICENSE.TXT - -""" -A WordNet Browser application which launches the default browser -(if it is not already running) and opens a new tab with a connection -to http://localhost:port/ . It also starts an HTTP server on the -specified port and begins serving browser requests. The default -port is 8000. (For command-line help, run "python wordnet -h") -This application requires that the user's web browser supports -Javascript. - -BrowServer is a server for browsing the NLTK Wordnet database It first -launches a browser client to be used for browsing and then starts -serving the requests of that and maybe other clients - -Usage:: - - browserver.py -h - browserver.py [-s] [-p ] - -Options:: - - -h or --help - Display this help message. - - -l or --log-file - Logs messages to the given file, If this option is not specified - messages are silently dropped. - - -p or --port - Run the web server on this TCP port, defaults to 8000. 
- - -s or --server-mode - Do not start a web browser, and do not allow a user to - shutdown the server through the web interface. -""" -# TODO: throughout this package variable names and docstrings need -# modifying to be compliant with NLTK's coding standards. Tests also -# need to be develop to ensure this continues to work in the face of -# changes to other NLTK packages. - -import base64 -import copy -import getopt -import io -import os -import pickle -import sys -import threading -import time -import webbrowser -from collections import defaultdict -from http.server import BaseHTTPRequestHandler, HTTPServer - -# Allow this program to run inside the NLTK source tree. -from sys import argv -from urllib.parse import unquote_plus - -from nltk.corpus import wordnet as wn -from nltk.corpus.reader.wordnet import Lemma, Synset - -firstClient = True - -# True if we're not also running a web browser. The value f server_mode -# gets set by demo(). -server_mode = None - -# If set this is a file object for writing log messages. -logfile = None - - -class MyServerHandler(BaseHTTPRequestHandler): - def do_HEAD(self): - self.send_head() - - def do_GET(self): - global firstClient - sp = self.path[1:] - if unquote_plus(sp) == "SHUTDOWN THE SERVER": - if server_mode: - page = "Server must be killed with SIGTERM." - type = "text/plain" - else: - print("Server shutting down!") - os._exit(0) - - elif sp == "": # First request. - type = "text/html" - if not server_mode and firstClient: - firstClient = False - page = get_static_index_page(True) - else: - page = get_static_index_page(False) - word = "green" - - elif sp.endswith(".html"): # Trying to fetch a HTML file TODO: - type = "text/html" - usp = unquote_plus(sp) - if usp == "NLTK Wordnet Browser Database Info.html": - word = "* Database Info *" - if os.path.isfile(usp): - with open(usp) as infile: - page = infile.read() - else: - page = ( - (html_header % word) + "

The database info file:"
-                    + usp
-                    + "was not found. Run this:"
-                    + "python dbinfo_html.py"
-                    + "
to produce it." - + html_trailer - ) - else: - # Handle files here. - word = sp - try: - page = get_static_page_by_path(usp) - except FileNotFoundError: - page = "Internal error: Path for static page '%s' is unknown" % usp - # Set type to plain to prevent XSS by printing the path as HTML - type = "text/plain" - elif sp.startswith("search"): - # This doesn't seem to work with MWEs. - type = "text/html" - parts = (sp.split("?")[1]).split("&") - word = [ - p.split("=")[1].replace("+", " ") - for p in parts - if p.startswith("nextWord") - ][0] - page, word = page_from_word(word) - elif sp.startswith("lookup_"): - # TODO add a variation of this that takes a non ecoded word or MWE. - type = "text/html" - sp = sp[len("lookup_") :] - page, word = page_from_href(sp) - elif sp == "start_page": - # if this is the first request we should display help - # information, and possibly set a default word. - type = "text/html" - page, word = page_from_word("wordnet") - else: - type = "text/plain" - page = "Could not parse request: '%s'" % sp - - # Send result. - self.send_head(type) - self.wfile.write(page.encode("utf8")) - - def send_head(self, type=None): - self.send_response(200) - self.send_header("Content-type", type) - self.end_headers() - - def log_message(self, format, *args): - global logfile - - if logfile: - logfile.write( - "%s - - [%s] %s\n" - % (self.address_string(), self.log_date_time_string(), format % args) - ) - - -def get_unique_counter_from_url(sp): - """ - Extract the unique counter from the URL if it has one. Otherwise return - null. - """ - pos = sp.rfind("%23") - if pos != -1: - return int(sp[(pos + 3) :]) - else: - return None - - -def wnb(port=8000, runBrowser=True, logfilename=None): - """ - Run NLTK Wordnet Browser Server. - - :param port: The port number for the server to listen on, defaults to - 8000 - :type port: int - - :param runBrowser: True to start a web browser and point it at the web - server. - :type runBrowser: bool - """ - # The webbrowser module is unpredictable, typically it blocks if it uses - # a console web browser, and doesn't block if it uses a GUI webbrowser, - # so we need to force it to have a clear correct behaviour. - # - # Normally the server should run for as long as the user wants. they - # should idealy be able to control this from the UI by closing the - # window or tab. Second best would be clicking a button to say - # 'Shutdown' that first shutsdown the server and closes the window or - # tab, or exits the text-mode browser. Both of these are unfreasable. - # - # The next best alternative is to start the server, have it close when - # it receives SIGTERM (default), and run the browser as well. The user - # may have to shutdown both programs. - # - # Since webbrowser may block, and the webserver will block, we must run - # them in separate threads. - # - global server_mode, logfile - server_mode = not runBrowser - - # Setup logging. - if logfilename: - try: - logfile = open(logfilename, "a", 1) # 1 means 'line buffering' - except OSError as e: - sys.stderr.write("Couldn't open %s for writing: %s", logfilename, e) - sys.exit(1) - else: - logfile = None - - # Compute URL and start web browser - url = "http://localhost:" + str(port) - - server_ready = None - browser_thread = None - - if runBrowser: - server_ready = threading.Event() - browser_thread = startBrowser(url, server_ready) - - # Start the server. 
- server = HTTPServer(("", port), MyServerHandler) - if logfile: - logfile.write("NLTK Wordnet browser server running serving: %s\n" % url) - if runBrowser: - server_ready.set() - - try: - server.serve_forever() - except KeyboardInterrupt: - pass - - if runBrowser: - browser_thread.join() - - if logfile: - logfile.close() - - -def startBrowser(url, server_ready): - def run(): - server_ready.wait() - time.sleep(1) # Wait a little bit more, there's still the chance of - # a race condition. - webbrowser.open(url, new=2, autoraise=1) - - t = threading.Thread(target=run) - t.start() - return t - - -##################################################################### -# Utilities -##################################################################### - - -""" -WordNet Browser Utilities. - -This provides a backend to both wxbrowse and browserver.py. -""" - -################################################################################ -# -# Main logic for wordnet browser. -# - -# This is wrapped inside a function since wn is only available if the -# WordNet corpus is installed. -def _pos_tuples(): - return [ - (wn.NOUN, "N", "noun"), - (wn.VERB, "V", "verb"), - (wn.ADJ, "J", "adj"), - (wn.ADV, "R", "adv"), - ] - - -def _pos_match(pos_tuple): - """ - This function returns the complete pos tuple for the partial pos - tuple given to it. It attempts to match it against the first - non-null component of the given pos tuple. - """ - if pos_tuple[0] == "s": - pos_tuple = ("a", pos_tuple[1], pos_tuple[2]) - for n, x in enumerate(pos_tuple): - if x is not None: - break - for pt in _pos_tuples(): - if pt[n] == pos_tuple[n]: - return pt - return None - - -HYPONYM = 0 -HYPERNYM = 1 -CLASS_REGIONAL = 2 -PART_HOLONYM = 3 -PART_MERONYM = 4 -ATTRIBUTE = 5 -SUBSTANCE_HOLONYM = 6 -SUBSTANCE_MERONYM = 7 -MEMBER_HOLONYM = 8 -MEMBER_MERONYM = 9 -VERB_GROUP = 10 -INSTANCE_HYPONYM = 12 -INSTANCE_HYPERNYM = 13 -CAUSE = 14 -ALSO_SEE = 15 -SIMILAR = 16 -ENTAILMENT = 17 -ANTONYM = 18 -FRAMES = 19 -PERTAINYM = 20 - -CLASS_CATEGORY = 21 -CLASS_USAGE = 22 -CLASS_REGIONAL = 23 -CLASS_USAGE = 24 -CLASS_CATEGORY = 11 - -DERIVATIONALLY_RELATED_FORM = 25 - -INDIRECT_HYPERNYMS = 26 - - -def lemma_property(word, synset, func): - def flattern(l): - if l == []: - return [] - else: - return l[0] + flattern(l[1:]) - - return flattern([func(l) for l in synset.lemmas() if l.name == word]) - - -def rebuild_tree(orig_tree): - node = orig_tree[0] - children = orig_tree[1:] - return (node, [rebuild_tree(t) for t in children]) - - -def get_relations_data(word, synset): - """ - Get synset relations data for a synset. Note that this doesn't - yet support things such as full hyponym vs direct hyponym. 
- """ - if synset.pos() == wn.NOUN: - return ( - (HYPONYM, "Hyponyms", synset.hyponyms()), - (INSTANCE_HYPONYM, "Instance hyponyms", synset.instance_hyponyms()), - (HYPERNYM, "Direct hypernyms", synset.hypernyms()), - ( - INDIRECT_HYPERNYMS, - "Indirect hypernyms", - rebuild_tree(synset.tree(lambda x: x.hypernyms()))[1], - ), - # hypernyms', 'Sister terms', - (INSTANCE_HYPERNYM, "Instance hypernyms", synset.instance_hypernyms()), - # (CLASS_REGIONAL, ['domain term region'], ), - (PART_HOLONYM, "Part holonyms", synset.part_holonyms()), - (PART_MERONYM, "Part meronyms", synset.part_meronyms()), - (SUBSTANCE_HOLONYM, "Substance holonyms", synset.substance_holonyms()), - (SUBSTANCE_MERONYM, "Substance meronyms", synset.substance_meronyms()), - (MEMBER_HOLONYM, "Member holonyms", synset.member_holonyms()), - (MEMBER_MERONYM, "Member meronyms", synset.member_meronyms()), - (ATTRIBUTE, "Attributes", synset.attributes()), - (ANTONYM, "Antonyms", lemma_property(word, synset, lambda l: l.antonyms())), - ( - DERIVATIONALLY_RELATED_FORM, - "Derivationally related form", - lemma_property( - word, synset, lambda l: l.derivationally_related_forms() - ), - ), - ) - elif synset.pos() == wn.VERB: - return ( - (ANTONYM, "Antonym", lemma_property(word, synset, lambda l: l.antonyms())), - (HYPONYM, "Hyponym", synset.hyponyms()), - (HYPERNYM, "Direct hypernyms", synset.hypernyms()), - ( - INDIRECT_HYPERNYMS, - "Indirect hypernyms", - rebuild_tree(synset.tree(lambda x: x.hypernyms()))[1], - ), - (ENTAILMENT, "Entailments", synset.entailments()), - (CAUSE, "Causes", synset.causes()), - (ALSO_SEE, "Also see", synset.also_sees()), - (VERB_GROUP, "Verb Groups", synset.verb_groups()), - ( - DERIVATIONALLY_RELATED_FORM, - "Derivationally related form", - lemma_property( - word, synset, lambda l: l.derivationally_related_forms() - ), - ), - ) - elif synset.pos() == wn.ADJ or synset.pos == wn.ADJ_SAT: - return ( - (ANTONYM, "Antonym", lemma_property(word, synset, lambda l: l.antonyms())), - (SIMILAR, "Similar to", synset.similar_tos()), - # Participle of verb - not supported by corpus - ( - PERTAINYM, - "Pertainyms", - lemma_property(word, synset, lambda l: l.pertainyms()), - ), - (ATTRIBUTE, "Attributes", synset.attributes()), - (ALSO_SEE, "Also see", synset.also_sees()), - ) - elif synset.pos() == wn.ADV: - # This is weird. adverbs such as 'quick' and 'fast' don't seem - # to have antonyms returned by the corpus.a - return ( - (ANTONYM, "Antonym", lemma_property(word, synset, lambda l: l.antonyms())), - ) - # Derived from adjective - not supported by corpus - else: - raise TypeError("Unhandles synset POS type: " + str(synset.pos())) - - -html_header = """ - - - - - -NLTK Wordnet Browser display of: %s - -""" -html_trailer = """ - - -""" - -explanation = """ -

-Search Help
-  • The display below the line is an example of the output the browser
-    shows you when you enter a search word. The search word was green.
-  • The search result shows for different parts of speech the synsets
-    i.e. different meanings for the word.
-  • All underlined texts are hypertext links. There are two types of links:
-    word links and others. Clicking a word link carries out a search for the
-    word in the Wordnet database.
-  • Clicking a link of the other type opens a display section of data attached
-    to that link. Clicking that link a second time closes the section again.
-  • Clicking S: opens a section showing the relations for that synset.
-  • Clicking on a relation name opens a section that displays the associated
-    synsets.
-  • Type a search word in the Word field and start the search by the
-    Enter/Return key or click the Search button.
-""" - -# HTML oriented functions - - -def _bold(txt): - return "%s" % txt - - -def _center(txt): - return "
%s
" % txt - - -def _hlev(n, txt): - return "%s" % (n, txt, n) - - -def _italic(txt): - return "%s" % txt - - -def _li(txt): - return "
  • %s
  • " % txt - - -def pg(word, body): - """ - Return a HTML page of NLTK Browser format constructed from the - word and body - - :param word: The word that the body corresponds to - :type word: str - :param body: The HTML body corresponding to the word - :type body: str - :return: a HTML page for the word-body combination - :rtype: str - """ - return (html_header % word) + body + html_trailer - - -def _ul(txt): - return "
      " + txt + "
    " - - -def _abbc(txt): - """ - abbc = asterisks, breaks, bold, center - """ - return _center(_bold("
    " * 10 + "*" * 10 + " " + txt + " " + "*" * 10)) - - -full_hyponym_cont_text = _ul(_li(_italic("(has full hyponym continuation)"))) + "\n" - - -def _get_synset(synset_key): - """ - The synset key is the unique name of the synset, this can be - retrieved via synset.name() - """ - return wn.synset(synset_key) - - -def _collect_one_synset(word, synset, synset_relations): - """ - Returns the HTML string for one synset or word - - :param word: the current word - :type word: str - :param synset: a synset - :type synset: synset - :param synset_relations: information about which synset relations - to display. - :type synset_relations: dict(synset_key, set(relation_id)) - :return: The HTML string built for this synset - :rtype: str - """ - if isinstance(synset, tuple): # It's a word - raise NotImplementedError("word not supported by _collect_one_synset") - - typ = "S" - pos_tuple = _pos_match((synset.pos(), None, None)) - assert pos_tuple is not None, "pos_tuple is null: synset.pos(): %s" % synset.pos() - descr = pos_tuple[2] - ref = copy.deepcopy(Reference(word, synset_relations)) - ref.toggle_synset(synset) - synset_label = typ + ";" - if synset.name() in synset_relations: - synset_label = _bold(synset_label) - s = f"
  • {make_lookup_link(ref, synset_label)} ({descr}) " - - def format_lemma(w): - w = w.replace("_", " ") - if w.lower() == word: - return _bold(w) - else: - ref = Reference(w) - return make_lookup_link(ref, w) - - s += ", ".join(format_lemma(l.name()) for l in synset.lemmas()) - - gl = " ({}) {} ".format( - synset.definition(), - "; ".join('"%s"' % e for e in synset.examples()), - ) - return s + gl + _synset_relations(word, synset, synset_relations) + "
  • \n" - - -def _collect_all_synsets(word, pos, synset_relations=dict()): - """ - Return a HTML unordered list of synsets for the given word and - part of speech. - """ - return "
      %s\n
    \n" % "".join( - _collect_one_synset(word, synset, synset_relations) - for synset in wn.synsets(word, pos) - ) - - -def _synset_relations(word, synset, synset_relations): - """ - Builds the HTML string for the relations of a synset - - :param word: The current word - :type word: str - :param synset: The synset for which we're building the relations. - :type synset: Synset - :param synset_relations: synset keys and relation types for which to display relations. - :type synset_relations: dict(synset_key, set(relation_type)) - :return: The HTML for a synset's relations - :rtype: str - """ - - if not synset.name() in synset_relations: - return "" - ref = Reference(word, synset_relations) - - def relation_html(r): - if isinstance(r, Synset): - return make_lookup_link(Reference(r.lemma_names()[0]), r.lemma_names()[0]) - elif isinstance(r, Lemma): - return relation_html(r.synset()) - elif isinstance(r, tuple): - # It's probably a tuple containing a Synset and a list of - # similar tuples. This forms a tree of synsets. - return "{}\n
      {}
    \n".format( - relation_html(r[0]), - "".join("
  • %s
  • \n" % relation_html(sr) for sr in r[1]), - ) - else: - raise TypeError( - "r must be a synset, lemma or list, it was: type(r) = %s, r = %s" - % (type(r), r) - ) - - def make_synset_html(db_name, disp_name, rels): - synset_html = "%s\n" % make_lookup_link( - copy.deepcopy(ref).toggle_synset_relation(synset, db_name), - disp_name, - ) - - if db_name in ref.synset_relations[synset.name()]: - synset_html += "
      %s
    \n" % "".join( - "
  • %s
  • \n" % relation_html(r) for r in rels - ) - - return synset_html - - html = ( - "
      " - + "\n".join( - "
    • %s
    • " % make_synset_html(*rel_data) - for rel_data in get_relations_data(word, synset) - if rel_data[2] != [] - ) - + "
    " - ) - - return html - - -class RestrictedUnpickler(pickle.Unpickler): - """ - Unpickler that prevents any class or function from being used during loading. - """ - - def find_class(self, module, name): - # Forbid every function - raise pickle.UnpicklingError(f"global '{module}.{name}' is forbidden") - - -class Reference: - """ - A reference to a page that may be generated by page_word - """ - - def __init__(self, word, synset_relations=dict()): - """ - Build a reference to a new page. - - word is the word or words (separated by commas) for which to - search for synsets of - - synset_relations is a dictionary of synset keys to sets of - synset relation identifaiers to unfold a list of synset - relations for. - """ - self.word = word - self.synset_relations = synset_relations - - def encode(self): - """ - Encode this reference into a string to be used in a URL. - """ - # This uses a tuple rather than an object since the python - # pickle representation is much smaller and there is no need - # to represent the complete object. - string = pickle.dumps((self.word, self.synset_relations), -1) - return base64.urlsafe_b64encode(string).decode() - - @staticmethod - def decode(string): - """ - Decode a reference encoded with Reference.encode - """ - string = base64.urlsafe_b64decode(string.encode()) - word, synset_relations = RestrictedUnpickler(io.BytesIO(string)).load() - return Reference(word, synset_relations) - - def toggle_synset_relation(self, synset, relation): - """ - Toggle the display of the relations for the given synset and - relation type. - - This function will throw a KeyError if the synset is currently - not being displayed. - """ - if relation in self.synset_relations[synset.name()]: - self.synset_relations[synset.name()].remove(relation) - else: - self.synset_relations[synset.name()].add(relation) - - return self - - def toggle_synset(self, synset): - """ - Toggle displaying of the relation types for the given synset - """ - if synset.name() in self.synset_relations: - del self.synset_relations[synset.name()] - else: - self.synset_relations[synset.name()] = set() - - return self - - -def make_lookup_link(ref, label): - return f'{label}' - - -def page_from_word(word): - """ - Return a HTML page for the given word. - - :type word: str - :param word: The currently active word - :return: A tuple (page,word), where page is the new current HTML page - to be sent to the browser and - word is the new current word - :rtype: A tuple (str,str) - """ - return page_from_reference(Reference(word)) - - -def page_from_href(href): - """ - Returns a tuple of the HTML page built and the new current word - - :param href: The hypertext reference to be solved - :type href: str - :return: A tuple (page,word), where page is the new current HTML page - to be sent to the browser and - word is the new current word - :rtype: A tuple (str,str) - """ - return page_from_reference(Reference.decode(href)) - - -def page_from_reference(href): - """ - Returns a tuple of the HTML page built and the new current word - - :param href: The hypertext reference to be solved - :type href: str - :return: A tuple (page,word), where page is the new current HTML page - to be sent to the browser and - word is the new current word - :rtype: A tuple (str,str) - """ - word = href.word - pos_forms = defaultdict(list) - words = word.split(",") - words = [w for w in [w.strip().lower().replace(" ", "_") for w in words] if w != ""] - if len(words) == 0: - # No words were found. - return "", "Please specify a word to search for." 
- - # This looks up multiple words at once. This is probably not - # necessary and may lead to problems. - for w in words: - for pos in [wn.NOUN, wn.VERB, wn.ADJ, wn.ADV]: - form = wn.morphy(w, pos) - if form and form not in pos_forms[pos]: - pos_forms[pos].append(form) - body = "" - for pos, pos_str, name in _pos_tuples(): - if pos in pos_forms: - body += _hlev(3, name) + "\n" - for w in pos_forms[pos]: - # Not all words of exc files are in the database, skip - # to the next word if a KeyError is raised. - try: - body += _collect_all_synsets(w, pos, href.synset_relations) - except KeyError: - pass - if not body: - body = "The word or words '%s' were not found in the dictionary." % word - return body, word - - -##################################################################### -# Static pages -##################################################################### - - -def get_static_page_by_path(path): - """ - Return a static HTML page from the path given. - """ - if path == "index_2.html": - return get_static_index_page(False) - elif path == "index.html": - return get_static_index_page(True) - elif path == "NLTK Wordnet Browser Database Info.html": - return "Display of Wordnet Database Statistics is not supported" - elif path == "upper_2.html": - return get_static_upper_page(False) - elif path == "upper.html": - return get_static_upper_page(True) - elif path == "web_help.html": - return get_static_web_help_page() - elif path == "wx_help.html": - return get_static_wx_help_page() - raise FileNotFoundError() - - -def get_static_web_help_page(): - """ - Return the static web help page. - """ - return """ - - - - - - NLTK Wordnet Browser display of: * Help * - - -

-    NLTK Wordnet Browser Help
-
-    The NLTK Wordnet Browser is a tool for browsing the Wordnet database. It
-    tries to behave like the Wordnet project's web browser, but the difference
-    is that the NLTK Wordnet Browser uses a local Wordnet database.
-
-    You are using the Javascript client part of the NLTK Wordnet BrowseServer.
-    We assume your browser is running with tabbed browsing enabled.
-
-    For background information on Wordnet, see the Wordnet project home page:
-    https://wordnet.princeton.edu/. For more information on the NLTK project,
-    see the project home: https://www.nltk.org/. To get an idea of what the
-    Wordnet version used by this browser includes, choose Show Database Info
-    from the View submenu.
-
-    Word search
-
-    The word to be searched is typed into the New Word field and the search is
-    started with Enter or by clicking the Search button. There is no
-    uppercase/lowercase distinction: the search word is transformed to
-    lowercase before the search.
-
-    In addition, the word does not have to be in base form. The browser tries
-    to find the possible base form(s) by making certain morphological
-    substitutions. Typing fLIeS as an obscure example gives one this. Click
-    the previous link to see what this kind of search looks like and then come
-    back to this page by using the Alt+LeftArrow key combination.
-
-    The result of a search is a display of one or more synsets for every part
-    of speech in which a form of the search word was found to occur. A synset
-    is a set of words having the same sense or meaning. Each word in a synset
-    that is underlined is a hyperlink which can be clicked to trigger an
-    automatic search for that word.
-
-    Every synset has a hyperlink S: at the start of its display line. Clicking
-    that symbol shows you the name of every relation that this synset is part
-    of. Every relation name is a hyperlink that opens up a display for that
-    relation. Clicking it another time closes the display again. Clicking
-    another relation name on a line that has an opened relation closes the
-    open relation and opens the clicked relation.
-
-    It is also possible to give two or more words or collocations to be
-    searched at the same time, separating them with a comma, for example
-    cheer up,clear up. Click the previous link to see what this kind of search
-    looks like and then come back to this page by using the Alt+LeftArrow key
-    combination. As you can see, the search result includes the synsets in the
-    same order as the forms were given in the search field.
-
-    There are also word-level (lexical) relations recorded in the Wordnet
-    database. Opening this kind of relation displays lines with a hyperlink W:
-    at their beginning. Clicking this link shows more info on the word in
-    question.
-
-    The Buttons
-
-    The Search and Help buttons need no more explanation.
-
-    The Show Database Info button shows a collection of Wordnet database
-    statistics.
-
-    The Shutdown the Server button is shown only for the first client of the
-    BrowServer program, i.e. for the client that is automatically launched
-    when the BrowServer is started, but not for the succeeding clients, in
-    order to protect the server from accidental shutdowns.
    - -""" - - -def get_static_welcome_message(): - """ - Get the static welcome page. - """ - return """ -

-    Search Help
-    • The display below the line is an example of the output the browser
-      shows you when you enter a search word. The search word was green.
-    • The search result shows for different parts of speech the synsets
-      i.e. different meanings for the word.
-    • All underlined texts are hypertext links. There are two types of links:
-      word links and others. Clicking a word link carries out a search for the
-      word in the Wordnet database.
-    • Clicking a link of the other type opens a display section of data
-      attached to that link. Clicking that link a second time closes the
-      section again.
-    • Clicking S: opens a section showing the relations for that synset.
-    • Clicking on a relation name opens a section that displays the associated
-      synsets.
-    • Type a search word in the Next Word field and start the search by the
-      Enter/Return key or click the Search button.
    -""" - - -def get_static_index_page(with_shutdown): - """ - Get the static index page. - """ - template = """ - - - - - NLTK Wordnet Browser - - - - - - - -""" - if with_shutdown: - upper_link = "upper.html" - else: - upper_link = "upper_2.html" - - return template % upper_link - - -def get_static_upper_page(with_shutdown): - """ - Return the upper frame page, - - If with_shutdown is True then a 'shutdown' button is also provided - to shutdown the server. - """ - template = """ - - - - - - Untitled Document - - -
-    Current Word:
-    Next Word:
    - Help - %s - - - -""" - if with_shutdown: - shutdown_link = 'Shutdown' - else: - shutdown_link = "" - - return template % shutdown_link - - -def usage(): - """ - Display the command line help message. - """ - print(__doc__) - - -def app(): - # Parse and interpret options. - (opts, _) = getopt.getopt( - argv[1:], "l:p:sh", ["logfile=", "port=", "server-mode", "help"] - ) - port = 8000 - server_mode = False - help_mode = False - logfilename = None - for (opt, value) in opts: - if (opt == "-l") or (opt == "--logfile"): - logfilename = str(value) - elif (opt == "-p") or (opt == "--port"): - port = int(value) - elif (opt == "-s") or (opt == "--server-mode"): - server_mode = True - elif (opt == "-h") or (opt == "--help"): - help_mode = True - - if help_mode: - usage() - else: - wnb(port, not server_mode, logfilename) - - -if __name__ == "__main__": - app() - -__all__ = ["app"] diff --git a/pipeline/nltk/book.py b/pipeline/nltk/book.py deleted file mode 100644 index 704f84d426fdf87b4233454c8ceb9915d7db3161..0000000000000000000000000000000000000000 --- a/pipeline/nltk/book.py +++ /dev/null @@ -1,213 +0,0 @@ -# Natural Language Toolkit: Some texts for exploration in chapter 1 of the book -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# -# URL: -# For license information, see LICENSE.TXT - -from nltk.corpus import ( - genesis, - gutenberg, - inaugural, - nps_chat, - treebank, - webtext, - wordnet, -) -from nltk.probability import FreqDist -from nltk.text import Text -from nltk.util import bigrams - -print("*** Introductory Examples for the NLTK Book ***") -print("Loading text1, ..., text9 and sent1, ..., sent9") -print("Type the name of the text or sentence to view it.") -print("Type: 'texts()' or 'sents()' to list the materials.") - -text1 = Text(gutenberg.words("melville-moby_dick.txt")) -print("text1:", text1.name) - -text2 = Text(gutenberg.words("austen-sense.txt")) -print("text2:", text2.name) - -text3 = Text(genesis.words("english-kjv.txt"), name="The Book of Genesis") -print("text3:", text3.name) - -text4 = Text(inaugural.words(), name="Inaugural Address Corpus") -print("text4:", text4.name) - -text5 = Text(nps_chat.words(), name="Chat Corpus") -print("text5:", text5.name) - -text6 = Text(webtext.words("grail.txt"), name="Monty Python and the Holy Grail") -print("text6:", text6.name) - -text7 = Text(treebank.words(), name="Wall Street Journal") -print("text7:", text7.name) - -text8 = Text(webtext.words("singles.txt"), name="Personals Corpus") -print("text8:", text8.name) - -text9 = Text(gutenberg.words("chesterton-thursday.txt")) -print("text9:", text9.name) - - -def texts(): - print("text1:", text1.name) - print("text2:", text2.name) - print("text3:", text3.name) - print("text4:", text4.name) - print("text5:", text5.name) - print("text6:", text6.name) - print("text7:", text7.name) - print("text8:", text8.name) - print("text9:", text9.name) - - -sent1 = ["Call", "me", "Ishmael", "."] -sent2 = [ - "The", - "family", - "of", - "Dashwood", - "had", - "long", - "been", - "settled", - "in", - "Sussex", - ".", -] -sent3 = [ - "In", - "the", - "beginning", - "God", - "created", - "the", - "heaven", - "and", - "the", - "earth", - ".", -] -sent4 = [ - "Fellow", - "-", - "Citizens", - "of", - "the", - "Senate", - "and", - "of", - "the", - "House", - "of", - "Representatives", - ":", -] -sent5 = [ - "I", - "have", - "a", - "problem", - "with", - "people", - "PMing", - "me", - "to", - "lol", - "JOIN", -] -sent6 = [ - "SCENE", - "1", - ":", - "[", - "wind", - "]", - "[", - "clop", 
- "clop", - "clop", - "]", - "KING", - "ARTHUR", - ":", - "Whoa", - "there", - "!", -] -sent7 = [ - "Pierre", - "Vinken", - ",", - "61", - "years", - "old", - ",", - "will", - "join", - "the", - "board", - "as", - "a", - "nonexecutive", - "director", - "Nov.", - "29", - ".", -] -sent8 = [ - "25", - "SEXY", - "MALE", - ",", - "seeks", - "attrac", - "older", - "single", - "lady", - ",", - "for", - "discreet", - "encounters", - ".", -] -sent9 = [ - "THE", - "suburb", - "of", - "Saffron", - "Park", - "lay", - "on", - "the", - "sunset", - "side", - "of", - "London", - ",", - "as", - "red", - "and", - "ragged", - "as", - "a", - "cloud", - "of", - "sunset", - ".", -] - - -def sents(): - print("sent1:", " ".join(sent1)) - print("sent2:", " ".join(sent2)) - print("sent3:", " ".join(sent3)) - print("sent4:", " ".join(sent4)) - print("sent5:", " ".join(sent5)) - print("sent6:", " ".join(sent6)) - print("sent7:", " ".join(sent7)) - print("sent8:", " ".join(sent8)) - print("sent9:", " ".join(sent9)) diff --git a/pipeline/nltk/ccg/__init__.py b/pipeline/nltk/ccg/__init__.py deleted file mode 100644 index 43c5876b74dcf07ea70c9d90c1dcd41971e515a4..0000000000000000000000000000000000000000 --- a/pipeline/nltk/ccg/__init__.py +++ /dev/null @@ -1,34 +0,0 @@ -# Natural Language Toolkit: Combinatory Categorial Grammar -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Graeme Gange -# URL: -# For license information, see LICENSE.TXT - -""" -Combinatory Categorial Grammar. - -For more information see nltk/doc/contrib/ccg/ccg.pdf -""" - -from nltk.ccg.chart import CCGChart, CCGChartParser, CCGEdge, CCGLeafEdge -from nltk.ccg.combinator import ( - BackwardApplication, - BackwardBx, - BackwardCombinator, - BackwardComposition, - BackwardSx, - BackwardT, - DirectedBinaryCombinator, - ForwardApplication, - ForwardCombinator, - ForwardComposition, - ForwardSubstitution, - ForwardT, - UndirectedBinaryCombinator, - UndirectedComposition, - UndirectedFunctionApplication, - UndirectedSubstitution, - UndirectedTypeRaise, -) -from nltk.ccg.lexicon import CCGLexicon diff --git a/pipeline/nltk/ccg/__pycache__/__init__.cpython-39.pyc b/pipeline/nltk/ccg/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index 9e500e45ecc7877aeb232ee0373811a32c9065c5..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/ccg/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/ccg/__pycache__/api.cpython-39.pyc b/pipeline/nltk/ccg/__pycache__/api.cpython-39.pyc deleted file mode 100644 index b7c9dc75ca2c913b6b65a72df693b00f27050e1f..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/ccg/__pycache__/api.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/ccg/__pycache__/chart.cpython-39.pyc b/pipeline/nltk/ccg/__pycache__/chart.cpython-39.pyc deleted file mode 100644 index 2ea0524f5d7cd8a85bd055971c32b685f9c64644..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/ccg/__pycache__/chart.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/ccg/__pycache__/combinator.cpython-39.pyc b/pipeline/nltk/ccg/__pycache__/combinator.cpython-39.pyc deleted file mode 100644 index b198b5a9997e057396940980ac57674f1c4cc1db..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/ccg/__pycache__/combinator.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/ccg/__pycache__/lexicon.cpython-39.pyc b/pipeline/nltk/ccg/__pycache__/lexicon.cpython-39.pyc deleted file mode 100644 index 
ca0e7ec5afc31ab7cd959dec4210086fc56b4121..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/ccg/__pycache__/lexicon.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/ccg/__pycache__/logic.cpython-39.pyc b/pipeline/nltk/ccg/__pycache__/logic.cpython-39.pyc deleted file mode 100644 index 941549ef875f8044a18b77a01db4944448937071..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/ccg/__pycache__/logic.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/ccg/api.py b/pipeline/nltk/ccg/api.py deleted file mode 100644 index f0d1355cfadca031cf0017584d819fe794ffaea3..0000000000000000000000000000000000000000 --- a/pipeline/nltk/ccg/api.py +++ /dev/null @@ -1,358 +0,0 @@ -# Natural Language Toolkit: CCG Categories -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Graeme Gange -# URL: -# For license information, see LICENSE.TXT - -from abc import ABCMeta, abstractmethod -from functools import total_ordering - -from nltk.internals import raise_unorderable_types - - -@total_ordering -class AbstractCCGCategory(metaclass=ABCMeta): - """ - Interface for categories in combinatory grammars. - """ - - @abstractmethod - def is_primitive(self): - """ - Returns true if the category is primitive. - """ - - @abstractmethod - def is_function(self): - """ - Returns true if the category is a function application. - """ - - @abstractmethod - def is_var(self): - """ - Returns true if the category is a variable. - """ - - @abstractmethod - def substitute(self, substitutions): - """ - Takes a set of (var, category) substitutions, and replaces every - occurrence of the variable with the corresponding category. - """ - - @abstractmethod - def can_unify(self, other): - """ - Determines whether two categories can be unified. - - Returns None if they cannot be unified - - Returns a list of necessary substitutions if they can. - """ - - # Utility functions: comparison, strings and hashing. - @abstractmethod - def __str__(self): - pass - - def __eq__(self, other): - return ( - self.__class__ is other.__class__ - and self._comparison_key == other._comparison_key - ) - - def __ne__(self, other): - return not self == other - - def __lt__(self, other): - if not isinstance(other, AbstractCCGCategory): - raise_unorderable_types("<", self, other) - if self.__class__ is other.__class__: - return self._comparison_key < other._comparison_key - else: - return self.__class__.__name__ < other.__class__.__name__ - - def __hash__(self): - try: - return self._hash - except AttributeError: - self._hash = hash(self._comparison_key) - return self._hash - - -class CCGVar(AbstractCCGCategory): - """ - Class representing a variable CCG category. - Used for conjunctions (and possibly type-raising, if implemented as a - unary rule). - """ - - _maxID = 0 - - def __init__(self, prim_only=False): - """Initialize a variable (selects a new identifier) - - :param prim_only: a boolean that determines whether the variable is - restricted to primitives - :type prim_only: bool - """ - self._id = self.new_id() - self._prim_only = prim_only - self._comparison_key = self._id - - @classmethod - def new_id(cls): - """ - A class method allowing generation of unique variable identifiers. 
- """ - cls._maxID = cls._maxID + 1 - return cls._maxID - 1 - - @classmethod - def reset_id(cls): - cls._maxID = 0 - - def is_primitive(self): - return False - - def is_function(self): - return False - - def is_var(self): - return True - - def substitute(self, substitutions): - """If there is a substitution corresponding to this variable, - return the substituted category. - """ - for (var, cat) in substitutions: - if var == self: - return cat - return self - - def can_unify(self, other): - """If the variable can be replaced with other - a substitution is returned. - """ - if other.is_primitive() or not self._prim_only: - return [(self, other)] - return None - - def id(self): - return self._id - - def __str__(self): - return "_var" + str(self._id) - - -@total_ordering -class Direction: - """ - Class representing the direction of a function application. - Also contains maintains information as to which combinators - may be used with the category. - """ - - def __init__(self, dir, restrictions): - self._dir = dir - self._restrs = restrictions - self._comparison_key = (dir, tuple(restrictions)) - - # Testing the application direction - def is_forward(self): - return self._dir == "/" - - def is_backward(self): - return self._dir == "\\" - - def dir(self): - return self._dir - - def restrs(self): - """A list of restrictions on the combinators. - '.' denotes that permuting operations are disallowed - ',' denotes that function composition is disallowed - '_' denotes that the direction has variable restrictions. - (This is redundant in the current implementation of type-raising) - """ - return self._restrs - - def is_variable(self): - return self._restrs == "_" - - # Unification and substitution of variable directions. - # Used only if type-raising is implemented as a unary rule, as it - # must inherit restrictions from the argument category. - def can_unify(self, other): - if other.is_variable(): - return [("_", self.restrs())] - elif self.is_variable(): - return [("_", other.restrs())] - else: - if self.restrs() == other.restrs(): - return [] - return None - - def substitute(self, subs): - if not self.is_variable(): - return self - - for (var, restrs) in subs: - if var == "_": - return Direction(self._dir, restrs) - return self - - # Testing permitted combinators - def can_compose(self): - return "," not in self._restrs - - def can_cross(self): - return "." not in self._restrs - - def __eq__(self, other): - return ( - self.__class__ is other.__class__ - and self._comparison_key == other._comparison_key - ) - - def __ne__(self, other): - return not self == other - - def __lt__(self, other): - if not isinstance(other, Direction): - raise_unorderable_types("<", self, other) - if self.__class__ is other.__class__: - return self._comparison_key < other._comparison_key - else: - return self.__class__.__name__ < other.__class__.__name__ - - def __hash__(self): - try: - return self._hash - except AttributeError: - self._hash = hash(self._comparison_key) - return self._hash - - def __str__(self): - r_str = "" - for r in self._restrs: - r_str = r_str + "%s" % r - return f"{self._dir}{r_str}" - - # The negation operator reverses the direction of the application - def __neg__(self): - if self._dir == "/": - return Direction("\\", self._restrs) - else: - return Direction("/", self._restrs) - - -class PrimitiveCategory(AbstractCCGCategory): - """ - Class representing primitive categories. - Takes a string representation of the category, and a - list of strings specifying the morphological subcategories. 
- """ - - def __init__(self, categ, restrictions=[]): - self._categ = categ - self._restrs = restrictions - self._comparison_key = (categ, tuple(restrictions)) - - def is_primitive(self): - return True - - def is_function(self): - return False - - def is_var(self): - return False - - def restrs(self): - return self._restrs - - def categ(self): - return self._categ - - # Substitution does nothing to a primitive category - def substitute(self, subs): - return self - - # A primitive can be unified with a class of the same - # base category, given that the other category shares all - # of its subclasses, or with a variable. - def can_unify(self, other): - if not other.is_primitive(): - return None - if other.is_var(): - return [(other, self)] - if other.categ() == self.categ(): - for restr in self._restrs: - if restr not in other.restrs(): - return None - return [] - return None - - def __str__(self): - if self._restrs == []: - return "%s" % self._categ - restrictions = "[%s]" % ",".join(repr(r) for r in self._restrs) - return f"{self._categ}{restrictions}" - - -class FunctionalCategory(AbstractCCGCategory): - """ - Class that represents a function application category. - Consists of argument and result categories, together with - an application direction. - """ - - def __init__(self, res, arg, dir): - self._res = res - self._arg = arg - self._dir = dir - self._comparison_key = (arg, dir, res) - - def is_primitive(self): - return False - - def is_function(self): - return True - - def is_var(self): - return False - - # Substitution returns the category consisting of the - # substitution applied to each of its constituents. - def substitute(self, subs): - sub_res = self._res.substitute(subs) - sub_dir = self._dir.substitute(subs) - sub_arg = self._arg.substitute(subs) - return FunctionalCategory(sub_res, sub_arg, self._dir) - - # A function can unify with another function, so long as its - # constituents can unify, or with an unrestricted variable. - def can_unify(self, other): - if other.is_var(): - return [(other, self)] - if other.is_function(): - sa = self._res.can_unify(other.res()) - sd = self._dir.can_unify(other.dir()) - if sa is not None and sd is not None: - sb = self._arg.substitute(sa).can_unify(other.arg().substitute(sa)) - if sb is not None: - return sa + sb - return None - - # Constituent accessors - def arg(self): - return self._arg - - def res(self): - return self._res - - def dir(self): - return self._dir - - def __str__(self): - return f"({self._res}{self._dir}{self._arg})" diff --git a/pipeline/nltk/ccg/chart.py b/pipeline/nltk/ccg/chart.py deleted file mode 100644 index bf9e61036199016f89e89a8b0980d38d856ac4dd..0000000000000000000000000000000000000000 --- a/pipeline/nltk/ccg/chart.py +++ /dev/null @@ -1,480 +0,0 @@ -# Natural Language Toolkit: Combinatory Categorial Grammar -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Graeme Gange -# URL: -# For license information, see LICENSE.TXT - -""" -The lexicon is constructed by calling -``lexicon.fromstring()``. - -In order to construct a parser, you also need a rule set. -The standard English rules are provided in chart as -``chart.DefaultRuleSet``. - -The parser can then be constructed by calling, for example: -``parser = chart.CCGChartParser(, )`` - -Parsing is then performed by running -``parser.parse(.split())``. - -While this returns a list of trees, the default representation -of the produced trees is not very enlightening, particularly -given that it uses the same tree class as the CFG parsers. 
-It is probably better to call: -``chart.printCCGDerivation()`` -which should print a nice representation of the derivation. - -This entire process is shown far more clearly in the demonstration: -python chart.py -""" - -import itertools - -from nltk.ccg.combinator import * -from nltk.ccg.combinator import ( - BackwardApplication, - BackwardBx, - BackwardComposition, - BackwardSx, - BackwardT, - ForwardApplication, - ForwardComposition, - ForwardSubstitution, - ForwardT, -) -from nltk.ccg.lexicon import Token, fromstring -from nltk.ccg.logic import * -from nltk.parse import ParserI -from nltk.parse.chart import AbstractChartRule, Chart, EdgeI -from nltk.sem.logic import * -from nltk.tree import Tree - - -# Based on the EdgeI class from NLTK. -# A number of the properties of the EdgeI interface don't -# transfer well to CCGs, however. -class CCGEdge(EdgeI): - def __init__(self, span, categ, rule): - self._span = span - self._categ = categ - self._rule = rule - self._comparison_key = (span, categ, rule) - - # Accessors - def lhs(self): - return self._categ - - def span(self): - return self._span - - def start(self): - return self._span[0] - - def end(self): - return self._span[1] - - def length(self): - return self._span[1] - self.span[0] - - def rhs(self): - return () - - def dot(self): - return 0 - - def is_complete(self): - return True - - def is_incomplete(self): - return False - - def nextsym(self): - return None - - def categ(self): - return self._categ - - def rule(self): - return self._rule - - -class CCGLeafEdge(EdgeI): - """ - Class representing leaf edges in a CCG derivation. - """ - - def __init__(self, pos, token, leaf): - self._pos = pos - self._token = token - self._leaf = leaf - self._comparison_key = (pos, token.categ(), leaf) - - # Accessors - def lhs(self): - return self._token.categ() - - def span(self): - return (self._pos, self._pos + 1) - - def start(self): - return self._pos - - def end(self): - return self._pos + 1 - - def length(self): - return 1 - - def rhs(self): - return self._leaf - - def dot(self): - return 0 - - def is_complete(self): - return True - - def is_incomplete(self): - return False - - def nextsym(self): - return None - - def token(self): - return self._token - - def categ(self): - return self._token.categ() - - def leaf(self): - return self._leaf - - -class BinaryCombinatorRule(AbstractChartRule): - """ - Class implementing application of a binary combinator to a chart. - Takes the directed combinator to apply. - """ - - NUMEDGES = 2 - - def __init__(self, combinator): - self._combinator = combinator - - # Apply a combinator - def apply(self, chart, grammar, left_edge, right_edge): - # The left & right edges must be touching. - if not (left_edge.end() == right_edge.start()): - return - - # Check if the two edges are permitted to combine. - # If so, generate the corresponding edge. - if self._combinator.can_combine(left_edge.categ(), right_edge.categ()): - for res in self._combinator.combine(left_edge.categ(), right_edge.categ()): - new_edge = CCGEdge( - span=(left_edge.start(), right_edge.end()), - categ=res, - rule=self._combinator, - ) - if chart.insert(new_edge, (left_edge, right_edge)): - yield new_edge - - # The representation of the combinator (for printing derivations) - def __str__(self): - return "%s" % self._combinator - - -# Type-raising must be handled slightly differently to the other rules, as the -# resulting rules only span a single edge, rather than both edges. 
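The module docstring of the deleted chart.py (quoted above) describes the intended workflow: build a lexicon with fromstring, construct a CCGChartParser with a rule set, parse, and print the derivation. Since this vendored copy is being removed, a minimal usage sketch against the upstream nltk.ccg package is shown below for reference; it assumes NLTK is installed separately, and the tiny lexicon is illustrative only (it mirrors the demo lexicon at the end of the deleted file, not anything specific to this repository).

# Usage sketch for the upstream nltk.ccg chart parser (assumes `pip install nltk`).
from nltk.ccg import chart, lexicon

# A small illustrative lexicon; comments after '#' are stripped by fromstring().
lex = lexicon.fromstring(
    """
    :- S, NP, N, VP        # Primitive categories; S is the target
    Det :: NP/N            # Family of words
    Pro :: NP
    TV :: VP/NP
    Modal :: (S\\NP)/VP    # Backslashes must be escaped

    I => Pro
    the => Det
    might => Modal
    cook => TV
    bacon => N
    """
)

# Build the parser with the standard English rule set and print each derivation.
parser = chart.CCGChartParser(lex, chart.DefaultRuleSet)
for parse in parser.parse("I might cook the bacon".split()):
    chart.printCCGDerivation(parse)
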
- - -class ForwardTypeRaiseRule(AbstractChartRule): - """ - Class for applying forward type raising - """ - - NUMEDGES = 2 - - def __init__(self): - self._combinator = ForwardT - - def apply(self, chart, grammar, left_edge, right_edge): - if not (left_edge.end() == right_edge.start()): - return - - for res in self._combinator.combine(left_edge.categ(), right_edge.categ()): - new_edge = CCGEdge(span=left_edge.span(), categ=res, rule=self._combinator) - if chart.insert(new_edge, (left_edge,)): - yield new_edge - - def __str__(self): - return "%s" % self._combinator - - -class BackwardTypeRaiseRule(AbstractChartRule): - """ - Class for applying backward type raising. - """ - - NUMEDGES = 2 - - def __init__(self): - self._combinator = BackwardT - - def apply(self, chart, grammar, left_edge, right_edge): - if not (left_edge.end() == right_edge.start()): - return - - for res in self._combinator.combine(left_edge.categ(), right_edge.categ()): - new_edge = CCGEdge(span=right_edge.span(), categ=res, rule=self._combinator) - if chart.insert(new_edge, (right_edge,)): - yield new_edge - - def __str__(self): - return "%s" % self._combinator - - -# Common sets of combinators used for English derivations. -ApplicationRuleSet = [ - BinaryCombinatorRule(ForwardApplication), - BinaryCombinatorRule(BackwardApplication), -] -CompositionRuleSet = [ - BinaryCombinatorRule(ForwardComposition), - BinaryCombinatorRule(BackwardComposition), - BinaryCombinatorRule(BackwardBx), -] -SubstitutionRuleSet = [ - BinaryCombinatorRule(ForwardSubstitution), - BinaryCombinatorRule(BackwardSx), -] -TypeRaiseRuleSet = [ForwardTypeRaiseRule(), BackwardTypeRaiseRule()] - -# The standard English rule set. -DefaultRuleSet = ( - ApplicationRuleSet + CompositionRuleSet + SubstitutionRuleSet + TypeRaiseRuleSet -) - - -class CCGChartParser(ParserI): - """ - Chart parser for CCGs. - Based largely on the ChartParser class from NLTK. - """ - - def __init__(self, lexicon, rules, trace=0): - self._lexicon = lexicon - self._rules = rules - self._trace = trace - - def lexicon(self): - return self._lexicon - - # Implements the CYK algorithm - def parse(self, tokens): - tokens = list(tokens) - chart = CCGChart(list(tokens)) - lex = self._lexicon - - # Initialize leaf edges. - for index in range(chart.num_leaves()): - for token in lex.categories(chart.leaf(index)): - new_edge = CCGLeafEdge(index, token, chart.leaf(index)) - chart.insert(new_edge, ()) - - # Select a span for the new edges - for span in range(2, chart.num_leaves() + 1): - for start in range(0, chart.num_leaves() - span + 1): - # Try all possible pairs of edges that could generate - # an edge for that span - for part in range(1, span): - lstart = start - mid = start + part - rend = start + span - - for left in chart.select(span=(lstart, mid)): - for right in chart.select(span=(mid, rend)): - # Generate all possible combinations of the two edges - for rule in self._rules: - edges_added_by_rule = 0 - for newedge in rule.apply(chart, lex, left, right): - edges_added_by_rule += 1 - - # Output the resulting parses - return chart.parses(lex.start()) - - -class CCGChart(Chart): - def __init__(self, tokens): - Chart.__init__(self, tokens) - - # Constructs the trees for a given parse. 
Unfortnunately, the parse trees need to be - # constructed slightly differently to those in the default Chart class, so it has to - # be reimplemented - def _trees(self, edge, complete, memo, tree_class): - assert complete, "CCGChart cannot build incomplete trees" - - if edge in memo: - return memo[edge] - - if isinstance(edge, CCGLeafEdge): - word = tree_class(edge.token(), [self._tokens[edge.start()]]) - leaf = tree_class((edge.token(), "Leaf"), [word]) - memo[edge] = [leaf] - return [leaf] - - memo[edge] = [] - trees = [] - - for cpl in self.child_pointer_lists(edge): - child_choices = [self._trees(cp, complete, memo, tree_class) for cp in cpl] - for children in itertools.product(*child_choices): - lhs = ( - Token( - self._tokens[edge.start() : edge.end()], - edge.lhs(), - compute_semantics(children, edge), - ), - str(edge.rule()), - ) - trees.append(tree_class(lhs, children)) - - memo[edge] = trees - return trees - - -def compute_semantics(children, edge): - if children[0].label()[0].semantics() is None: - return None - - if len(children) == 2: - if isinstance(edge.rule(), BackwardCombinator): - children = [children[1], children[0]] - - combinator = edge.rule()._combinator - function = children[0].label()[0].semantics() - argument = children[1].label()[0].semantics() - - if isinstance(combinator, UndirectedFunctionApplication): - return compute_function_semantics(function, argument) - elif isinstance(combinator, UndirectedComposition): - return compute_composition_semantics(function, argument) - elif isinstance(combinator, UndirectedSubstitution): - return compute_substitution_semantics(function, argument) - else: - raise AssertionError("Unsupported combinator '" + combinator + "'") - else: - return compute_type_raised_semantics(children[0].label()[0].semantics()) - - -# -------- -# Displaying derivations -# -------- -def printCCGDerivation(tree): - # Get the leaves and initial categories - leafcats = tree.pos() - leafstr = "" - catstr = "" - - # Construct a string with both the leaf word and corresponding - # category aligned. - for (leaf, cat) in leafcats: - str_cat = "%s" % cat - nextlen = 2 + max(len(leaf), len(str_cat)) - lcatlen = (nextlen - len(str_cat)) // 2 - rcatlen = lcatlen + (nextlen - len(str_cat)) % 2 - catstr += " " * lcatlen + str_cat + " " * rcatlen - lleaflen = (nextlen - len(leaf)) // 2 - rleaflen = lleaflen + (nextlen - len(leaf)) % 2 - leafstr += " " * lleaflen + leaf + " " * rleaflen - print(leafstr.rstrip()) - print(catstr.rstrip()) - - # Display the derivation steps - printCCGTree(0, tree) - - -# Prints the sequence of derivation steps. -def printCCGTree(lwidth, tree): - rwidth = lwidth - - # Is a leaf (word). - # Increment the span by the space occupied by the leaf. - if not isinstance(tree, Tree): - return 2 + lwidth + len(tree) - - # Find the width of the current derivation step - for child in tree: - rwidth = max(rwidth, printCCGTree(rwidth, child)) - - # Is a leaf node. - # Don't print anything, but account for the space occupied. - if not isinstance(tree.label(), tuple): - return max( - rwidth, 2 + lwidth + len("%s" % tree.label()), 2 + lwidth + len(tree[0]) - ) - - (token, op) = tree.label() - - if op == "Leaf": - return rwidth - - # Pad to the left with spaces, followed by a sequence of '-' - # and the derivation rule. - print(lwidth * " " + (rwidth - lwidth) * "-" + "%s" % op) - # Print the resulting category on a new line. 
- str_res = "%s" % (token.categ()) - if token.semantics() is not None: - str_res += " {" + str(token.semantics()) + "}" - respadlen = (rwidth - lwidth - len(str_res)) // 2 + lwidth - print(respadlen * " " + str_res) - return rwidth - - -### Demonstration code - -# Construct the lexicon -lex = fromstring( - """ - :- S, NP, N, VP # Primitive categories, S is the target primitive - - Det :: NP/N # Family of words - Pro :: NP - TV :: VP/NP - Modal :: (S\\NP)/VP # Backslashes need to be escaped - - I => Pro # Word -> Category mapping - you => Pro - - the => Det - - # Variables have the special keyword 'var' - # '.' prevents permutation - # ',' prevents composition - and => var\\.,var/.,var - - which => (N\\N)/(S/NP) - - will => Modal # Categories can be either explicit, or families. - might => Modal - - cook => TV - eat => TV - - mushrooms => N - parsnips => N - bacon => N - """ -) - - -def demo(): - parser = CCGChartParser(lex, DefaultRuleSet) - for parse in parser.parse("I might cook and eat the bacon".split()): - printCCGDerivation(parse) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/ccg/combinator.py b/pipeline/nltk/ccg/combinator.py deleted file mode 100644 index 6efe6adf40d1aea7c98df1aceccdf9cf5c7b5c31..0000000000000000000000000000000000000000 --- a/pipeline/nltk/ccg/combinator.py +++ /dev/null @@ -1,339 +0,0 @@ -# Natural Language Toolkit: Combinatory Categorial Grammar -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Graeme Gange -# URL: -# For license information, see LICENSE.TXT -""" -CCG Combinators -""" - -from abc import ABCMeta, abstractmethod - -from nltk.ccg.api import FunctionalCategory - - -class UndirectedBinaryCombinator(metaclass=ABCMeta): - """ - Abstract class for representing a binary combinator. - Merely defines functions for checking if the function and argument - are able to be combined, and what the resulting category is. - - Note that as no assumptions are made as to direction, the unrestricted - combinators can perform all backward, forward and crossed variations - of the combinators; these restrictions must be added in the rule - class. - """ - - @abstractmethod - def can_combine(self, function, argument): - pass - - @abstractmethod - def combine(self, function, argument): - pass - - -class DirectedBinaryCombinator(metaclass=ABCMeta): - """ - Wrapper for the undirected binary combinator. - It takes left and right categories, and decides which is to be - the function, and which the argument. - It then decides whether or not they can be combined. - """ - - @abstractmethod - def can_combine(self, left, right): - pass - - @abstractmethod - def combine(self, left, right): - pass - - -class ForwardCombinator(DirectedBinaryCombinator): - """ - Class representing combinators where the primary functor is on the left. - - Takes an undirected combinator, and a predicate which adds constraints - restricting the cases in which it may apply. - """ - - def __init__(self, combinator, predicate, suffix=""): - self._combinator = combinator - self._predicate = predicate - self._suffix = suffix - - def can_combine(self, left, right): - return self._combinator.can_combine(left, right) and self._predicate( - left, right - ) - - def combine(self, left, right): - yield from self._combinator.combine(left, right) - - def __str__(self): - return f">{self._combinator}{self._suffix}" - - -class BackwardCombinator(DirectedBinaryCombinator): - """ - The backward equivalent of the ForwardCombinator class. 
- """ - - def __init__(self, combinator, predicate, suffix=""): - self._combinator = combinator - self._predicate = predicate - self._suffix = suffix - - def can_combine(self, left, right): - return self._combinator.can_combine(right, left) and self._predicate( - left, right - ) - - def combine(self, left, right): - yield from self._combinator.combine(right, left) - - def __str__(self): - return f"<{self._combinator}{self._suffix}" - - -class UndirectedFunctionApplication(UndirectedBinaryCombinator): - """ - Class representing function application. - Implements rules of the form: - X/Y Y -> X (>) - And the corresponding backwards application rule - """ - - def can_combine(self, function, argument): - if not function.is_function(): - return False - - return not function.arg().can_unify(argument) is None - - def combine(self, function, argument): - if not function.is_function(): - return - - subs = function.arg().can_unify(argument) - if subs is None: - return - - yield function.res().substitute(subs) - - def __str__(self): - return "" - - -# Predicates for function application. - -# Ensures the left functor takes an argument on the right -def forwardOnly(left, right): - return left.dir().is_forward() - - -# Ensures the right functor takes an argument on the left -def backwardOnly(left, right): - return right.dir().is_backward() - - -# Application combinator instances -ForwardApplication = ForwardCombinator(UndirectedFunctionApplication(), forwardOnly) -BackwardApplication = BackwardCombinator(UndirectedFunctionApplication(), backwardOnly) - - -class UndirectedComposition(UndirectedBinaryCombinator): - """ - Functional composition (harmonic) combinator. - Implements rules of the form - X/Y Y/Z -> X/Z (B>) - And the corresponding backwards and crossed variations. - """ - - def can_combine(self, function, argument): - # Can only combine two functions, and both functions must - # allow composition. - if not (function.is_function() and argument.is_function()): - return False - if function.dir().can_compose() and argument.dir().can_compose(): - return not function.arg().can_unify(argument.res()) is None - return False - - def combine(self, function, argument): - if not (function.is_function() and argument.is_function()): - return - if function.dir().can_compose() and argument.dir().can_compose(): - subs = function.arg().can_unify(argument.res()) - if subs is not None: - yield FunctionalCategory( - function.res().substitute(subs), - argument.arg().substitute(subs), - argument.dir(), - ) - - def __str__(self): - return "B" - - -# Predicates for restricting application of straight composition. 
-def bothForward(left, right): - return left.dir().is_forward() and right.dir().is_forward() - - -def bothBackward(left, right): - return left.dir().is_backward() and right.dir().is_backward() - - -# Predicates for crossed composition -def crossedDirs(left, right): - return left.dir().is_forward() and right.dir().is_backward() - - -def backwardBxConstraint(left, right): - # The functors must be crossed inwards - if not crossedDirs(left, right): - return False - # Permuting combinators must be allowed - if not left.dir().can_cross() and right.dir().can_cross(): - return False - # The resulting argument category is restricted to be primitive - return left.arg().is_primitive() - - -# Straight composition combinators -ForwardComposition = ForwardCombinator(UndirectedComposition(), forwardOnly) -BackwardComposition = BackwardCombinator(UndirectedComposition(), backwardOnly) - -# Backward crossed composition -BackwardBx = BackwardCombinator( - UndirectedComposition(), backwardBxConstraint, suffix="x" -) - - -class UndirectedSubstitution(UndirectedBinaryCombinator): - r""" - Substitution (permutation) combinator. - Implements rules of the form - Y/Z (X\Y)/Z -> X/Z ( N\N -def innermostFunction(categ): - while categ.res().is_function(): - categ = categ.res() - return categ - - -class UndirectedTypeRaise(UndirectedBinaryCombinator): - """ - Undirected combinator for type raising. - """ - - def can_combine(self, function, arg): - # The argument must be a function. - # The restriction that arg.res() must be a function - # merely reduces redundant type-raising; if arg.res() is - # primitive, we have: - # X Y\X =>((>) Y - # which is equivalent to - # X Y\X =>(<) Y - if not (arg.is_function() and arg.res().is_function()): - return False - - arg = innermostFunction(arg) - - # left, arg_categ are undefined! - subs = left.can_unify(arg_categ.arg()) - if subs is not None: - return True - return False - - def combine(self, function, arg): - if not ( - function.is_primitive() and arg.is_function() and arg.res().is_function() - ): - return - - # Type-raising matches only the innermost application. - arg = innermostFunction(arg) - - subs = function.can_unify(arg.arg()) - if subs is not None: - xcat = arg.res().substitute(subs) - yield FunctionalCategory( - xcat, FunctionalCategory(xcat, function, arg.dir()), -(arg.dir()) - ) - - def __str__(self): - return "T" - - -# Predicates for type-raising -# The direction of the innermost category must be towards -# the primary functor. -# The restriction that the variable must be primitive is not -# common to all versions of CCGs; some authors have other restrictions. 
-def forwardTConstraint(left, right): - arg = innermostFunction(right) - return arg.dir().is_backward() and arg.res().is_primitive() - - -def backwardTConstraint(left, right): - arg = innermostFunction(left) - return arg.dir().is_forward() and arg.res().is_primitive() - - -# Instances of type-raising combinators -ForwardT = ForwardCombinator(UndirectedTypeRaise(), forwardTConstraint) -BackwardT = BackwardCombinator(UndirectedTypeRaise(), backwardTConstraint) diff --git a/pipeline/nltk/ccg/lexicon.py b/pipeline/nltk/ccg/lexicon.py deleted file mode 100644 index da7d00ab6bcdfa190f49fe7c141a23542426ff20..0000000000000000000000000000000000000000 --- a/pipeline/nltk/ccg/lexicon.py +++ /dev/null @@ -1,338 +0,0 @@ -# Natural Language Toolkit: Combinatory Categorial Grammar -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Graeme Gange -# URL: -# For license information, see LICENSE.TXT -""" -CCG Lexicons -""" - -import re -from collections import defaultdict - -from nltk.ccg.api import CCGVar, Direction, FunctionalCategory, PrimitiveCategory -from nltk.internals import deprecated -from nltk.sem.logic import Expression - -# ------------ -# Regular expressions used for parsing components of the lexicon -# ------------ - -# Parses a primitive category and subscripts -PRIM_RE = re.compile(r"""([A-Za-z]+)(\[[A-Za-z,]+\])?""") - -# Separates the next primitive category from the remainder of the -# string -NEXTPRIM_RE = re.compile(r"""([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)""") - -# Separates the next application operator from the remainder -APP_RE = re.compile(r"""([\\/])([.,]?)([.,]?)(.*)""") - -# Parses the definition of the right-hand side (rhs) of either a word or a family -LEX_RE = re.compile(r"""([\S_]+)\s*(::|[-=]+>)\s*(.+)""", re.UNICODE) - -# Parses the right hand side that contains category and maybe semantic predicate -RHS_RE = re.compile(r"""([^{}]*[^ {}])\s*(\{[^}]+\})?""", re.UNICODE) - -# Parses the semantic predicate -SEMANTICS_RE = re.compile(r"""\{([^}]+)\}""", re.UNICODE) - -# Strips comments from a line -COMMENTS_RE = re.compile("""([^#]*)(?:#.*)?""") - - -class Token: - """ - Class representing a token. - - token => category {semantics} - e.g. eat => S\\var[pl]/var {\\x y.eat(x,y)} - - * `token` (string) - * `categ` (string) - * `semantics` (Expression) - """ - - def __init__(self, token, categ, semantics=None): - self._token = token - self._categ = categ - self._semantics = semantics - - def categ(self): - return self._categ - - def semantics(self): - return self._semantics - - def __str__(self): - semantics_str = "" - if self._semantics is not None: - semantics_str = " {" + str(self._semantics) + "}" - return "" + str(self._categ) + semantics_str - - def __cmp__(self, other): - if not isinstance(other, Token): - return -1 - return cmp((self._categ, self._semantics), other.categ(), other.semantics()) - - -class CCGLexicon: - """ - Class representing a lexicon for CCG grammars. 
- - * `primitives`: The list of primitive categories for the lexicon - * `families`: Families of categories - * `entries`: A mapping of words to possible categories - """ - - def __init__(self, start, primitives, families, entries): - self._start = PrimitiveCategory(start) - self._primitives = primitives - self._families = families - self._entries = entries - - def categories(self, word): - """ - Returns all the possible categories for a word - """ - return self._entries[word] - - def start(self): - """ - Return the target category for the parser - """ - return self._start - - def __str__(self): - """ - String representation of the lexicon. Used for debugging. - """ - string = "" - first = True - for ident in sorted(self._entries): - if not first: - string = string + "\n" - string = string + ident + " => " - - first = True - for cat in self._entries[ident]: - if not first: - string = string + " | " - else: - first = False - string = string + "%s" % cat - return string - - -# ----------- -# Parsing lexicons -# ----------- - - -def matchBrackets(string): - """ - Separate the contents matching the first set of brackets from the rest of - the input. - """ - rest = string[1:] - inside = "(" - - while rest != "" and not rest.startswith(")"): - if rest.startswith("("): - (part, rest) = matchBrackets(rest) - inside = inside + part - else: - inside = inside + rest[0] - rest = rest[1:] - if rest.startswith(")"): - return (inside + ")", rest[1:]) - raise AssertionError("Unmatched bracket in string '" + string + "'") - - -def nextCategory(string): - """ - Separate the string for the next portion of the category from the rest - of the string - """ - if string.startswith("("): - return matchBrackets(string) - return NEXTPRIM_RE.match(string).groups() - - -def parseApplication(app): - """ - Parse an application operator - """ - return Direction(app[0], app[1:]) - - -def parseSubscripts(subscr): - """ - Parse the subscripts for a primitive category - """ - if subscr: - return subscr[1:-1].split(",") - return [] - - -def parsePrimitiveCategory(chunks, primitives, families, var): - """ - Parse a primitive category - - If the primitive is the special category 'var', replace it with the - correct `CCGVar`. - """ - if chunks[0] == "var": - if chunks[1] is None: - if var is None: - var = CCGVar() - return (var, var) - - catstr = chunks[0] - if catstr in families: - (cat, cvar) = families[catstr] - if var is None: - var = cvar - else: - cat = cat.substitute([(cvar, var)]) - return (cat, var) - - if catstr in primitives: - subscrs = parseSubscripts(chunks[1]) - return (PrimitiveCategory(catstr, subscrs), var) - raise AssertionError( - "String '" + catstr + "' is neither a family nor primitive category." 
- ) - - -def augParseCategory(line, primitives, families, var=None): - """ - Parse a string representing a category, and returns a tuple with - (possibly) the CCG variable for the category - """ - (cat_string, rest) = nextCategory(line) - - if cat_string.startswith("("): - (res, var) = augParseCategory(cat_string[1:-1], primitives, families, var) - - else: - (res, var) = parsePrimitiveCategory( - PRIM_RE.match(cat_string).groups(), primitives, families, var - ) - - while rest != "": - app = APP_RE.match(rest).groups() - direction = parseApplication(app[0:3]) - rest = app[3] - - (cat_string, rest) = nextCategory(rest) - if cat_string.startswith("("): - (arg, var) = augParseCategory(cat_string[1:-1], primitives, families, var) - else: - (arg, var) = parsePrimitiveCategory( - PRIM_RE.match(cat_string).groups(), primitives, families, var - ) - res = FunctionalCategory(res, arg, direction) - - return (res, var) - - -def fromstring(lex_str, include_semantics=False): - """ - Convert string representation into a lexicon for CCGs. - """ - CCGVar.reset_id() - primitives = [] - families = {} - entries = defaultdict(list) - for line in lex_str.splitlines(): - # Strip comments and leading/trailing whitespace. - line = COMMENTS_RE.match(line).groups()[0].strip() - if line == "": - continue - - if line.startswith(":-"): - # A line of primitive categories. - # The first one is the target category - # ie, :- S, N, NP, VP - primitives = primitives + [ - prim.strip() for prim in line[2:].strip().split(",") - ] - else: - # Either a family definition, or a word definition - (ident, sep, rhs) = LEX_RE.match(line).groups() - (catstr, semantics_str) = RHS_RE.match(rhs).groups() - (cat, var) = augParseCategory(catstr, primitives, families) - - if sep == "::": - # Family definition - # ie, Det :: NP/N - families[ident] = (cat, var) - else: - semantics = None - if include_semantics is True: - if semantics_str is None: - raise AssertionError( - line - + " must contain semantics because include_semantics is set to True" - ) - else: - semantics = Expression.fromstring( - SEMANTICS_RE.match(semantics_str).groups()[0] - ) - # Word definition - # ie, which => (N\N)/(S/NP) - entries[ident].append(Token(ident, cat, semantics)) - return CCGLexicon(primitives[0], primitives, families, entries) - - -@deprecated("Use fromstring() instead.") -def parseLexicon(lex_str): - return fromstring(lex_str) - - -openccg_tinytiny = fromstring( - """ - # Rather minimal lexicon based on the openccg `tinytiny' grammar. - # Only incorporates a subset of the morphological subcategories, however. 
- :- S,NP,N # Primitive categories - Det :: NP/N # Determiners - Pro :: NP - IntransVsg :: S\\NP[sg] # Tensed intransitive verbs (singular) - IntransVpl :: S\\NP[pl] # Plural - TransVsg :: S\\NP[sg]/NP # Tensed transitive verbs (singular) - TransVpl :: S\\NP[pl]/NP # Plural - - the => NP[sg]/N[sg] - the => NP[pl]/N[pl] - - I => Pro - me => Pro - we => Pro - us => Pro - - book => N[sg] - books => N[pl] - - peach => N[sg] - peaches => N[pl] - - policeman => N[sg] - policemen => N[pl] - - boy => N[sg] - boys => N[pl] - - sleep => IntransVsg - sleep => IntransVpl - - eat => IntransVpl - eat => TransVpl - eats => IntransVsg - eats => TransVsg - - see => TransVpl - sees => TransVsg - """ -) diff --git a/pipeline/nltk/ccg/logic.py b/pipeline/nltk/ccg/logic.py deleted file mode 100644 index 2e347b7531f723b3d8fe0caa84c22e8fcb659a6c..0000000000000000000000000000000000000000 --- a/pipeline/nltk/ccg/logic.py +++ /dev/null @@ -1,60 +0,0 @@ -# Natural Language Toolkit: Combinatory Categorial Grammar -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Tanin Na Nakorn (@tanin) -# URL: -# For license information, see LICENSE.TXT -""" -Helper functions for CCG semantics computation -""" - -from nltk.sem.logic import * - - -def compute_type_raised_semantics(semantics): - core = semantics - parent = None - while isinstance(core, LambdaExpression): - parent = core - core = core.term - - var = Variable("F") - while var in core.free(): - var = unique_variable(pattern=var) - core = ApplicationExpression(FunctionVariableExpression(var), core) - - if parent is not None: - parent.term = core - else: - semantics = core - - return LambdaExpression(var, semantics) - - -def compute_function_semantics(function, argument): - return ApplicationExpression(function, argument).simplify() - - -def compute_composition_semantics(function, argument): - assert isinstance(argument, LambdaExpression), ( - "`" + str(argument) + "` must be a lambda expression" - ) - return LambdaExpression( - argument.variable, ApplicationExpression(function, argument.term).simplify() - ) - - -def compute_substitution_semantics(function, argument): - assert isinstance(function, LambdaExpression) and isinstance( - function.term, LambdaExpression - ), ("`" + str(function) + "` must be a lambda expression with 2 arguments") - assert isinstance(argument, LambdaExpression), ( - "`" + str(argument) + "` must be a lambda expression" - ) - - new_argument = ApplicationExpression( - argument, VariableExpression(function.variable) - ).simplify() - new_term = ApplicationExpression(function.term, new_argument).simplify() - - return LambdaExpression(function.variable, new_term) diff --git a/pipeline/nltk/chat/__init__.py b/pipeline/nltk/chat/__init__.py deleted file mode 100644 index 462f0b517068657d149662cf990414f203491caf..0000000000000000000000000000000000000000 --- a/pipeline/nltk/chat/__init__.py +++ /dev/null @@ -1,48 +0,0 @@ -# Natural Language Toolkit: Chatbots -# -# Copyright (C) 2001-2023 NLTK Project -# Authors: Steven Bird -# URL: -# For license information, see LICENSE.TXT - -# Based on an Eliza implementation by Joe Strout , -# Jeff Epler and Jez Higgins . - -""" -A class for simple chatbots. These perform simple pattern matching on sentences -typed by users, and respond with automatically generated sentences. - -These chatbots may not work using the windows command line or the -windows IDLE GUI. 
-""" - -from nltk.chat.eliza import eliza_chat -from nltk.chat.iesha import iesha_chat -from nltk.chat.rude import rude_chat -from nltk.chat.suntsu import suntsu_chat -from nltk.chat.util import Chat -from nltk.chat.zen import zen_chat - -bots = [ - (eliza_chat, "Eliza (psycho-babble)"), - (iesha_chat, "Iesha (teen anime junky)"), - (rude_chat, "Rude (abusive bot)"), - (suntsu_chat, "Suntsu (Chinese sayings)"), - (zen_chat, "Zen (gems of wisdom)"), -] - - -def chatbots(): - print("Which chatbot would you like to talk to?") - botcount = len(bots) - for i in range(botcount): - print(" %d: %s" % (i + 1, bots[i][1])) - while True: - choice = input(f"\nEnter a number in the range 1-{botcount}: ").strip() - if choice.isdigit() and (int(choice) - 1) in range(botcount): - break - else: - print(" Error: bad chatbot number") - - chatbot = bots[int(choice) - 1][0] - chatbot() diff --git a/pipeline/nltk/chat/__pycache__/__init__.cpython-39.pyc b/pipeline/nltk/chat/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index aa67667bd13276537efe5f32363037138a7392de..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/chat/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/chat/__pycache__/eliza.cpython-39.pyc b/pipeline/nltk/chat/__pycache__/eliza.cpython-39.pyc deleted file mode 100644 index 7ab88172136e35c556251c335bd3ea7668d6225a..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/chat/__pycache__/eliza.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/chat/__pycache__/iesha.cpython-39.pyc b/pipeline/nltk/chat/__pycache__/iesha.cpython-39.pyc deleted file mode 100644 index 26e6c20058fbb13f195b970d715e57eb62686cca..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/chat/__pycache__/iesha.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/chat/__pycache__/rude.cpython-39.pyc b/pipeline/nltk/chat/__pycache__/rude.cpython-39.pyc deleted file mode 100644 index 2113b763381665ff7f0e49d98662f6779739d4d5..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/chat/__pycache__/rude.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/chat/__pycache__/suntsu.cpython-39.pyc b/pipeline/nltk/chat/__pycache__/suntsu.cpython-39.pyc deleted file mode 100644 index 66ba02052e74f718b2f02384dc4e9e5c845885e1..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/chat/__pycache__/suntsu.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/chat/__pycache__/util.cpython-39.pyc b/pipeline/nltk/chat/__pycache__/util.cpython-39.pyc deleted file mode 100644 index 2c473bb2a5512ce447850ce3def293299ecfa1f7..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/chat/__pycache__/util.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/chat/__pycache__/zen.cpython-39.pyc b/pipeline/nltk/chat/__pycache__/zen.cpython-39.pyc deleted file mode 100644 index f1338b45c2be56814243e70189b58d49b927ce42..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/chat/__pycache__/zen.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/chat/eliza.py b/pipeline/nltk/chat/eliza.py deleted file mode 100644 index 5dfb4a4be2caa084c89a169f4861bd7a4b3eacf3..0000000000000000000000000000000000000000 --- a/pipeline/nltk/chat/eliza.py +++ /dev/null @@ -1,337 +0,0 @@ -# Natural Language Toolkit: Eliza -# -# Copyright (C) 2001-2023 NLTK Project -# Authors: Steven Bird -# Edward Loper -# URL: -# For license information, see LICENSE.TXT - -# 
Based on an Eliza implementation by Joe Strout , -# Jeff Epler and Jez Higgins . - -# a translation table used to convert things you say into things the -# computer says back, e.g. "I am" --> "you are" - -from nltk.chat.util import Chat, reflections - -# a table of response pairs, where each pair consists of a -# regular expression, and a list of possible responses, -# with group-macros labelled as %1, %2. - -pairs = ( - ( - r"I need (.*)", - ( - "Why do you need %1?", - "Would it really help you to get %1?", - "Are you sure you need %1?", - ), - ), - ( - r"Why don\'t you (.*)", - ( - "Do you really think I don't %1?", - "Perhaps eventually I will %1.", - "Do you really want me to %1?", - ), - ), - ( - r"Why can\'t I (.*)", - ( - "Do you think you should be able to %1?", - "If you could %1, what would you do?", - "I don't know -- why can't you %1?", - "Have you really tried?", - ), - ), - ( - r"I can\'t (.*)", - ( - "How do you know you can't %1?", - "Perhaps you could %1 if you tried.", - "What would it take for you to %1?", - ), - ), - ( - r"I am (.*)", - ( - "Did you come to me because you are %1?", - "How long have you been %1?", - "How do you feel about being %1?", - ), - ), - ( - r"I\'m (.*)", - ( - "How does being %1 make you feel?", - "Do you enjoy being %1?", - "Why do you tell me you're %1?", - "Why do you think you're %1?", - ), - ), - ( - r"Are you (.*)", - ( - "Why does it matter whether I am %1?", - "Would you prefer it if I were not %1?", - "Perhaps you believe I am %1.", - "I may be %1 -- what do you think?", - ), - ), - ( - r"What (.*)", - ( - "Why do you ask?", - "How would an answer to that help you?", - "What do you think?", - ), - ), - ( - r"How (.*)", - ( - "How do you suppose?", - "Perhaps you can answer your own question.", - "What is it you're really asking?", - ), - ), - ( - r"Because (.*)", - ( - "Is that the real reason?", - "What other reasons come to mind?", - "Does that reason apply to anything else?", - "If %1, what else must be true?", - ), - ), - ( - r"(.*) sorry (.*)", - ( - "There are many times when no apology is needed.", - "What feelings do you have when you apologize?", - ), - ), - ( - r"Hello(.*)", - ( - "Hello... I'm glad you could drop by today.", - "Hi there... 
how are you today?", - "Hello, how are you feeling today?", - ), - ), - ( - r"I think (.*)", - ("Do you doubt %1?", "Do you really think so?", "But you're not sure %1?"), - ), - ( - r"(.*) friend (.*)", - ( - "Tell me more about your friends.", - "When you think of a friend, what comes to mind?", - "Why don't you tell me about a childhood friend?", - ), - ), - (r"Yes", ("You seem quite sure.", "OK, but can you elaborate a bit?")), - ( - r"(.*) computer(.*)", - ( - "Are you really talking about me?", - "Does it seem strange to talk to a computer?", - "How do computers make you feel?", - "Do you feel threatened by computers?", - ), - ), - ( - r"Is it (.*)", - ( - "Do you think it is %1?", - "Perhaps it's %1 -- what do you think?", - "If it were %1, what would you do?", - "It could well be that %1.", - ), - ), - ( - r"It is (.*)", - ( - "You seem very certain.", - "If I told you that it probably isn't %1, what would you feel?", - ), - ), - ( - r"Can you (.*)", - ( - "What makes you think I can't %1?", - "If I could %1, then what?", - "Why do you ask if I can %1?", - ), - ), - ( - r"Can I (.*)", - ( - "Perhaps you don't want to %1.", - "Do you want to be able to %1?", - "If you could %1, would you?", - ), - ), - ( - r"You are (.*)", - ( - "Why do you think I am %1?", - "Does it please you to think that I'm %1?", - "Perhaps you would like me to be %1.", - "Perhaps you're really talking about yourself?", - ), - ), - ( - r"You\'re (.*)", - ( - "Why do you say I am %1?", - "Why do you think I am %1?", - "Are we talking about you, or me?", - ), - ), - ( - r"I don\'t (.*)", - ("Don't you really %1?", "Why don't you %1?", "Do you want to %1?"), - ), - ( - r"I feel (.*)", - ( - "Good, tell me more about these feelings.", - "Do you often feel %1?", - "When do you usually feel %1?", - "When you feel %1, what do you do?", - ), - ), - ( - r"I have (.*)", - ( - "Why do you tell me that you've %1?", - "Have you really %1?", - "Now that you have %1, what will you do next?", - ), - ), - ( - r"I would (.*)", - ( - "Could you explain why you would %1?", - "Why would you %1?", - "Who else knows that you would %1?", - ), - ), - ( - r"Is there (.*)", - ( - "Do you think there is %1?", - "It's likely that there is %1.", - "Would you like there to be %1?", - ), - ), - ( - r"My (.*)", - ( - "I see, your %1.", - "Why do you say that your %1?", - "When your %1, how do you feel?", - ), - ), - ( - r"You (.*)", - ( - "We should be discussing you, not me.", - "Why do you say that about me?", - "Why do you care whether I %1?", - ), - ), - (r"Why (.*)", ("Why don't you tell me the reason why %1?", "Why do you think %1?")), - ( - r"I want (.*)", - ( - "What would it mean to you if you got %1?", - "Why do you want %1?", - "What would you do if you got %1?", - "If you got %1, then what would you do?", - ), - ), - ( - r"(.*) mother(.*)", - ( - "Tell me more about your mother.", - "What was your relationship with your mother like?", - "How do you feel about your mother?", - "How does this relate to your feelings today?", - "Good family relations are important.", - ), - ), - ( - r"(.*) father(.*)", - ( - "Tell me more about your father.", - "How did your father make you feel?", - "How do you feel about your father?", - "Does your relationship with your father relate to your feelings today?", - "Do you have trouble showing affection with your family?", - ), - ), - ( - r"(.*) child(.*)", - ( - "Did you have close friends as a child?", - "What is your favorite childhood memory?", - "Do you remember any dreams or nightmares from 
childhood?", - "Did the other children sometimes tease you?", - "How do you think your childhood experiences relate to your feelings today?", - ), - ), - ( - r"(.*)\?", - ( - "Why do you ask that?", - "Please consider whether you can answer your own question.", - "Perhaps the answer lies within yourself?", - "Why don't you tell me?", - ), - ), - ( - r"quit", - ( - "Thank you for talking with me.", - "Good-bye.", - "Thank you, that will be $150. Have a good day!", - ), - ), - ( - r"(.*)", - ( - "Please tell me more.", - "Let's change focus a bit... Tell me about your family.", - "Can you elaborate on that?", - "Why do you say that %1?", - "I see.", - "Very interesting.", - "%1.", - "I see. And what does that tell you?", - "How does that make you feel?", - "How do you feel when you say that?", - ), - ), -) - -eliza_chatbot = Chat(pairs, reflections) - - -def eliza_chat(): - print("Therapist\n---------") - print("Talk to the program by typing in plain English, using normal upper-") - print('and lower-case letters and punctuation. Enter "quit" when done.') - print("=" * 72) - print("Hello. How are you feeling today?") - - eliza_chatbot.converse() - - -def demo(): - eliza_chat() - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/chat/iesha.py b/pipeline/nltk/chat/iesha.py deleted file mode 100644 index 552870caa30927f30b96c5dbdfd2ccb459cf48a8..0000000000000000000000000000000000000000 --- a/pipeline/nltk/chat/iesha.py +++ /dev/null @@ -1,160 +0,0 @@ -# Natural Language Toolkit: Teen Chatbot -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Selina Dennis -# URL: -# For license information, see LICENSE.TXT - -""" -This chatbot is a tongue-in-cheek take on the average teen -anime junky that frequents YahooMessenger or MSNM. -All spelling mistakes and flawed grammar are intentional. -""" - -from nltk.chat.util import Chat - -reflections = { - "am": "r", - "was": "were", - "i": "u", - "i'd": "u'd", - "i've": "u'v", - "ive": "u'v", - "i'll": "u'll", - "my": "ur", - "are": "am", - "you're": "im", - "you've": "ive", - "you'll": "i'll", - "your": "my", - "yours": "mine", - "you": "me", - "u": "me", - "ur": "my", - "urs": "mine", - "me": "u", -} - -# Note: %1/2/etc are used without spaces prior as the chat bot seems -# to add a superfluous space when matching. - -pairs = ( - ( - r"I\'m (.*)", - ( - "ur%1?? that's so cool! kekekekeke ^_^ tell me more!", - "ur%1? neat!! kekeke >_<", - ), - ), - ( - r"(.*) don\'t you (.*)", - ( - r"u think I can%2??! really?? kekeke \<_\<", - "what do u mean%2??!", - "i could if i wanted, don't you think!! kekeke", - ), - ), - (r"ye[as] [iI] (.*)", ("u%1? cool!! how?", "how come u%1??", "u%1? so do i!!")), - ( - r"do (you|u) (.*)\??", - ("do i%2? only on tuesdays! kekeke *_*", "i dunno! do u%2??"), - ), - ( - r"(.*)\?", - ( - "man u ask lots of questions!", - "booooring! how old r u??", - "boooooring!! ur not very fun", - ), - ), - ( - r"(cos|because) (.*)", - ("hee! i don't believe u! >_<", "nuh-uh! >_<", "ooooh i agree!"), - ), - ( - r"why can\'t [iI] (.*)", - ( - "i dunno! y u askin me for!", - "try harder, silly! hee! ^_^", - "i dunno! but when i can't%1 i jump up and down!", - ), - ), - ( - r"I can\'t (.*)", - ( - "u can't what??! >_<", - "that's ok! i can't%1 either! kekekekeke ^_^", - "try harder, silly! hee! ^&^", - ), - ), - ( - r"(.*) (like|love|watch) anime", - ( - "omg i love anime!! do u like sailor moon??! ^&^", - "anime yay! anime rocks sooooo much!", - "oooh anime! i love anime more than anything!", - "anime is the bestest evar! 
evangelion is the best!", - "hee anime is the best! do you have ur fav??", - ), - ), - ( - r"I (like|love|watch|play) (.*)", - ("yay! %2 rocks!", "yay! %2 is neat!", "cool! do u like other stuff?? ^_^"), - ), - ( - r"anime sucks|(.*) (hate|detest) anime", - ( - "ur a liar! i'm not gonna talk to u nemore if u h8 anime *;*", - "no way! anime is the best ever!", - "nuh-uh, anime is the best!", - ), - ), - ( - r"(are|r) (you|u) (.*)", - ("am i%1??! how come u ask that!", "maybe! y shud i tell u?? kekeke >_>"), - ), - ( - r"what (.*)", - ("hee u think im gonna tell u? .v.", "booooooooring! ask me somethin else!"), - ), - (r"how (.*)", ("not tellin!! kekekekekeke ^_^",)), - (r"(hi|hello|hey) (.*)", ("hi!!! how r u!!",)), - ( - r"quit", - ( - "mom says i have to go eat dinner now :,( bye!!", - "awww u have to go?? see u next time!!", - "how to see u again soon! ^_^", - ), - ), - ( - r"(.*)", - ( - "ur funny! kekeke", - "boooooring! talk about something else! tell me wat u like!", - "do u like anime??", - "do u watch anime? i like sailor moon! ^_^", - "i wish i was a kitty!! kekekeke ^_^", - ), - ), -) - -iesha_chatbot = Chat(pairs, reflections) - - -def iesha_chat(): - print("Iesha the TeenBoT\n---------") - print("Talk to the program by typing in plain English, using normal upper-") - print('and lower-case letters and punctuation. Enter "quit" when done.') - print("=" * 72) - print("hi!! i'm iesha! who r u??!") - - iesha_chatbot.converse() - - -def demo(): - iesha_chat() - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/chat/rude.py b/pipeline/nltk/chat/rude.py deleted file mode 100644 index 77404e42bc4d4c9c279540a7bac18fa47d78b9cc..0000000000000000000000000000000000000000 --- a/pipeline/nltk/chat/rude.py +++ /dev/null @@ -1,125 +0,0 @@ -# Natural Language Toolkit: Rude Chatbot -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Peter Spiller -# URL: -# For license information, see LICENSE.TXT - -from nltk.chat.util import Chat, reflections - -pairs = ( - ( - r"We (.*)", - ( - "What do you mean, 'we'?", - "Don't include me in that!", - "I wouldn't be so sure about that.", - ), - ), - ( - r"You should (.*)", - ("Don't tell me what to do, buddy.", "Really? I should, should I?"), - ), - ( - r"You\'re(.*)", - ( - "More like YOU'RE %1!", - "Hah! Look who's talking.", - "Come over here and tell me I'm %1.", - ), - ), - ( - r"You are(.*)", - ( - "More like YOU'RE %1!", - "Hah! Look who's talking.", - "Come over here and tell me I'm %1.", - ), - ), - ( - r"I can\'t(.*)", - ( - "You do sound like the type who can't %1.", - "Hear that splashing sound? That's my heart bleeding for you.", - "Tell somebody who might actually care.", - ), - ), - ( - r"I think (.*)", - ( - "I wouldn't think too hard if I were you.", - "You actually think? I'd never have guessed...", - ), - ), - ( - r"I (.*)", - ( - "I'm getting a bit tired of hearing about you.", - "How about we talk about me instead?", - "Me, me, me... Frankly, I don't care.", - ), - ), - ( - r"How (.*)", - ( - "How do you think?", - "Take a wild guess.", - "I'm not even going to dignify that with an answer.", - ), - ), - (r"What (.*)", ("Do I look like an encyclopedia?", "Figure it out yourself.")), - ( - r"Why (.*)", - ( - "Why not?", - "That's so obvious I thought even you'd have already figured it out.", - ), - ), - ( - r"(.*)shut up(.*)", - ( - "Make me.", - "Getting angry at a feeble NLP assignment? 
Somebody's losing it.", - "Say that again, I dare you.", - ), - ), - ( - r"Shut up(.*)", - ( - "Make me.", - "Getting angry at a feeble NLP assignment? Somebody's losing it.", - "Say that again, I dare you.", - ), - ), - ( - r"Hello(.*)", - ("Oh good, somebody else to talk to. Joy.", "'Hello'? How original..."), - ), - ( - r"(.*)", - ( - "I'm getting bored here. Become more interesting.", - "Either become more thrilling or get lost, buddy.", - "Change the subject before I die of fatal boredom.", - ), - ), -) - -rude_chatbot = Chat(pairs, reflections) - - -def rude_chat(): - print("Talk to the program by typing in plain English, using normal upper-") - print('and lower-case letters and punctuation. Enter "quit" when done.') - print("=" * 72) - print("I suppose I should say hello.") - - rude_chatbot.converse() - - -def demo(): - rude_chat() - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/chat/suntsu.py b/pipeline/nltk/chat/suntsu.py deleted file mode 100644 index 2130c7da1d630a2d8f78412d4b02d518d540af9f..0000000000000000000000000000000000000000 --- a/pipeline/nltk/chat/suntsu.py +++ /dev/null @@ -1,140 +0,0 @@ -# Natural Language Toolkit: Sun Tsu-Bot -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Sam Huston 2007 -# URL: -# For license information, see LICENSE.TXT - -""" -Tsu bot responds to all queries with a Sun Tsu sayings - -Quoted from Sun Tsu's The Art of War -Translated by LIONEL GILES, M.A. 1910 -Hosted by the Gutenberg Project -https://www.gutenberg.org/ -""" - -from nltk.chat.util import Chat, reflections - -pairs = ( - (r"quit", ("Good-bye.", "Plan well", "May victory be your future")), - ( - r"[^\?]*\?", - ( - "Please consider whether you can answer your own question.", - "Ask me no questions!", - ), - ), - ( - r"[0-9]+(.*)", - ( - "It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.", - "There are five essentials for victory", - ), - ), - ( - r"[A-Ca-c](.*)", - ( - "The art of war is of vital importance to the State.", - "All warfare is based on deception.", - "If your opponent is secure at all points, be prepared for him. 
If he is in superior strength, evade him.", - "If the campaign is protracted, the resources of the State will not be equal to the strain.", - "Attack him where he is unprepared, appear where you are not expected.", - "There is no instance of a country having benefited from prolonged warfare.", - ), - ), - ( - r"[D-Fd-f](.*)", - ( - "The skillful soldier does not raise a second levy, neither are his supply-wagons loaded more than twice.", - "Bring war material with you from home, but forage on the enemy.", - "In war, then, let your great object be victory, not lengthy campaigns.", - "To fight and conquer in all your battles is not supreme excellence; supreme excellence consists in breaking the enemy's resistance without fighting.", - ), - ), - ( - r"[G-Ig-i](.*)", - ( - "Heaven signifies night and day, cold and heat, times and seasons.", - "It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.", - "The good fighters of old first put themselves beyond the possibility of defeat, and then waited for an opportunity of defeating the enemy.", - "One may know how to conquer without being able to do it.", - ), - ), - ( - r"[J-Lj-l](.*)", - ( - "There are three ways in which a ruler can bring misfortune upon his army.", - "By commanding the army to advance or to retreat, being ignorant of the fact that it cannot obey. This is called hobbling the army.", - "By attempting to govern an army in the same way as he administers a kingdom, being ignorant of the conditions which obtain in an army. This causes restlessness in the soldier's minds.", - "By employing the officers of his army without discrimination, through ignorance of the military principle of adaptation to circumstances. This shakes the confidence of the soldiers.", - "There are five essentials for victory", - "He will win who knows when to fight and when not to fight.", - "He will win who knows how to handle both superior and inferior forces.", - "He will win whose army is animated by the same spirit throughout all its ranks.", - "He will win who, prepared himself, waits to take the enemy unprepared.", - "He will win who has military capacity and is not interfered with by the sovereign.", - ), - ), - ( - r"[M-Om-o](.*)", - ( - "If you know the enemy and know yourself, you need not fear the result of a hundred battles.", - "If you know yourself but not the enemy, for every victory gained you will also suffer a defeat.", - "If you know neither the enemy nor yourself, you will succumb in every battle.", - "The control of a large force is the same principle as the control of a few men: it is merely a question of dividing up their numbers.", - ), - ), - ( - r"[P-Rp-r](.*)", - ( - "Security against defeat implies defensive tactics; ability to defeat the enemy means taking the offensive.", - "Standing on the defensive indicates insufficient strength; attacking, a superabundance of strength.", - "He wins his battles by making no mistakes. 
Making no mistakes is what establishes the certainty of victory, for it means conquering an enemy that is already defeated.", - "A victorious army opposed to a routed one, is as a pound's weight placed in the scale against a single grain.", - "The onrush of a conquering force is like the bursting of pent-up waters into a chasm a thousand fathoms deep.", - ), - ), - ( - r"[S-Us-u](.*)", - ( - "What the ancients called a clever fighter is one who not only wins, but excels in winning with ease.", - "Hence his victories bring him neither reputation for wisdom nor credit for courage.", - "Hence the skillful fighter puts himself into a position which makes defeat impossible, and does not miss the moment for defeating the enemy.", - "In war the victorious strategist only seeks battle after the victory has been won, whereas he who is destined to defeat first fights and afterwards looks for victory.", - "There are not more than five musical notes, yet the combinations of these five give rise to more melodies than can ever be heard.", - "Appear at points which the enemy must hasten to defend; march swiftly to places where you are not expected.", - ), - ), - ( - r"[V-Zv-z](.*)", - ( - "It is a matter of life and death, a road either to safety or to ruin.", - "Hold out baits to entice the enemy. Feign disorder, and crush him.", - "All men can see the tactics whereby I conquer, but what none can see is the strategy out of which victory is evolved.", - "Do not repeat the tactics which have gained you one victory, but let your methods be regulated by the infinite variety of circumstances.", - "So in war, the way is to avoid what is strong and to strike at what is weak.", - "Just as water retains no constant shape, so in warfare there are no constant conditions.", - ), - ), - (r"(.*)", ("Your statement insults me.", "")), -) - -suntsu_chatbot = Chat(pairs, reflections) - - -def suntsu_chat(): - print("Talk to the program by typing in plain English, using normal upper-") - print('and lower-case letters and punctuation. Enter "quit" when done.') - print("=" * 72) - print("You seek enlightenment?") - - suntsu_chatbot.converse() - - -def demo(): - suntsu_chat() - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/chat/util.py b/pipeline/nltk/chat/util.py deleted file mode 100644 index ddcb246ce3b74a15cd4c87bb180811553849af1b..0000000000000000000000000000000000000000 --- a/pipeline/nltk/chat/util.py +++ /dev/null @@ -1,124 +0,0 @@ -# Natural Language Toolkit: Chatbot Utilities -# -# Copyright (C) 2001-2023 NLTK Project -# Authors: Steven Bird -# URL: -# For license information, see LICENSE.TXT - -# Based on an Eliza implementation by Joe Strout , -# Jeff Epler and Jez Higgins . - -import random -import re - -reflections = { - "i am": "you are", - "i was": "you were", - "i": "you", - "i'm": "you are", - "i'd": "you would", - "i've": "you have", - "i'll": "you will", - "my": "your", - "you are": "I am", - "you were": "I was", - "you've": "I have", - "you'll": "I will", - "your": "my", - "yours": "mine", - "you": "me", - "me": "you", -} - - -class Chat: - def __init__(self, pairs, reflections={}): - """ - Initialize the chatbot. Pairs is a list of patterns and responses. Each - pattern is a regular expression matching the user's statement or question, - e.g. r'I like (.*)'. For each such pattern a list of possible responses - is given, e.g. ['Why do you like %1', 'Did you ever dislike %1']. Material - which is matched by parenthesized sections of the patterns (e.g. 
.*) is mapped to - the numbered positions in the responses, e.g. %1. - - :type pairs: list of tuple - :param pairs: The patterns and responses - :type reflections: dict - :param reflections: A mapping between first and second person expressions - :rtype: None - """ - - self._pairs = [(re.compile(x, re.IGNORECASE), y) for (x, y) in pairs] - self._reflections = reflections - self._regex = self._compile_reflections() - - def _compile_reflections(self): - sorted_refl = sorted(self._reflections, key=len, reverse=True) - return re.compile( - r"\b({})\b".format("|".join(map(re.escape, sorted_refl))), re.IGNORECASE - ) - - def _substitute(self, str): - """ - Substitute words in the string, according to the specified reflections, - e.g. "I'm" -> "you are" - - :type str: str - :param str: The string to be mapped - :rtype: str - """ - - return self._regex.sub( - lambda mo: self._reflections[mo.string[mo.start() : mo.end()]], str.lower() - ) - - def _wildcards(self, response, match): - pos = response.find("%") - while pos >= 0: - num = int(response[pos + 1 : pos + 2]) - response = ( - response[:pos] - + self._substitute(match.group(num)) - + response[pos + 2 :] - ) - pos = response.find("%") - return response - - def respond(self, str): - """ - Generate a response to the user input. - - :type str: str - :param str: The string to be mapped - :rtype: str - """ - - # check each pattern - for (pattern, response) in self._pairs: - match = pattern.match(str) - - # did the pattern match? - if match: - resp = random.choice(response) # pick a random response - resp = self._wildcards(resp, match) # process wildcards - - # fix munged punctuation at the end - if resp[-2:] == "?.": - resp = resp[:-2] + "." - if resp[-2:] == "??": - resp = resp[:-2] + "?" - return resp - - # Hold a conversation with a chatbot - def converse(self, quit="quit"): - user_input = "" - while user_input != quit: - user_input = quit - try: - user_input = input(">") - except EOFError: - print(user_input) - if user_input: - while user_input[-1] in "!.": - user_input = user_input[:-1] - print(self.respond(user_input)) diff --git a/pipeline/nltk/chat/zen.py b/pipeline/nltk/chat/zen.py deleted file mode 100644 index 5972d47764056b388fca1a24acb88bd87369274f..0000000000000000000000000000000000000000 --- a/pipeline/nltk/chat/zen.py +++ /dev/null @@ -1,329 +0,0 @@ -# Natural Language Toolkit: Zen Chatbot -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Amy Holland -# URL: -# For license information, see LICENSE.TXT - -""" -Zen Chatbot talks in gems of Zen wisdom. - -This is a sample conversation with Zen Chatbot: -ZC: Welcome, my child. -me: Good afternoon. -ZC: Ask the question you have come to ask. -me: How can I achieve enlightenment? -ZC: How do you suppose? -me: Through meditation. -ZC: Form is emptiness, and emptiness form. -me: How can I empty my mind of worldly troubles? -ZC: Will an answer to that really help in your search for enlightenment? -me: Yes. -ZC: It is better to be right than to be certain. -me: I seek truth and wisdom. -ZC: The search for truth is a long journey. -me: Are you sure? -ZC: Maybe sure, maybe not sure. - - -The chatbot structure is based on that of chat.eliza. Thus, it uses -a translation table to convert from question to response -i.e. "I am" --> "you are" - -Of course, since Zen Chatbot does not understand the meaning of any words, -responses are very limited. Zen Chatbot will usually answer very vaguely, or -respond to a question by asking a different question, in much the same way -as Eliza. 
-""" - -from nltk.chat.util import Chat, reflections - -# responses are matched top to bottom, so non-specific matches occur later -# for each match, a list of possible responses is provided -responses = ( - # Zen Chatbot opens with the line "Welcome, my child." The usual - # response will be a greeting problem: 'good' matches "good morning", - # "good day" etc, but also "good grief!" and other sentences starting - # with the word 'good' that may not be a greeting - ( - r"(hello(.*))|(good [a-zA-Z]+)", - ( - "The path to enlightenment is often difficult to see.", - "Greetings. I sense your mind is troubled. Tell me of your troubles.", - "Ask the question you have come to ask.", - "Hello. Do you seek englightenment?", - ), - ), - # "I need" and "I want" can be followed by a thing (eg 'help') - # or an action (eg 'to see you') - # - # This is a problem with this style of response - - # person: "I need you" - # chatbot: "me can be achieved by hard work and dedication of the mind" - # i.e. 'you' is not really a thing that can be mapped this way, so this - # interpretation only makes sense for some inputs - # - ( - r"i need (.*)", - ( - "%1 can be achieved by hard work and dedication of the mind.", - "%1 is not a need, but a desire of the mind. Clear your mind of such concerns.", - "Focus your mind on%1, and you will find what you need.", - ), - ), - ( - r"i want (.*)", - ( - "Desires of the heart will distract you from the path to enlightenment.", - "Will%1 help you attain enlightenment?", - "Is%1 a desire of the mind, or of the heart?", - ), - ), - # why questions are separated into three types: - # "why..I" e.g. "why am I here?" "Why do I like cake?" - # "why..you" e.g. "why are you here?" "Why won't you tell me?" - # "why..." e.g. "Why is the sky blue?" - # problems: - # person: "Why can't you tell me?" - # chatbot: "Are you sure I tell you?" - # - this style works for positives (e.g. "why do you like cake?") - # but does not work for negatives (e.g. "why don't you like cake?") - (r"why (.*) i (.*)\?", ("You%1%2?", "Perhaps you only think you%1%2")), - (r"why (.*) you(.*)\?", ("Why%1 you%2?", "%2 I%1", "Are you sure I%2?")), - (r"why (.*)\?", ("I cannot tell you why%1.", "Why do you think %1?")), - # e.g. "are you listening?", "are you a duck" - ( - r"are you (.*)\?", - ("Maybe%1, maybe not%1.", "Whether I am%1 or not is God's business."), - ), - # e.g. "am I a duck?", "am I going to die?" - ( - r"am i (.*)\?", - ("Perhaps%1, perhaps not%1.", "Whether you are%1 or not is not for me to say."), - ), - # what questions, e.g. "what time is it?" - # problems: - # person: "What do you want?" - # chatbot: "Seek truth, not what do me want." - (r"what (.*)\?", ("Seek truth, not what%1.", "What%1 should not concern you.")), - # how questions, e.g. "how do you do?" - ( - r"how (.*)\?", - ( - "How do you suppose?", - "Will an answer to that really help in your search for enlightenment?", - "Ask yourself not how, but why.", - ), - ), - # can questions, e.g. "can you run?", "can you come over here please?" - ( - r"can you (.*)\?", - ( - "I probably can, but I may not.", - "Maybe I can%1, and maybe I cannot.", - "I can do all, and I can do nothing.", - ), - ), - # can questions, e.g. "can I have some cake?", "can I know truth?" - ( - r"can i (.*)\?", - ( - "You can%1 if you believe you can%1, and have a pure spirit.", - "Seek truth and you will know if you can%1.", - ), - ), - # e.g. 
"It is raining" - implies the speaker is certain of a fact - ( - r"it is (.*)", - ( - "How can you be certain that%1, when you do not even know yourself?", - "Whether it is%1 or not does not change the way the world is.", - ), - ), - # e.g. "is there a doctor in the house?" - ( - r"is there (.*)\?", - ("There is%1 if you believe there is.", "It is possible that there is%1."), - ), - # e.g. "is it possible?", "is this true?" - (r"is(.*)\?", ("%1 is not relevant.", "Does this matter?")), - # non-specific question - ( - r"(.*)\?", - ( - "Do you think %1?", - "You seek the truth. Does the truth seek you?", - "If you intentionally pursue the answers to your questions, the answers become hard to see.", - "The answer to your question cannot be told. It must be experienced.", - ), - ), - # expression of hate of form "I hate you" or "Kelly hates cheese" - ( - r"(.*) (hate[s]?)|(dislike[s]?)|(don\'t like)(.*)", - ( - "Perhaps it is not about hating %2, but about hate from within.", - "Weeds only grow when we dislike them", - "Hate is a very strong emotion.", - ), - ), - # statement containing the word 'truth' - ( - r"(.*) truth(.*)", - ( - "Seek truth, and truth will seek you.", - "Remember, it is not the spoon which bends - only yourself.", - "The search for truth is a long journey.", - ), - ), - # desire to do an action - # e.g. "I want to go shopping" - ( - r"i want to (.*)", - ("You may %1 if your heart truly desires to.", "You may have to %1."), - ), - # desire for an object - # e.g. "I want a pony" - ( - r"i want (.*)", - ( - "Does your heart truly desire %1?", - "Is this a desire of the heart, or of the mind?", - ), - ), - # e.g. "I can't wait" or "I can't do this" - ( - r"i can\'t (.*)", - ( - "What we can and can't do is a limitation of the mind.", - "There are limitations of the body, and limitations of the mind.", - "Have you tried to%1 with a clear mind?", - ), - ), - # "I think.." indicates uncertainty. e.g. "I think so." - # problem: exceptions... - # e.g. "I think, therefore I am" - ( - r"i think (.*)", - ( - "Uncertainty in an uncertain world.", - "Indeed, how can we be certain of anything in such uncertain times.", - "Are you not, in fact, certain that%1?", - ), - ), - # "I feel...emotions/sick/light-headed..." - ( - r"i feel (.*)", - ( - "Your body and your emotions are both symptoms of your mind." - "What do you believe is the root of such feelings?", - "Feeling%1 can be a sign of your state-of-mind.", - ), - ), - # exclaimation mark indicating emotion - # e.g. "Wow!" or "No!" - ( - r"(.*)!", - ( - "I sense that you are feeling emotional today.", - "You need to calm your emotions.", - ), - ), - # because [statement] - # e.g. "because I said so" - ( - r"because (.*)", - ( - "Does knowning the reasons behind things help you to understand" - " the things themselves?", - "If%1, what else must be true?", - ), - ), - # yes or no - raise an issue of certainty/correctness - ( - r"(yes)|(no)", - ( - "Is there certainty in an uncertain world?", - "It is better to be right than to be certain.", - ), - ), - # sentence containing word 'love' - ( - r"(.*)love(.*)", - ( - "Think of the trees: they let the birds perch and fly with no intention to call them when they come, and no longing for their return when they fly away. 
Let your heart be like the trees.", - "Free love!", - ), - ), - # sentence containing word 'understand' - r - ( - r"(.*)understand(.*)", - ( - "If you understand, things are just as they are;" - " if you do not understand, things are just as they are.", - "Imagination is more important than knowledge.", - ), - ), - # 'I', 'me', 'my' - person is talking about themself. - # this breaks down when words contain these - eg 'Thyme', 'Irish' - ( - r"(.*)(me )|( me)|(my)|(mine)|(i)(.*)", - ( - "'I', 'me', 'my'... these are selfish expressions.", - "Have you ever considered that you might be a selfish person?", - "Try to consider others, not just yourself.", - "Think not just of yourself, but of others.", - ), - ), - # 'you' starting a sentence - # e.g. "you stink!" - ( - r"you (.*)", - ("My path is not of concern to you.", "I am but one, and you but one more."), - ), - # say goodbye with some extra Zen wisdom. - ( - r"exit", - ( - "Farewell. The obstacle is the path.", - "Farewell. Life is a journey, not a destination.", - "Good bye. We are cups, constantly and quietly being filled." - "\nThe trick is knowning how to tip ourselves over and let the beautiful stuff out.", - ), - ), - # fall through case - - # when stumped, respond with generic zen wisdom - # - ( - r"(.*)", - ( - "When you're enlightened, every word is wisdom.", - "Random talk is useless.", - "The reverse side also has a reverse side.", - "Form is emptiness, and emptiness is form.", - "I pour out a cup of water. Is the cup empty?", - ), - ), -) - -zen_chatbot = Chat(responses, reflections) - - -def zen_chat(): - print("*" * 75) - print("Zen Chatbot!".center(75)) - print("*" * 75) - print('"Look beyond mere words and letters - look into your mind"'.center(75)) - print("* Talk your way to truth with Zen Chatbot.") - print("* Type 'quit' when you have had enough.") - print("*" * 75) - print("Welcome, my child.") - - zen_chatbot.converse() - - -def demo(): - zen_chat() - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/chunk/__init__.py b/pipeline/nltk/chunk/__init__.py deleted file mode 100644 index 208da9f5678f4b79282d5e6886502627ab9161ab..0000000000000000000000000000000000000000 --- a/pipeline/nltk/chunk/__init__.py +++ /dev/null @@ -1,197 +0,0 @@ -# Natural Language Toolkit: Chunkers -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# Edward Loper -# URL: -# For license information, see LICENSE.TXT -# - -""" -Classes and interfaces for identifying non-overlapping linguistic -groups (such as base noun phrases) in unrestricted text. This task is -called "chunk parsing" or "chunking", and the identified groups are -called "chunks". The chunked text is represented using a shallow -tree called a "chunk structure." A chunk structure is a tree -containing tokens and chunks, where each chunk is a subtree containing -only tokens. For example, the chunk structure for base noun phrase -chunks in the sentence "I saw the big dog on the hill" is:: - - (SENTENCE: - (NP: ) - - (NP: ) - - (NP: )) - -To convert a chunk structure back to a list of tokens, simply use the -chunk structure's ``leaves()`` method. - -This module defines ``ChunkParserI``, a standard interface for -chunking texts; and ``RegexpChunkParser``, a regular-expression based -implementation of that interface. It also defines ``ChunkScore``, a -utility class for scoring chunk parsers. 
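The interfaces summarized here are most easily exercised through ``nltk.RegexpParser``, which builds a ``RegexpChunkParser`` from a grammar of tag patterns. An illustrative sketch (standard NLTK usage; assumes the upstream ``nltk`` package is installed):

import nltk

grammar = r"NP: {<DT>?<JJ>*<NN>}"  # chunk an optional determiner, adjectives and a noun
chunker = nltk.RegexpParser(grammar)
sentence = [("the", "DT"), ("little", "JJ"), ("cat", "NN"),
            ("sat", "VBD"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]
print(chunker.parse(sentence))
# roughly: (S (NP the/DT little/JJ cat/NN) sat/VBD on/IN (NP the/DT mat/NN))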
- -RegexpChunkParser -================= - -``RegexpChunkParser`` is an implementation of the chunk parser interface -that uses regular-expressions over tags to chunk a text. Its -``parse()`` method first constructs a ``ChunkString``, which encodes a -particular chunking of the input text. Initially, nothing is -chunked. ``parse.RegexpChunkParser`` then applies a sequence of -``RegexpChunkRule`` rules to the ``ChunkString``, each of which modifies -the chunking that it encodes. Finally, the ``ChunkString`` is -transformed back into a chunk structure, which is returned. - -``RegexpChunkParser`` can only be used to chunk a single kind of phrase. -For example, you can use an ``RegexpChunkParser`` to chunk the noun -phrases in a text, or the verb phrases in a text; but you can not -use it to simultaneously chunk both noun phrases and verb phrases in -the same text. (This is a limitation of ``RegexpChunkParser``, not of -chunk parsers in general.) - -RegexpChunkRules ----------------- - -A ``RegexpChunkRule`` is a transformational rule that updates the -chunking of a text by modifying its ``ChunkString``. Each -``RegexpChunkRule`` defines the ``apply()`` method, which modifies -the chunking encoded by a ``ChunkString``. The -``RegexpChunkRule`` class itself can be used to implement any -transformational rule based on regular expressions. There are -also a number of subclasses, which can be used to implement -simpler types of rules: - - - ``ChunkRule`` chunks anything that matches a given regular - expression. - - ``StripRule`` strips anything that matches a given regular - expression. - - ``UnChunkRule`` will un-chunk any chunk that matches a given - regular expression. - - ``MergeRule`` can be used to merge two contiguous chunks. - - ``SplitRule`` can be used to split a single chunk into two - smaller chunks. - - ``ExpandLeftRule`` will expand a chunk to incorporate new - unchunked material on the left. - - ``ExpandRightRule`` will expand a chunk to incorporate new - unchunked material on the right. - -Tag Patterns -~~~~~~~~~~~~ - -A ``RegexpChunkRule`` uses a modified version of regular -expression patterns, called "tag patterns". Tag patterns are -used to match sequences of tags. Examples of tag patterns are:: - - r'(
<DT>|<JJ>|<NN>)+' - r'<NN>+' - r'<NN.*>' - -The differences between regular expression patterns and tag -patterns are: - - - In tag patterns, ``'<'`` and ``'>'`` act as parentheses; so - ``'<NN>+'`` matches one or more repetitions of ``'<NN>'``, not - ``'<NN'`` followed by one or more repetitions of ``'>'``. - - Whitespace in tag patterns is ignored. So - ``'<DT> | <NN>'`` is equivalent to ``'<DT>
    |'`` - - In tag patterns, ``'.'`` is equivalent to ``'[^{}<>]'``; so - ``''`` matches any single tag starting with ``'NN'``. - -The function ``tag_pattern2re_pattern`` can be used to transform -a tag pattern to an equivalent regular expression pattern. - -Efficiency ----------- - -Preliminary tests indicate that ``RegexpChunkParser`` can chunk at a -rate of about 300 tokens/second, with a moderately complex rule set. - -There may be problems if ``RegexpChunkParser`` is used with more than -5,000 tokens at a time. In particular, evaluation of some regular -expressions may cause the Python regular expression engine to -exceed its maximum recursion depth. We have attempted to minimize -these problems, but it is impossible to avoid them completely. We -therefore recommend that you apply the chunk parser to a single -sentence at a time. - -Emacs Tip ---------- - -If you evaluate the following elisp expression in emacs, it will -colorize a ``ChunkString`` when you use an interactive python shell -with emacs or xemacs ("C-c !"):: - - (let () - (defconst comint-mode-font-lock-keywords - '(("<[^>]+>" 0 'font-lock-reference-face) - ("[{}]" 0 'font-lock-function-name-face))) - (add-hook 'comint-mode-hook (lambda () (turn-on-font-lock)))) - -You can evaluate this code by copying it to a temporary buffer, -placing the cursor after the last close parenthesis, and typing -"``C-x C-e``". You should evaluate it before running the interactive -session. The change will last until you close emacs. - -Unresolved Issues ------------------ - -If we use the ``re`` module for regular expressions, Python's -regular expression engine generates "maximum recursion depth -exceeded" errors when processing very large texts, even for -regular expressions that should not require any recursion. We -therefore use the ``pre`` module instead. But note that ``pre`` -does not include Unicode support, so this module will not work -with unicode strings. Note also that ``pre`` regular expressions -are not quite as advanced as ``re`` ones (e.g., no leftward -zero-length assertions). - -:type CHUNK_TAG_PATTERN: regexp -:var CHUNK_TAG_PATTERN: A regular expression to test whether a tag - pattern is valid. -""" - -from nltk.chunk.api import ChunkParserI -from nltk.chunk.regexp import RegexpChunkParser, RegexpParser -from nltk.chunk.util import ( - ChunkScore, - accuracy, - conllstr2tree, - conlltags2tree, - ieerstr2tree, - tagstr2tree, - tree2conllstr, - tree2conlltags, -) -from nltk.data import load - -# Standard treebank POS tagger -_BINARY_NE_CHUNKER = "chunkers/maxent_ne_chunker/english_ace_binary.pickle" -_MULTICLASS_NE_CHUNKER = "chunkers/maxent_ne_chunker/english_ace_multiclass.pickle" - - -def ne_chunk(tagged_tokens, binary=False): - """ - Use NLTK's currently recommended named entity chunker to - chunk the given list of tagged tokens. - """ - if binary: - chunker_pickle = _BINARY_NE_CHUNKER - else: - chunker_pickle = _MULTICLASS_NE_CHUNKER - chunker = load(chunker_pickle) - return chunker.parse(tagged_tokens) - - -def ne_chunk_sents(tagged_sentences, binary=False): - """ - Use NLTK's currently recommended named entity chunker to chunk the - given list of tagged sentences, each consisting of a list of tagged tokens. 
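A sketch of the typical call path for the ``ne_chunk``/``ne_chunk_sents`` helpers defined in this module; it assumes the ``maxent_ne_chunker`` model and the other supporting NLTK data packages have been downloaded:

import nltk

tokens = nltk.pos_tag(nltk.word_tokenize("Mark works at Google in London."))
tree = nltk.ne_chunk(tokens)  # binary=False labels chunks PERSON/ORGANIZATION/GPE, etc.
print(tree)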
- """ - if binary: - chunker_pickle = _BINARY_NE_CHUNKER - else: - chunker_pickle = _MULTICLASS_NE_CHUNKER - chunker = load(chunker_pickle) - return chunker.parse_sents(tagged_sentences) diff --git a/pipeline/nltk/chunk/__pycache__/__init__.cpython-39.pyc b/pipeline/nltk/chunk/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index a6d17fa46476bd01577937d5bf8e0fb66d9478a4..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/chunk/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/chunk/__pycache__/api.cpython-39.pyc b/pipeline/nltk/chunk/__pycache__/api.cpython-39.pyc deleted file mode 100644 index 2b25c82f94aedf03c68924ef921c7db02998e4c5..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/chunk/__pycache__/api.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/chunk/__pycache__/named_entity.cpython-39.pyc b/pipeline/nltk/chunk/__pycache__/named_entity.cpython-39.pyc deleted file mode 100644 index 270bfd16a054468ad1bb6239b251cf8685845f28..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/chunk/__pycache__/named_entity.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/chunk/__pycache__/regexp.cpython-39.pyc b/pipeline/nltk/chunk/__pycache__/regexp.cpython-39.pyc deleted file mode 100644 index c5fdb143213f49060fbe4b87153309e4354ee88f..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/chunk/__pycache__/regexp.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/chunk/__pycache__/util.cpython-39.pyc b/pipeline/nltk/chunk/__pycache__/util.cpython-39.pyc deleted file mode 100644 index 629fe18c4ff52ae918e621c27867c904bba4a0f3..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/chunk/__pycache__/util.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/chunk/api.py b/pipeline/nltk/chunk/api.py deleted file mode 100644 index 858490a7abb82375fba271d98037e53da6a17129..0000000000000000000000000000000000000000 --- a/pipeline/nltk/chunk/api.py +++ /dev/null @@ -1,56 +0,0 @@ -# Natural Language Toolkit: Chunk parsing API -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird (minor additions) -# URL: -# For license information, see LICENSE.TXT - -##////////////////////////////////////////////////////// -## Chunk Parser Interface -##////////////////////////////////////////////////////// - -from nltk.chunk.util import ChunkScore -from nltk.internals import deprecated -from nltk.parse import ParserI - - -class ChunkParserI(ParserI): - """ - A processing interface for identifying non-overlapping groups in - unrestricted text. Typically, chunk parsers are used to find base - syntactic constituents, such as base noun phrases. Unlike - ``ParserI``, ``ChunkParserI`` guarantees that the ``parse()`` method - will always generate a parse. - """ - - def parse(self, tokens): - """ - Return the best chunk structure for the given tokens - and return a tree. - - :param tokens: The list of (word, tag) tokens to be chunked. - :type tokens: list(tuple) - :rtype: Tree - """ - raise NotImplementedError() - - @deprecated("Use accuracy(gold) instead.") - def evaluate(self, gold): - return self.accuracy(gold) - - def accuracy(self, gold): - """ - Score the accuracy of the chunker against the gold standard. - Remove the chunking the gold standard text, rechunk it using - the chunker, and return a ``ChunkScore`` object - reflecting the performance of this chunk parser. 
- - :type gold: list(Tree) - :param gold: The list of chunked sentences to score the chunker on. - :rtype: ChunkScore - """ - chunkscore = ChunkScore() - for correct in gold: - chunkscore.score(correct, self.parse(correct.leaves())) - return chunkscore diff --git a/pipeline/nltk/chunk/named_entity.py b/pipeline/nltk/chunk/named_entity.py deleted file mode 100644 index b8ab97742c9f0721a0bc1744703871ea278aba07..0000000000000000000000000000000000000000 --- a/pipeline/nltk/chunk/named_entity.py +++ /dev/null @@ -1,352 +0,0 @@ -# Natural Language Toolkit: Chunk parsing API -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# URL: -# For license information, see LICENSE.TXT - -""" -Named entity chunker -""" - -import os -import pickle -import re -from xml.etree import ElementTree as ET - -from nltk.tag import ClassifierBasedTagger, pos_tag - -try: - from nltk.classify import MaxentClassifier -except ImportError: - pass - -from nltk.chunk.api import ChunkParserI -from nltk.chunk.util import ChunkScore -from nltk.data import find -from nltk.tokenize import word_tokenize -from nltk.tree import Tree - - -class NEChunkParserTagger(ClassifierBasedTagger): - """ - The IOB tagger used by the chunk parser. - """ - - def __init__(self, train): - ClassifierBasedTagger.__init__( - self, train=train, classifier_builder=self._classifier_builder - ) - - def _classifier_builder(self, train): - return MaxentClassifier.train( - train, algorithm="megam", gaussian_prior_sigma=1, trace=2 - ) - - def _english_wordlist(self): - try: - wl = self._en_wordlist - except AttributeError: - from nltk.corpus import words - - self._en_wordlist = set(words.words("en-basic")) - wl = self._en_wordlist - return wl - - def _feature_detector(self, tokens, index, history): - word = tokens[index][0] - pos = simplify_pos(tokens[index][1]) - if index == 0: - prevword = prevprevword = None - prevpos = prevprevpos = None - prevshape = prevtag = prevprevtag = None - elif index == 1: - prevword = tokens[index - 1][0].lower() - prevprevword = None - prevpos = simplify_pos(tokens[index - 1][1]) - prevprevpos = None - prevtag = history[index - 1][0] - prevshape = prevprevtag = None - else: - prevword = tokens[index - 1][0].lower() - prevprevword = tokens[index - 2][0].lower() - prevpos = simplify_pos(tokens[index - 1][1]) - prevprevpos = simplify_pos(tokens[index - 2][1]) - prevtag = history[index - 1] - prevprevtag = history[index - 2] - prevshape = shape(prevword) - if index == len(tokens) - 1: - nextword = nextnextword = None - nextpos = nextnextpos = None - elif index == len(tokens) - 2: - nextword = tokens[index + 1][0].lower() - nextpos = tokens[index + 1][1].lower() - nextnextword = None - nextnextpos = None - else: - nextword = tokens[index + 1][0].lower() - nextpos = tokens[index + 1][1].lower() - nextnextword = tokens[index + 2][0].lower() - nextnextpos = tokens[index + 2][1].lower() - - # 89.6 - features = { - "bias": True, - "shape": shape(word), - "wordlen": len(word), - "prefix3": word[:3].lower(), - "suffix3": word[-3:].lower(), - "pos": pos, - "word": word, - "en-wordlist": (word in self._english_wordlist()), - "prevtag": prevtag, - "prevpos": prevpos, - "nextpos": nextpos, - "prevword": prevword, - "nextword": nextword, - "word+nextpos": f"{word.lower()}+{nextpos}", - "pos+prevtag": f"{pos}+{prevtag}", - "shape+prevtag": f"{prevshape}+{prevtag}", - } - - return features - - -class NEChunkParser(ChunkParserI): - """ - Expected input: list of pos-tagged words - """ - - def __init__(self, train): - 
self._train(train) - - def parse(self, tokens): - """ - Each token should be a pos-tagged word - """ - tagged = self._tagger.tag(tokens) - tree = self._tagged_to_parse(tagged) - return tree - - def _train(self, corpus): - # Convert to tagged sequence - corpus = [self._parse_to_tagged(s) for s in corpus] - - self._tagger = NEChunkParserTagger(train=corpus) - - def _tagged_to_parse(self, tagged_tokens): - """ - Convert a list of tagged tokens to a chunk-parse tree. - """ - sent = Tree("S", []) - - for (tok, tag) in tagged_tokens: - if tag == "O": - sent.append(tok) - elif tag.startswith("B-"): - sent.append(Tree(tag[2:], [tok])) - elif tag.startswith("I-"): - if sent and isinstance(sent[-1], Tree) and sent[-1].label() == tag[2:]: - sent[-1].append(tok) - else: - sent.append(Tree(tag[2:], [tok])) - return sent - - @staticmethod - def _parse_to_tagged(sent): - """ - Convert a chunk-parse tree to a list of tagged tokens. - """ - toks = [] - for child in sent: - if isinstance(child, Tree): - if len(child) == 0: - print("Warning -- empty chunk in sentence") - continue - toks.append((child[0], f"B-{child.label()}")) - for tok in child[1:]: - toks.append((tok, f"I-{child.label()}")) - else: - toks.append((child, "O")) - return toks - - -def shape(word): - if re.match(r"[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$", word, re.UNICODE): - return "number" - elif re.match(r"\W+$", word, re.UNICODE): - return "punct" - elif re.match(r"\w+$", word, re.UNICODE): - if word.istitle(): - return "upcase" - elif word.islower(): - return "downcase" - else: - return "mixedcase" - else: - return "other" - - -def simplify_pos(s): - if s.startswith("V"): - return "V" - else: - return s.split("-")[0] - - -def postag_tree(tree): - # Part-of-speech tagging. - words = tree.leaves() - tag_iter = (pos for (word, pos) in pos_tag(words)) - newtree = Tree("S", []) - for child in tree: - if isinstance(child, Tree): - newtree.append(Tree(child.label(), [])) - for subchild in child: - newtree[-1].append((subchild, next(tag_iter))) - else: - newtree.append((child, next(tag_iter))) - return newtree - - -def load_ace_data(roots, fmt="binary", skip_bnews=True): - for root in roots: - for root, dirs, files in os.walk(root): - if root.endswith("bnews") and skip_bnews: - continue - for f in files: - if f.endswith(".sgm"): - yield from load_ace_file(os.path.join(root, f), fmt) - - -def load_ace_file(textfile, fmt): - print(f" - {os.path.split(textfile)[1]}") - annfile = textfile + ".tmx.rdc.xml" - - # Read the xml file, and get a list of entities - entities = [] - with open(annfile) as infile: - xml = ET.parse(infile).getroot() - for entity in xml.findall("document/entity"): - typ = entity.find("entity_type").text - for mention in entity.findall("entity_mention"): - if mention.get("TYPE") != "NAME": - continue # only NEs - s = int(mention.find("head/charseq/start").text) - e = int(mention.find("head/charseq/end").text) + 1 - entities.append((s, e, typ)) - - # Read the text file, and mark the entities. 
- with open(textfile) as infile: - text = infile.read() - - # Strip XML tags, since they don't count towards the indices - text = re.sub("<(?!/?TEXT)[^>]+>", "", text) - - # Blank out anything before/after - def subfunc(m): - return " " * (m.end() - m.start() - 6) - - text = re.sub(r"[\s\S]*", subfunc, text) - text = re.sub(r"[\s\S]*", "", text) - - # Simplify quotes - text = re.sub("``", ' "', text) - text = re.sub("''", '" ', text) - - entity_types = {typ for (s, e, typ) in entities} - - # Binary distinction (NE or not NE) - if fmt == "binary": - i = 0 - toks = Tree("S", []) - for (s, e, typ) in sorted(entities): - if s < i: - s = i # Overlapping! Deal with this better? - if e <= s: - continue - toks.extend(word_tokenize(text[i:s])) - toks.append(Tree("NE", text[s:e].split())) - i = e - toks.extend(word_tokenize(text[i:])) - yield toks - - # Multiclass distinction (NE type) - elif fmt == "multiclass": - i = 0 - toks = Tree("S", []) - for (s, e, typ) in sorted(entities): - if s < i: - s = i # Overlapping! Deal with this better? - if e <= s: - continue - toks.extend(word_tokenize(text[i:s])) - toks.append(Tree(typ, text[s:e].split())) - i = e - toks.extend(word_tokenize(text[i:])) - yield toks - - else: - raise ValueError("bad fmt value") - - -# This probably belongs in a more general-purpose location (as does -# the parse_to_tagged function). -def cmp_chunks(correct, guessed): - correct = NEChunkParser._parse_to_tagged(correct) - guessed = NEChunkParser._parse_to_tagged(guessed) - ellipsis = False - for (w, ct), (w, gt) in zip(correct, guessed): - if ct == gt == "O": - if not ellipsis: - print(f" {ct:15} {gt:15} {w}") - print(" {:15} {:15} {2}".format("...", "...", "...")) - ellipsis = True - else: - ellipsis = False - print(f" {ct:15} {gt:15} {w}") - - -def build_model(fmt="binary"): - print("Loading training data...") - train_paths = [ - find("corpora/ace_data/ace.dev"), - find("corpora/ace_data/ace.heldout"), - find("corpora/ace_data/bbn.dev"), - find("corpora/ace_data/muc.dev"), - ] - train_trees = load_ace_data(train_paths, fmt) - train_data = [postag_tree(t) for t in train_trees] - print("Training...") - cp = NEChunkParser(train_data) - del train_data - - print("Loading eval data...") - eval_paths = [find("corpora/ace_data/ace.eval")] - eval_trees = load_ace_data(eval_paths, fmt) - eval_data = [postag_tree(t) for t in eval_trees] - - print("Evaluating...") - chunkscore = ChunkScore() - for i, correct in enumerate(eval_data): - guess = cp.parse(correct.leaves()) - chunkscore.score(correct, guess) - if i < 3: - cmp_chunks(correct, guess) - print(chunkscore) - - outfilename = f"/tmp/ne_chunker_{fmt}.pickle" - print(f"Saving chunker to {outfilename}...") - - with open(outfilename, "wb") as outfile: - pickle.dump(cp, outfile, -1) - - return cp - - -if __name__ == "__main__": - # Make sure that the pickled object has the right class name: - from nltk.chunk.named_entity import build_model - - build_model("binary") - build_model("multiclass") diff --git a/pipeline/nltk/chunk/regexp.py b/pipeline/nltk/chunk/regexp.py deleted file mode 100644 index 4369119706106db2892e47675ffd039e85db888d..0000000000000000000000000000000000000000 --- a/pipeline/nltk/chunk/regexp.py +++ /dev/null @@ -1,1475 +0,0 @@ -# Natural Language Toolkit: Regular Expression Chunkers -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird (minor additions) -# URL: -# For license information, see LICENSE.TXT - -import re - -import regex - -from nltk.chunk.api import ChunkParserI -from nltk.tree 
import Tree - -# ////////////////////////////////////////////////////// -# ChunkString -# ////////////////////////////////////////////////////// - - -class ChunkString: - """ - A string-based encoding of a particular chunking of a text. - Internally, the ``ChunkString`` class uses a single string to - encode the chunking of the input text. This string contains a - sequence of angle-bracket delimited tags, with chunking indicated - by braces. An example of this encoding is:: - - {
    }{
    }<.>{
    }<.> - - ``ChunkString`` are created from tagged texts (i.e., lists of - ``tokens`` whose type is ``TaggedType``). Initially, nothing is - chunked. - - The chunking of a ``ChunkString`` can be modified with the ``xform()`` - method, which uses a regular expression to transform the string - representation. These transformations should only add and remove - braces; they should *not* modify the sequence of angle-bracket - delimited tags. - - :type _str: str - :ivar _str: The internal string representation of the text's - encoding. This string representation contains a sequence of - angle-bracket delimited tags, with chunking indicated by - braces. An example of this encoding is:: - - {
    }{
    }<.>{
    }<.> - - :type _pieces: list(tagged tokens and chunks) - :ivar _pieces: The tagged tokens and chunks encoded by this ``ChunkString``. - :ivar _debug: The debug level. See the constructor docs. - - :cvar IN_CHUNK_PATTERN: A zero-width regexp pattern string that - will only match positions that are in chunks. - :cvar IN_STRIP_PATTERN: A zero-width regexp pattern string that - will only match positions that are in strips. - """ - - CHUNK_TAG_CHAR = r"[^\{\}<>]" - CHUNK_TAG = r"(<%s+?>)" % CHUNK_TAG_CHAR - - IN_CHUNK_PATTERN = r"(?=[^\{]*\})" - IN_STRIP_PATTERN = r"(?=[^\}]*(\{|$))" - - # These are used by _verify - _CHUNK = r"(\{%s+?\})+?" % CHUNK_TAG - _STRIP = r"(%s+?)+?" % CHUNK_TAG - _VALID = re.compile(r"^(\{?%s\}?)*?$" % CHUNK_TAG) - _BRACKETS = re.compile(r"[^\{\}]+") - _BALANCED_BRACKETS = re.compile(r"(\{\})*$") - - def __init__(self, chunk_struct, debug_level=1): - """ - Construct a new ``ChunkString`` that encodes the chunking of - the text ``tagged_tokens``. - - :type chunk_struct: Tree - :param chunk_struct: The chunk structure to be further chunked. - :type debug_level: int - :param debug_level: The level of debugging which should be - applied to transformations on the ``ChunkString``. The - valid levels are: - - - 0: no checks - - 1: full check on to_chunkstruct - - 2: full check on to_chunkstruct and cursory check after - each transformation. - - 3: full check on to_chunkstruct and full check after - each transformation. - - We recommend you use at least level 1. You should - probably use level 3 if you use any non-standard - subclasses of ``RegexpChunkRule``. - """ - self._root_label = chunk_struct.label() - self._pieces = chunk_struct[:] - tags = [self._tag(tok) for tok in self._pieces] - self._str = "<" + "><".join(tags) + ">" - self._debug = debug_level - - def _tag(self, tok): - if isinstance(tok, tuple): - return tok[1] - elif isinstance(tok, Tree): - return tok.label() - else: - raise ValueError("chunk structures must contain tagged " "tokens or trees") - - def _verify(self, s, verify_tags): - """ - Check to make sure that ``s`` still corresponds to some chunked - version of ``_pieces``. - - :type verify_tags: bool - :param verify_tags: Whether the individual tags should be - checked. If this is false, ``_verify`` will check to make - sure that ``_str`` encodes a chunked version of *some* - list of tokens. If this is true, then ``_verify`` will - check to make sure that the tags in ``_str`` match those in - ``_pieces``. - - :raise ValueError: if the internal string representation of - this ``ChunkString`` is invalid or not consistent with _pieces. - """ - # Check overall form - if not ChunkString._VALID.match(s): - raise ValueError( - "Transformation generated invalid " "chunkstring:\n %s" % s - ) - - # Check that parens are balanced. If the string is long, we - # have to do this in pieces, to avoid a maximum recursion - # depth limit for regular expressions. 
- brackets = ChunkString._BRACKETS.sub("", s) - for i in range(1 + len(brackets) // 5000): - substr = brackets[i * 5000 : i * 5000 + 5000] - if not ChunkString._BALANCED_BRACKETS.match(substr): - raise ValueError( - "Transformation generated invalid " "chunkstring:\n %s" % s - ) - - if verify_tags <= 0: - return - - tags1 = (re.split(r"[\{\}<>]+", s))[1:-1] - tags2 = [self._tag(piece) for piece in self._pieces] - if tags1 != tags2: - raise ValueError( - "Transformation generated invalid " "chunkstring: tag changed" - ) - - def to_chunkstruct(self, chunk_label="CHUNK"): - """ - Return the chunk structure encoded by this ``ChunkString``. - - :rtype: Tree - :raise ValueError: If a transformation has generated an - invalid chunkstring. - """ - if self._debug > 0: - self._verify(self._str, 1) - - # Use this alternating list to create the chunkstruct. - pieces = [] - index = 0 - piece_in_chunk = 0 - for piece in re.split("[{}]", self._str): - - # Find the list of tokens contained in this piece. - length = piece.count("<") - subsequence = self._pieces[index : index + length] - - # Add this list of tokens to our pieces. - if piece_in_chunk: - pieces.append(Tree(chunk_label, subsequence)) - else: - pieces += subsequence - - # Update index, piece_in_chunk - index += length - piece_in_chunk = not piece_in_chunk - - return Tree(self._root_label, pieces) - - def xform(self, regexp, repl): - """ - Apply the given transformation to the string encoding of this - ``ChunkString``. In particular, find all occurrences that match - ``regexp``, and replace them using ``repl`` (as done by - ``re.sub``). - - This transformation should only add and remove braces; it - should *not* modify the sequence of angle-bracket delimited - tags. Furthermore, this transformation may not result in - improper bracketing. Note, in particular, that bracketing may - not be nested. - - :type regexp: str or regexp - :param regexp: A regular expression matching the substring - that should be replaced. This will typically include a - named group, which can be used by ``repl``. - :type repl: str - :param repl: An expression specifying what should replace the - matched substring. Typically, this will include a named - replacement group, specified by ``regexp``. - :rtype: None - :raise ValueError: If this transformation generated an - invalid chunkstring. - """ - # Do the actual substitution - s = re.sub(regexp, repl, self._str) - - # The substitution might have generated "empty chunks" - # (substrings of the form "{}"). Remove them, so they don't - # interfere with other transformations. - s = re.sub(r"\{\}", "", s) - - # Make sure that the transformation was legal. - if self._debug > 1: - self._verify(s, self._debug - 2) - - # Commit the transformation. - self._str = s - - def __repr__(self): - """ - Return a string representation of this ``ChunkString``. - It has the form:: - - }{
    }'> - - :rtype: str - """ - return "" % repr(self._str) - - def __str__(self): - """ - Return a formatted representation of this ``ChunkString``. - This representation will include extra spaces to ensure that - tags will line up with the representation of other - ``ChunkStrings`` for the same text, regardless of the chunking. - - :rtype: str - """ - # Add spaces to make everything line up. - str = re.sub(r">(?!\})", r"> ", self._str) - str = re.sub(r"([^\{])<", r"\1 <", str) - if str[0] == "<": - str = " " + str - return str - - -# ////////////////////////////////////////////////////// -# Chunking Rules -# ////////////////////////////////////////////////////// - - -class RegexpChunkRule: - """ - A rule specifying how to modify the chunking in a ``ChunkString``, - using a transformational regular expression. The - ``RegexpChunkRule`` class itself can be used to implement any - transformational rule based on regular expressions. There are - also a number of subclasses, which can be used to implement - simpler types of rules, based on matching regular expressions. - - Each ``RegexpChunkRule`` has a regular expression and a - replacement expression. When a ``RegexpChunkRule`` is "applied" - to a ``ChunkString``, it searches the ``ChunkString`` for any - substring that matches the regular expression, and replaces it - using the replacement expression. This search/replace operation - has the same semantics as ``re.sub``. - - Each ``RegexpChunkRule`` also has a description string, which - gives a short (typically less than 75 characters) description of - the purpose of the rule. - - This transformation defined by this ``RegexpChunkRule`` should - only add and remove braces; it should *not* modify the sequence - of angle-bracket delimited tags. Furthermore, this transformation - may not result in nested or mismatched bracketing. - """ - - def __init__(self, regexp, repl, descr): - """ - Construct a new RegexpChunkRule. - - :type regexp: regexp or str - :param regexp: The regular expression for this ``RegexpChunkRule``. - When this rule is applied to a ``ChunkString``, any - substring that matches ``regexp`` will be replaced using - the replacement string ``repl``. Note that this must be a - normal regular expression, not a tag pattern. - :type repl: str - :param repl: The replacement expression for this ``RegexpChunkRule``. - When this rule is applied to a ``ChunkString``, any substring - that matches ``regexp`` will be replaced using ``repl``. - :type descr: str - :param descr: A short description of the purpose and/or effect - of this rule. - """ - if isinstance(regexp, str): - regexp = re.compile(regexp) - self._repl = repl - self._descr = descr - self._regexp = regexp - - def apply(self, chunkstr): - # Keep docstring generic so we can inherit it. - """ - Apply this rule to the given ``ChunkString``. See the - class reference documentation for a description of what it - means to apply a rule. - - :type chunkstr: ChunkString - :param chunkstr: The chunkstring to which this rule is applied. - :rtype: None - :raise ValueError: If this transformation generated an - invalid chunkstring. - """ - chunkstr.xform(self._regexp, self._repl) - - def descr(self): - """ - Return a short description of the purpose and/or effect of - this rule. - - :rtype: str - """ - return self._descr - - def __repr__(self): - """ - Return a string representation of this rule. 
It has the form:: - - }'->''> - - Note that this representation does not include the - description string; that string can be accessed - separately with the ``descr()`` method. - - :rtype: str - """ - return ( - "" - + repr(self._repl) - + ">" - ) - - @staticmethod - def fromstring(s): - """ - Create a RegexpChunkRule from a string description. - Currently, the following formats are supported:: - - {regexp} # chunk rule - }regexp{ # strip rule - regexp}{regexp # split rule - regexp{}regexp # merge rule - - Where ``regexp`` is a regular expression for the rule. Any - text following the comment marker (``#``) will be used as - the rule's description: - - >>> from nltk.chunk.regexp import RegexpChunkRule - >>> RegexpChunkRule.fromstring('{
    ?+}') - ?+'> - """ - # Split off the comment (but don't split on '\#') - m = re.match(r"(?P(\\.|[^#])*)(?P#.*)?", s) - rule = m.group("rule").strip() - comment = (m.group("comment") or "")[1:].strip() - - # Pattern bodies: chunk, strip, split, merge - try: - if not rule: - raise ValueError("Empty chunk pattern") - if rule[0] == "{" and rule[-1] == "}": - return ChunkRule(rule[1:-1], comment) - elif rule[0] == "}" and rule[-1] == "{": - return StripRule(rule[1:-1], comment) - elif "}{" in rule: - left, right = rule.split("}{") - return SplitRule(left, right, comment) - elif "{}" in rule: - left, right = rule.split("{}") - return MergeRule(left, right, comment) - elif re.match("[^{}]*{[^{}]*}[^{}]*", rule): - left, chunk, right = re.split("[{}]", rule) - return ChunkRuleWithContext(left, chunk, right, comment) - else: - raise ValueError("Illegal chunk pattern: %s" % rule) - except (ValueError, re.error) as e: - raise ValueError("Illegal chunk pattern: %s" % rule) from e - - -class ChunkRule(RegexpChunkRule): - """ - A rule specifying how to add chunks to a ``ChunkString``, using a - matching tag pattern. When applied to a ``ChunkString``, it will - find any substring that matches this tag pattern and that is not - already part of a chunk, and create a new chunk containing that - substring. - """ - - def __init__(self, tag_pattern, descr): - """ - Construct a new ``ChunkRule``. - - :type tag_pattern: str - :param tag_pattern: This rule's tag pattern. When - applied to a ``ChunkString``, this rule will - chunk any substring that matches this tag pattern and that - is not already part of a chunk. - :type descr: str - :param descr: A short description of the purpose and/or effect - of this rule. - """ - self._pattern = tag_pattern - regexp = re.compile( - "(?P%s)%s" - % (tag_pattern2re_pattern(tag_pattern), ChunkString.IN_STRIP_PATTERN) - ) - RegexpChunkRule.__init__(self, regexp, r"{\g}", descr) - - def __repr__(self): - """ - Return a string representation of this rule. It has the form:: - - '> - - Note that this representation does not include the - description string; that string can be accessed - separately with the ``descr()`` method. - - :rtype: str - """ - return "" - - -class StripRule(RegexpChunkRule): - """ - A rule specifying how to remove strips to a ``ChunkString``, - using a matching tag pattern. When applied to a - ``ChunkString``, it will find any substring that matches this - tag pattern and that is contained in a chunk, and remove it - from that chunk, thus creating two new chunks. - """ - - def __init__(self, tag_pattern, descr): - """ - Construct a new ``StripRule``. - - :type tag_pattern: str - :param tag_pattern: This rule's tag pattern. When - applied to a ``ChunkString``, this rule will - find any substring that matches this tag pattern and that - is contained in a chunk, and remove it from that chunk, - thus creating two new chunks. - :type descr: str - :param descr: A short description of the purpose and/or effect - of this rule. - """ - self._pattern = tag_pattern - regexp = re.compile( - "(?P%s)%s" - % (tag_pattern2re_pattern(tag_pattern), ChunkString.IN_CHUNK_PATTERN) - ) - RegexpChunkRule.__init__(self, regexp, r"}\g{", descr) - - def __repr__(self): - """ - Return a string representation of this rule. It has the form:: - - '> - - Note that this representation does not include the - description string; that string can be accessed - separately with the ``descr()`` method. 
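The rule classes documented above can be built directly or parsed from the string formats accepted by ``RegexpChunkRule.fromstring()``; a small sketch (variable names are illustrative only):

from nltk.chunk.regexp import ChunkRule, StripRule, RegexpChunkRule

chunk_nps = ChunkRule("<DT>?<JJ>*<NN.*>+", "chunk optional determiner, adjectives and nouns")
strip_verbs = StripRule("<VB.*>", "strip verbs back out of chunks")
same_rule = RegexpChunkRule.fromstring("{<DT>?<JJ>*<NN.*>+} # chunk det+adj+noun")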
- - :rtype: str - """ - return "" - - -class UnChunkRule(RegexpChunkRule): - """ - A rule specifying how to remove chunks to a ``ChunkString``, - using a matching tag pattern. When applied to a - ``ChunkString``, it will find any complete chunk that matches this - tag pattern, and un-chunk it. - """ - - def __init__(self, tag_pattern, descr): - """ - Construct a new ``UnChunkRule``. - - :type tag_pattern: str - :param tag_pattern: This rule's tag pattern. When - applied to a ``ChunkString``, this rule will - find any complete chunk that matches this tag pattern, - and un-chunk it. - :type descr: str - :param descr: A short description of the purpose and/or effect - of this rule. - """ - self._pattern = tag_pattern - regexp = re.compile(r"\{(?P%s)\}" % tag_pattern2re_pattern(tag_pattern)) - RegexpChunkRule.__init__(self, regexp, r"\g", descr) - - def __repr__(self): - """ - Return a string representation of this rule. It has the form:: - - '> - - Note that this representation does not include the - description string; that string can be accessed - separately with the ``descr()`` method. - - :rtype: str - """ - return "" - - -class MergeRule(RegexpChunkRule): - """ - A rule specifying how to merge chunks in a ``ChunkString``, using - two matching tag patterns: a left pattern, and a right pattern. - When applied to a ``ChunkString``, it will find any chunk whose end - matches left pattern, and immediately followed by a chunk whose - beginning matches right pattern. It will then merge those two - chunks into a single chunk. - """ - - def __init__(self, left_tag_pattern, right_tag_pattern, descr): - """ - Construct a new ``MergeRule``. - - :type right_tag_pattern: str - :param right_tag_pattern: This rule's right tag - pattern. When applied to a ``ChunkString``, this - rule will find any chunk whose end matches - ``left_tag_pattern``, and immediately followed by a chunk - whose beginning matches this pattern. It will - then merge those two chunks into a single chunk. - :type left_tag_pattern: str - :param left_tag_pattern: This rule's left tag - pattern. When applied to a ``ChunkString``, this - rule will find any chunk whose end matches - this pattern, and immediately followed by a chunk - whose beginning matches ``right_tag_pattern``. It will - then merge those two chunks into a single chunk. - - :type descr: str - :param descr: A short description of the purpose and/or effect - of this rule. - """ - # Ensure that the individual patterns are coherent. E.g., if - # left='(' and right=')', then this will raise an exception: - re.compile(tag_pattern2re_pattern(left_tag_pattern)) - re.compile(tag_pattern2re_pattern(right_tag_pattern)) - - self._left_tag_pattern = left_tag_pattern - self._right_tag_pattern = right_tag_pattern - regexp = re.compile( - "(?P%s)}{(?=%s)" - % ( - tag_pattern2re_pattern(left_tag_pattern), - tag_pattern2re_pattern(right_tag_pattern), - ) - ) - RegexpChunkRule.__init__(self, regexp, r"\g", descr) - - def __repr__(self): - """ - Return a string representation of this rule. It has the form:: - - ', ''> - - Note that this representation does not include the - description string; that string can be accessed - separately with the ``descr()`` method. - - :rtype: str - """ - return ( - "" - ) - - -class SplitRule(RegexpChunkRule): - """ - A rule specifying how to split chunks in a ``ChunkString``, using - two matching tag patterns: a left pattern, and a right pattern. 
- When applied to a ``ChunkString``, it will find any chunk that - matches the left pattern followed by the right pattern. It will - then split the chunk into two new chunks, at the point between the - two pattern matches. - """ - - def __init__(self, left_tag_pattern, right_tag_pattern, descr): - """ - Construct a new ``SplitRule``. - - :type right_tag_pattern: str - :param right_tag_pattern: This rule's right tag - pattern. When applied to a ``ChunkString``, this rule will - find any chunk containing a substring that matches - ``left_tag_pattern`` followed by this pattern. It will - then split the chunk into two new chunks at the point - between these two matching patterns. - :type left_tag_pattern: str - :param left_tag_pattern: This rule's left tag - pattern. When applied to a ``ChunkString``, this rule will - find any chunk containing a substring that matches this - pattern followed by ``right_tag_pattern``. It will then - split the chunk into two new chunks at the point between - these two matching patterns. - :type descr: str - :param descr: A short description of the purpose and/or effect - of this rule. - """ - # Ensure that the individual patterns are coherent. E.g., if - # left='(' and right=')', then this will raise an exception: - re.compile(tag_pattern2re_pattern(left_tag_pattern)) - re.compile(tag_pattern2re_pattern(right_tag_pattern)) - - self._left_tag_pattern = left_tag_pattern - self._right_tag_pattern = right_tag_pattern - regexp = re.compile( - "(?P%s)(?=%s)" - % ( - tag_pattern2re_pattern(left_tag_pattern), - tag_pattern2re_pattern(right_tag_pattern), - ) - ) - RegexpChunkRule.__init__(self, regexp, r"\g}{", descr) - - def __repr__(self): - """ - Return a string representation of this rule. It has the form:: - - ', '
    '> - - Note that this representation does not include the - description string; that string can be accessed - separately with the ``descr()`` method. - - :rtype: str - """ - return ( - "" - ) - - -class ExpandLeftRule(RegexpChunkRule): - """ - A rule specifying how to expand chunks in a ``ChunkString`` to the left, - using two matching tag patterns: a left pattern, and a right pattern. - When applied to a ``ChunkString``, it will find any chunk whose beginning - matches right pattern, and immediately preceded by a strip whose - end matches left pattern. It will then expand the chunk to incorporate - the new material on the left. - """ - - def __init__(self, left_tag_pattern, right_tag_pattern, descr): - """ - Construct a new ``ExpandRightRule``. - - :type right_tag_pattern: str - :param right_tag_pattern: This rule's right tag - pattern. When applied to a ``ChunkString``, this - rule will find any chunk whose beginning matches - ``right_tag_pattern``, and immediately preceded by a strip - whose end matches this pattern. It will - then merge those two chunks into a single chunk. - :type left_tag_pattern: str - :param left_tag_pattern: This rule's left tag - pattern. When applied to a ``ChunkString``, this - rule will find any chunk whose beginning matches - this pattern, and immediately preceded by a strip - whose end matches ``left_tag_pattern``. It will - then expand the chunk to incorporate the new material on the left. - - :type descr: str - :param descr: A short description of the purpose and/or effect - of this rule. - """ - # Ensure that the individual patterns are coherent. E.g., if - # left='(' and right=')', then this will raise an exception: - re.compile(tag_pattern2re_pattern(left_tag_pattern)) - re.compile(tag_pattern2re_pattern(right_tag_pattern)) - - self._left_tag_pattern = left_tag_pattern - self._right_tag_pattern = right_tag_pattern - regexp = re.compile( - r"(?P%s)\{(?P%s)" - % ( - tag_pattern2re_pattern(left_tag_pattern), - tag_pattern2re_pattern(right_tag_pattern), - ) - ) - RegexpChunkRule.__init__(self, regexp, r"{\g\g", descr) - - def __repr__(self): - """ - Return a string representation of this rule. It has the form:: - - ', ''> - - Note that this representation does not include the - description string; that string can be accessed - separately with the ``descr()`` method. - - :rtype: str - """ - return ( - "" - ) - - -class ExpandRightRule(RegexpChunkRule): - """ - A rule specifying how to expand chunks in a ``ChunkString`` to the - right, using two matching tag patterns: a left pattern, and a - right pattern. When applied to a ``ChunkString``, it will find any - chunk whose end matches left pattern, and immediately followed by - a strip whose beginning matches right pattern. It will then - expand the chunk to incorporate the new material on the right. - """ - - def __init__(self, left_tag_pattern, right_tag_pattern, descr): - """ - Construct a new ``ExpandRightRule``. - - :type right_tag_pattern: str - :param right_tag_pattern: This rule's right tag - pattern. When applied to a ``ChunkString``, this - rule will find any chunk whose end matches - ``left_tag_pattern``, and immediately followed by a strip - whose beginning matches this pattern. It will - then merge those two chunks into a single chunk. - :type left_tag_pattern: str - :param left_tag_pattern: This rule's left tag - pattern. 
When applied to a ``ChunkString``, this - rule will find any chunk whose end matches - this pattern, and immediately followed by a strip - whose beginning matches ``right_tag_pattern``. It will - then expand the chunk to incorporate the new material on the right. - - :type descr: str - :param descr: A short description of the purpose and/or effect - of this rule. - """ - # Ensure that the individual patterns are coherent. E.g., if - # left='(' and right=')', then this will raise an exception: - re.compile(tag_pattern2re_pattern(left_tag_pattern)) - re.compile(tag_pattern2re_pattern(right_tag_pattern)) - - self._left_tag_pattern = left_tag_pattern - self._right_tag_pattern = right_tag_pattern - regexp = re.compile( - r"(?P%s)\}(?P%s)" - % ( - tag_pattern2re_pattern(left_tag_pattern), - tag_pattern2re_pattern(right_tag_pattern), - ) - ) - RegexpChunkRule.__init__(self, regexp, r"\g\g}", descr) - - def __repr__(self): - """ - Return a string representation of this rule. It has the form:: - - ', ''> - - Note that this representation does not include the - description string; that string can be accessed - separately with the ``descr()`` method. - - :rtype: str - """ - return ( - "" - ) - - -class ChunkRuleWithContext(RegexpChunkRule): - """ - A rule specifying how to add chunks to a ``ChunkString``, using - three matching tag patterns: one for the left context, one for the - chunk, and one for the right context. When applied to a - ``ChunkString``, it will find any substring that matches the chunk - tag pattern, is surrounded by substrings that match the two - context patterns, and is not already part of a chunk; and create a - new chunk containing the substring that matched the chunk tag - pattern. - - Caveat: Both the left and right context are consumed when this - rule matches; therefore, if you need to find overlapping matches, - you will need to apply your rule more than once. - """ - - def __init__( - self, - left_context_tag_pattern, - chunk_tag_pattern, - right_context_tag_pattern, - descr, - ): - """ - Construct a new ``ChunkRuleWithContext``. - - :type left_context_tag_pattern: str - :param left_context_tag_pattern: A tag pattern that must match - the left context of ``chunk_tag_pattern`` for this rule to - apply. - :type chunk_tag_pattern: str - :param chunk_tag_pattern: A tag pattern that must match for this - rule to apply. If the rule does apply, then this pattern - also identifies the substring that will be made into a chunk. - :type right_context_tag_pattern: str - :param right_context_tag_pattern: A tag pattern that must match - the right context of ``chunk_tag_pattern`` for this rule to - apply. - :type descr: str - :param descr: A short description of the purpose and/or effect - of this rule. - """ - # Ensure that the individual patterns are coherent. 
E.g., if - # left='(' and right=')', then this will raise an exception: - re.compile(tag_pattern2re_pattern(left_context_tag_pattern)) - re.compile(tag_pattern2re_pattern(chunk_tag_pattern)) - re.compile(tag_pattern2re_pattern(right_context_tag_pattern)) - - self._left_context_tag_pattern = left_context_tag_pattern - self._chunk_tag_pattern = chunk_tag_pattern - self._right_context_tag_pattern = right_context_tag_pattern - regexp = re.compile( - "(?P%s)(?P%s)(?P%s)%s" - % ( - tag_pattern2re_pattern(left_context_tag_pattern), - tag_pattern2re_pattern(chunk_tag_pattern), - tag_pattern2re_pattern(right_context_tag_pattern), - ChunkString.IN_STRIP_PATTERN, - ) - ) - replacement = r"\g{\g}\g" - RegexpChunkRule.__init__(self, regexp, replacement, descr) - - def __repr__(self): - """ - Return a string representation of this rule. It has the form:: - - ', '', '
    '> - - Note that this representation does not include the - description string; that string can be accessed - separately with the ``descr()`` method. - - :rtype: str - """ - return "".format( - self._left_context_tag_pattern, - self._chunk_tag_pattern, - self._right_context_tag_pattern, - ) - - -# ////////////////////////////////////////////////////// -# Tag Pattern Format Conversion -# ////////////////////////////////////////////////////// - -# this should probably be made more strict than it is -- e.g., it -# currently accepts 'foo'. -CHUNK_TAG_PATTERN = re.compile( - r"^(({}|<{}>)*)$".format(r"([^\{\}<>]|\{\d+,?\}|\{\d*,\d+\})+", r"[^\{\}<>]+") -) - - -def tag_pattern2re_pattern(tag_pattern): - """ - Convert a tag pattern to a regular expression pattern. A "tag - pattern" is a modified version of a regular expression, designed - for matching sequences of tags. The differences between regular - expression patterns and tag patterns are: - - - In tag patterns, ``'<'`` and ``'>'`` act as parentheses; so - ``'+'`` matches one or more repetitions of ``''``, not - ``''``. - - Whitespace in tag patterns is ignored. So - ``'
    | '`` is equivalent to ``'
    |'`` - - In tag patterns, ``'.'`` is equivalent to ``'[^{}<>]'``; so - ``''`` matches any single tag starting with ``'NN'``. - - In particular, ``tag_pattern2re_pattern`` performs the following - transformations on the given pattern: - - - Replace '.' with '[^<>{}]' - - Remove any whitespace - - Add extra parens around '<' and '>', to make '<' and '>' act - like parentheses. E.g., so that in '+', the '+' has scope - over the entire ''; and so that in '', the '|' has - scope over 'NN' and 'IN', but not '<' or '>'. - - Check to make sure the resulting pattern is valid. - - :type tag_pattern: str - :param tag_pattern: The tag pattern to convert to a regular - expression pattern. - :raise ValueError: If ``tag_pattern`` is not a valid tag pattern. - In particular, ``tag_pattern`` should not include braces; and it - should not contain nested or mismatched angle-brackets. - :rtype: str - :return: A regular expression pattern corresponding to - ``tag_pattern``. - """ - # Clean up the regular expression - tag_pattern = re.sub(r"\s", "", tag_pattern) - tag_pattern = re.sub(r"<", "(<(", tag_pattern) - tag_pattern = re.sub(r">", ")>)", tag_pattern) - - # Check the regular expression - if not CHUNK_TAG_PATTERN.match(tag_pattern): - raise ValueError("Bad tag pattern: %r" % tag_pattern) - - # Replace "." with CHUNK_TAG_CHAR. - # We have to do this after, since it adds {}[]<>s, which would - # confuse CHUNK_TAG_PATTERN. - # PRE doesn't have lookback assertions, so reverse twice, and do - # the pattern backwards (with lookahead assertions). This can be - # made much cleaner once we can switch back to SRE. - def reverse_str(str): - lst = list(str) - lst.reverse() - return "".join(lst) - - tc_rev = reverse_str(ChunkString.CHUNK_TAG_CHAR) - reversed = reverse_str(tag_pattern) - reversed = re.sub(r"\.(?!\\(\\\\)*($|[^\\]))", tc_rev, reversed) - tag_pattern = reverse_str(reversed) - - return tag_pattern - - -# ////////////////////////////////////////////////////// -# RegexpChunkParser -# ////////////////////////////////////////////////////// - - -class RegexpChunkParser(ChunkParserI): - """ - A regular expression based chunk parser. ``RegexpChunkParser`` uses a - sequence of "rules" to find chunks of a single type within a - text. The chunking of the text is encoded using a ``ChunkString``, - and each rule acts by modifying the chunking in the - ``ChunkString``. The rules are all implemented using regular - expression matching and substitution. - - The ``RegexpChunkRule`` class and its subclasses (``ChunkRule``, - ``StripRule``, ``UnChunkRule``, ``MergeRule``, and ``SplitRule``) - define the rules that are used by ``RegexpChunkParser``. Each rule - defines an ``apply()`` method, which modifies the chunking encoded - by a given ``ChunkString``. - - :type _rules: list(RegexpChunkRule) - :ivar _rules: The list of rules that should be applied to a text. - :type _trace: int - :ivar _trace: The default level of tracing. - - """ - - def __init__(self, rules, chunk_label="NP", root_label="S", trace=0): - """ - Construct a new ``RegexpChunkParser``. - - :type rules: list(RegexpChunkRule) - :param rules: The sequence of rules that should be used to - generate the chunking for a tagged text. - :type chunk_label: str - :param chunk_label: The node value that should be used for - chunk subtrees. This is typically a short string - describing the type of information contained by the chunk, - such as ``"NP"`` for base noun phrases. 
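The conversion just described can be exercised directly; a minimal sketch, assuming the upstream nltk package is importable (the printed regexp text is indicative only, not a guaranteed literal):

from nltk.chunk.regexp import tag_pattern2re_pattern

# '<DT>?<JJ>*<NN.*>' is a tag pattern: an optional determiner, any number of
# adjectives, then any tag starting with NN.  The helper wraps each <...> in
# its own group so that quantifiers such as '?' and '*' apply to whole tags.
print(tag_pattern2re_pattern('<DT>?<JJ>*<NN.*>'))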
- :type root_label: str - :param root_label: The node value that should be used for the - top node of the chunk structure. - :type trace: int - :param trace: The level of tracing that should be used when - parsing a text. ``0`` will generate no tracing output; - ``1`` will generate normal tracing output; and ``2`` or - higher will generate verbose tracing output. - """ - self._rules = rules - self._trace = trace - self._chunk_label = chunk_label - self._root_label = root_label - - def _trace_apply(self, chunkstr, verbose): - """ - Apply each rule of this ``RegexpChunkParser`` to ``chunkstr``, in - turn. Generate trace output between each rule. If ``verbose`` - is true, then generate verbose output. - - :type chunkstr: ChunkString - :param chunkstr: The chunk string to which each rule should be - applied. - :type verbose: bool - :param verbose: Whether output should be verbose. - :rtype: None - """ - print("# Input:") - print(chunkstr) - for rule in self._rules: - rule.apply(chunkstr) - if verbose: - print("#", rule.descr() + " (" + repr(rule) + "):") - else: - print("#", rule.descr() + ":") - print(chunkstr) - - def _notrace_apply(self, chunkstr): - """ - Apply each rule of this ``RegexpChunkParser`` to ``chunkstr``, in - turn. - - :param chunkstr: The chunk string to which each rule should be - applied. - :type chunkstr: ChunkString - :rtype: None - """ - - for rule in self._rules: - rule.apply(chunkstr) - - def parse(self, chunk_struct, trace=None): - """ - :type chunk_struct: Tree - :param chunk_struct: the chunk structure to be (further) chunked - :type trace: int - :param trace: The level of tracing that should be used when - parsing a text. ``0`` will generate no tracing output; - ``1`` will generate normal tracing output; and ``2`` or - higher will generate verbose tracing output. This value - overrides the trace level value that was given to the - constructor. - :rtype: Tree - :return: a chunk structure that encodes the chunks in a given - tagged sentence. A chunk is a non-overlapping linguistic - group, such as a noun phrase. The set of chunks - identified in the chunk structure depends on the rules - used to define this ``RegexpChunkParser``. - """ - if len(chunk_struct) == 0: - print("Warning: parsing empty text") - return Tree(self._root_label, []) - - try: - chunk_struct.label() - except AttributeError: - chunk_struct = Tree(self._root_label, chunk_struct) - - # Use the default trace value? - if trace is None: - trace = self._trace - - chunkstr = ChunkString(chunk_struct) - - # Apply the sequence of rules to the chunkstring. - if trace: - verbose = trace > 1 - self._trace_apply(chunkstr, verbose) - else: - self._notrace_apply(chunkstr) - - # Use the chunkstring to create a chunk structure. - return chunkstr.to_chunkstruct(self._chunk_label) - - def rules(self): - """ - :return: the sequence of rules used by ``RegexpChunkParser``. - :rtype: list(RegexpChunkRule) - """ - return self._rules - - def __repr__(self): - """ - :return: a concise string representation of this - ``RegexpChunkParser``. - :rtype: str - """ - return "" % len(self._rules) - - def __str__(self): - """ - :return: a verbose string representation of this ``RegexpChunkParser``. 
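As a quick sanity check of the parser described above, a small sketch that builds a RegexpChunkParser from a single ChunkRule and applies it to a tagged token list (assumes the upstream nltk package; the sentence is invented for illustration):

from nltk.chunk.regexp import ChunkRule, RegexpChunkParser

# One rule: chunk an optional determiner, any adjectives, then a noun tag.
rule = ChunkRule('<DT>?<JJ>*<NN.*>', 'chunk det + adjectives + noun')
parser = RegexpChunkParser([rule], chunk_label='NP')

tokens = [('the', 'DT'), ('little', 'JJ'), ('cat', 'NN'), ('sat', 'VBD')]
# parse() wraps a plain token list in a Tree rooted at root_label ('S' by default).
print(parser.parse(tokens))

The result should be a tree containing one NP chunk over "the little cat".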
- :rtype: str - """ - s = "RegexpChunkParser with %d rules:\n" % len(self._rules) - margin = 0 - for rule in self._rules: - margin = max(margin, len(rule.descr())) - if margin < 35: - format = " %" + repr(-(margin + 3)) + "s%s\n" - else: - format = " %s\n %s\n" - for rule in self._rules: - s += format % (rule.descr(), repr(rule)) - return s[:-1] - - -# ////////////////////////////////////////////////////// -# Chunk Grammar -# ////////////////////////////////////////////////////// - - -class RegexpParser(ChunkParserI): - r""" - A grammar based chunk parser. ``chunk.RegexpParser`` uses a set of - regular expression patterns to specify the behavior of the parser. - The chunking of the text is encoded using a ``ChunkString``, and - each rule acts by modifying the chunking in the ``ChunkString``. - The rules are all implemented using regular expression matching - and substitution. - - A grammar contains one or more clauses in the following form:: - - NP: - {} # chunk determiners and adjectives - }<[\.VI].*>+{ # strip any tag beginning with V, I, or . - <.*>}{
    # split a chunk at a determiner - {} # merge chunk ending with det/adj - # with one starting with a noun - - The patterns of a clause are executed in order. An earlier - pattern may introduce a chunk boundary that prevents a later - pattern from executing. Sometimes an individual pattern will - match on multiple, overlapping extents of the input. As with - regular expression substitution more generally, the chunker will - identify the first match possible, then continue looking for matches - after this one has ended. - - The clauses of a grammar are also executed in order. A cascaded - chunk parser is one having more than one clause. The maximum depth - of a parse tree created by this chunk parser is the same as the - number of clauses in the grammar. - - When tracing is turned on, the comment portion of a line is displayed - each time the corresponding pattern is applied. - - :type _start: str - :ivar _start: The start symbol of the grammar (the root node of - resulting trees) - :type _stages: int - :ivar _stages: The list of parsing stages corresponding to the grammar - - """ - - def __init__(self, grammar, root_label="S", loop=1, trace=0): - """ - Create a new chunk parser, from the given start state - and set of chunk patterns. - - :param grammar: The grammar, or a list of RegexpChunkParser objects - :type grammar: str or list(RegexpChunkParser) - :param root_label: The top node of the tree being created - :type root_label: str or Nonterminal - :param loop: The number of times to run through the patterns - :type loop: int - :type trace: int - :param trace: The level of tracing that should be used when - parsing a text. ``0`` will generate no tracing output; - ``1`` will generate normal tracing output; and ``2`` or - higher will generate verbose tracing output. - """ - self._trace = trace - self._stages = [] - self._grammar = grammar - self._loop = loop - - if isinstance(grammar, str): - self._read_grammar(grammar, root_label, trace) - else: - # Make sur the grammar looks like it has the right type: - type_err = ( - "Expected string or list of RegexpChunkParsers " "for the grammar." - ) - try: - grammar = list(grammar) - except BaseException as e: - raise TypeError(type_err) from e - for elt in grammar: - if not isinstance(elt, RegexpChunkParser): - raise TypeError(type_err) - self._stages = grammar - - def _read_grammar(self, grammar, root_label, trace): - """ - Helper function for __init__: read the grammar if it is a - string. - """ - rules = [] - lhs = None - pattern = regex.compile("(?P(\\.|[^:])*)(:(?P.*))") - for line in grammar.split("\n"): - line = line.strip() - - # New stage begins if there's an unescaped ':' - m = pattern.match(line) - if m: - # Record the stage that we just completed. - self._add_stage(rules, lhs, root_label, trace) - # Start a new stage. - lhs = m.group("nonterminal").strip() - rules = [] - line = m.group("rule").strip() - - # Skip blank & comment-only lines - if line == "" or line.startswith("#"): - continue - - # Add the rule - rules.append(RegexpChunkRule.fromstring(line)) - - # Record the final stage - self._add_stage(rules, lhs, root_label, trace) - - def _add_stage(self, rules, lhs, root_label, trace): - """ - Helper function for __init__: add a new stage to the parser. 
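A minimal usage sketch for the grammar format read by _read_grammar above, assuming the upstream nltk package is installed (the tagged sentence is made up for the example):

import nltk

grammar = r"""
  NP: {<DT>?<JJ>*<NN>}   # chunk an optional determiner, adjectives and a noun
"""
cp = nltk.RegexpParser(grammar)

sentence = [('the', 'DT'), ('little', 'JJ'), ('cat', 'NN'),
            ('sat', 'VBD'), ('on', 'IN'), ('the', 'DT'), ('mat', 'NN')]
print(cp.parse(sentence))
# Expected shape: (S (NP the/DT little/JJ cat/NN) sat/VBD on/IN (NP the/DT mat/NN))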
- """ - if rules != []: - if not lhs: - raise ValueError("Expected stage marker (eg NP:)") - parser = RegexpChunkParser( - rules, chunk_label=lhs, root_label=root_label, trace=trace - ) - self._stages.append(parser) - - def parse(self, chunk_struct, trace=None): - """ - Apply the chunk parser to this input. - - :type chunk_struct: Tree - :param chunk_struct: the chunk structure to be (further) chunked - (this tree is modified, and is also returned) - :type trace: int - :param trace: The level of tracing that should be used when - parsing a text. ``0`` will generate no tracing output; - ``1`` will generate normal tracing output; and ``2`` or - higher will generate verbose tracing output. This value - overrides the trace level value that was given to the - constructor. - :return: the chunked output. - :rtype: Tree - """ - if trace is None: - trace = self._trace - for i in range(self._loop): - for parser in self._stages: - chunk_struct = parser.parse(chunk_struct, trace=trace) - return chunk_struct - - def __repr__(self): - """ - :return: a concise string representation of this ``chunk.RegexpParser``. - :rtype: str - """ - return "" % len(self._stages) - - def __str__(self): - """ - :return: a verbose string representation of this - ``RegexpParser``. - :rtype: str - """ - s = "chunk.RegexpParser with %d stages:\n" % len(self._stages) - margin = 0 - for parser in self._stages: - s += "%s\n" % parser - return s[:-1] - - -# ////////////////////////////////////////////////////// -# Demonstration code -# ////////////////////////////////////////////////////// - - -def demo_eval(chunkparser, text): - """ - Demonstration code for evaluating a chunk parser, using a - ``ChunkScore``. This function assumes that ``text`` contains one - sentence per line, and that each sentence has the form expected by - ``tree.chunk``. It runs the given chunk parser on each sentence in - the text, and scores the result. It prints the final score - (precision, recall, and f-measure); and reports the set of chunks - that were missed and the set of chunks that were incorrect. (At - most 10 missing chunks and 10 incorrect chunks are reported). - - :param chunkparser: The chunkparser to be tested - :type chunkparser: ChunkParserI - :param text: The chunked tagged text that should be used for - evaluation. - :type text: str - """ - from nltk import chunk - from nltk.tree import Tree - - # Evaluate our chunk parser. - chunkscore = chunk.ChunkScore() - - for sentence in text.split("\n"): - print(sentence) - sentence = sentence.strip() - if not sentence: - continue - gold = chunk.tagstr2tree(sentence) - tokens = gold.leaves() - test = chunkparser.parse(Tree("S", tokens), trace=1) - chunkscore.score(gold, test) - print() - - print("/" + ("=" * 75) + "\\") - print("Scoring", chunkparser) - print("-" * 77) - print("Precision: %5.1f%%" % (chunkscore.precision() * 100), " " * 4, end=" ") - print("Recall: %5.1f%%" % (chunkscore.recall() * 100), " " * 6, end=" ") - print("F-Measure: %5.1f%%" % (chunkscore.f_measure() * 100)) - - # Missed chunks. - if chunkscore.missed(): - print("Missed:") - missed = chunkscore.missed() - for chunk in missed[:10]: - print(" ", " ".join(map(str, chunk))) - if len(chunkscore.missed()) > 10: - print(" ...") - - # Incorrect chunks. 
- if chunkscore.incorrect(): - print("Incorrect:") - incorrect = chunkscore.incorrect() - for chunk in incorrect[:10]: - print(" ", " ".join(map(str, chunk))) - if len(chunkscore.incorrect()) > 10: - print(" ...") - - print("\\" + ("=" * 75) + "/") - print() - - -def demo(): - """ - A demonstration for the ``RegexpChunkParser`` class. A single text is - parsed with four different chunk parsers, using a variety of rules - and strategies. - """ - - from nltk import Tree, chunk - - text = """\ - [ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] ./. - [ John/NNP ] saw/VBD [the/DT cats/NNS] [the/DT dog/NN] chased/VBD ./. - [ John/NNP ] thinks/VBZ [ Mary/NN ] saw/VBD [ the/DT cat/NN ] sit/VB on/IN [ the/DT mat/NN ]./. - """ - - print("*" * 75) - print("Evaluation text:") - print(text) - print("*" * 75) - print() - - grammar = r""" - NP: # NP stage - {
<DT>?<JJ>*<NN>} # chunk determiners, adjectives and nouns - {<NNP>+} # chunk proper nouns - """ - cp = chunk.RegexpParser(grammar) - demo_eval(cp, text) - - grammar = r""" - NP: - {<.*>} # start by chunking each tag - }<[\.VI].*>+{ # unchunk any verbs, prepositions or periods - <DT|JJ>{}<NN.*> # merge det/adj with nouns - """ - cp = chunk.RegexpParser(grammar) - demo_eval(cp, text) - - grammar = r""" - NP: {
<DT>?<JJ>*<NN>} # chunk determiners, adjectives and nouns - VP: {<TO>?<VB.*>} # VP = verb words - """ - cp = chunk.RegexpParser(grammar) - demo_eval(cp, text) - - grammar = r""" - NP: {<.*>*} # start by chunking everything - }<[\.VI].*>+{ # strip any verbs, prepositions or periods - <.*>}{<DT>
# separate on determiners - PP: {<IN><NP>} # PP = preposition + noun phrase - VP: {<VB.*><NP|PP>*} # VP = verb words + NPs and PPs - """ - cp = chunk.RegexpParser(grammar) - demo_eval(cp, text) - - # Evaluation - - from nltk.corpus import conll2000 - - print() - print("Demonstration of empty grammar:") - - cp = chunk.RegexpParser("") - print(chunk.accuracy(cp, conll2000.chunked_sents("test.txt", chunk_types=("NP",)))) - - print() - print("Demonstration of accuracy evaluation using CoNLL tags:") - - grammar = r""" - NP: - {<.*>} # start by chunking each tag - }<[\.VI].*>+{ # unchunk any verbs, prepositions or periods - <DT|JJ>{}<NN.*> # merge det/adj with nouns - """ - cp = chunk.RegexpParser(grammar) - print(chunk.accuracy(cp, conll2000.chunked_sents("test.txt")[:5])) - - print() - print("Demonstration of tagged token input") - - grammar = r""" - NP: {<.*>*} # start by chunking everything - }<[\.VI].*>+{ # strip any verbs, prepositions or periods - <.*>}{<DT>
    # separate on determiners - PP: {} # PP = preposition + noun phrase - VP: {*} # VP = verb words + NPs and PPs - """ - cp = chunk.RegexpParser(grammar) - print( - cp.parse( - [ - ("the", "DT"), - ("little", "JJ"), - ("cat", "NN"), - ("sat", "VBD"), - ("on", "IN"), - ("the", "DT"), - ("mat", "NN"), - (".", "."), - ] - ) - ) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/chunk/util.py b/pipeline/nltk/chunk/util.py deleted file mode 100644 index 64ab90f52d1cecd133d3c6511c71e10e44b7bbf1..0000000000000000000000000000000000000000 --- a/pipeline/nltk/chunk/util.py +++ /dev/null @@ -1,643 +0,0 @@ -# Natural Language Toolkit: Chunk format conversions -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird (minor additions) -# URL: -# For license information, see LICENSE.TXT - -import re - -from nltk.metrics import accuracy as _accuracy -from nltk.tag.mapping import map_tag -from nltk.tag.util import str2tuple -from nltk.tree import Tree - -##////////////////////////////////////////////////////// -## EVALUATION -##////////////////////////////////////////////////////// - - -def accuracy(chunker, gold): - """ - Score the accuracy of the chunker against the gold standard. - Strip the chunk information from the gold standard and rechunk it using - the chunker, then compute the accuracy score. - - :type chunker: ChunkParserI - :param chunker: The chunker being evaluated. - :type gold: tree - :param gold: The chunk structures to score the chunker on. - :rtype: float - """ - - gold_tags = [] - test_tags = [] - for gold_tree in gold: - test_tree = chunker.parse(gold_tree.flatten()) - gold_tags += tree2conlltags(gold_tree) - test_tags += tree2conlltags(test_tree) - - # print 'GOLD:', gold_tags[:50] - # print 'TEST:', test_tags[:50] - return _accuracy(gold_tags, test_tags) - - -# Patched for increased performance by Yoav Goldberg , 2006-01-13 -# -- statistics are evaluated only on demand, instead of at every sentence evaluation -# -# SB: use nltk.metrics for precision/recall scoring? -# -class ChunkScore: - """ - A utility class for scoring chunk parsers. ``ChunkScore`` can - evaluate a chunk parser's output, based on a number of statistics - (precision, recall, f-measure, misssed chunks, incorrect chunks). - It can also combine the scores from the parsing of multiple texts; - this makes it significantly easier to evaluate a chunk parser that - operates one sentence at a time. - - Texts are evaluated with the ``score`` method. The results of - evaluation can be accessed via a number of accessor methods, such - as ``precision`` and ``f_measure``. A typical use of the - ``ChunkScore`` class is:: - - >>> chunkscore = ChunkScore() # doctest: +SKIP - >>> for correct in correct_sentences: # doctest: +SKIP - ... guess = chunkparser.parse(correct.leaves()) # doctest: +SKIP - ... chunkscore.score(correct, guess) # doctest: +SKIP - >>> print('F Measure:', chunkscore.f_measure()) # doctest: +SKIP - F Measure: 0.823 - - :ivar kwargs: Keyword arguments: - - - max_tp_examples: The maximum number actual examples of true - positives to record. This affects the ``correct`` member - function: ``correct`` will not return more than this number - of true positive examples. This does *not* affect any of - the numerical metrics (precision, recall, or f-measure) - - - max_fp_examples: The maximum number actual examples of false - positives to record. 
This affects the ``incorrect`` member - function and the ``guessed`` member function: ``incorrect`` - will not return more than this number of examples, and - ``guessed`` will not return more than this number of true - positive examples. This does *not* affect any of the - numerical metrics (precision, recall, or f-measure) - - - max_fn_examples: The maximum number actual examples of false - negatives to record. This affects the ``missed`` member - function and the ``correct`` member function: ``missed`` - will not return more than this number of examples, and - ``correct`` will not return more than this number of true - negative examples. This does *not* affect any of the - numerical metrics (precision, recall, or f-measure) - - - chunk_label: A regular expression indicating which chunks - should be compared. Defaults to ``'.*'`` (i.e., all chunks). - - :type _tp: list(Token) - :ivar _tp: List of true positives - :type _fp: list(Token) - :ivar _fp: List of false positives - :type _fn: list(Token) - :ivar _fn: List of false negatives - - :type _tp_num: int - :ivar _tp_num: Number of true positives - :type _fp_num: int - :ivar _fp_num: Number of false positives - :type _fn_num: int - :ivar _fn_num: Number of false negatives. - """ - - def __init__(self, **kwargs): - self._correct = set() - self._guessed = set() - self._tp = set() - self._fp = set() - self._fn = set() - self._max_tp = kwargs.get("max_tp_examples", 100) - self._max_fp = kwargs.get("max_fp_examples", 100) - self._max_fn = kwargs.get("max_fn_examples", 100) - self._chunk_label = kwargs.get("chunk_label", ".*") - self._tp_num = 0 - self._fp_num = 0 - self._fn_num = 0 - self._count = 0 - self._tags_correct = 0.0 - self._tags_total = 0.0 - - self._measuresNeedUpdate = False - - def _updateMeasures(self): - if self._measuresNeedUpdate: - self._tp = self._guessed & self._correct - self._fn = self._correct - self._guessed - self._fp = self._guessed - self._correct - self._tp_num = len(self._tp) - self._fp_num = len(self._fp) - self._fn_num = len(self._fn) - self._measuresNeedUpdate = False - - def score(self, correct, guessed): - """ - Given a correctly chunked sentence, score another chunked - version of the same sentence. - - :type correct: chunk structure - :param correct: The known-correct ("gold standard") chunked - sentence. - :type guessed: chunk structure - :param guessed: The chunked sentence to be scored. - """ - self._correct |= _chunksets(correct, self._count, self._chunk_label) - self._guessed |= _chunksets(guessed, self._count, self._chunk_label) - self._count += 1 - self._measuresNeedUpdate = True - # Keep track of per-tag accuracy (if possible) - try: - correct_tags = tree2conlltags(correct) - guessed_tags = tree2conlltags(guessed) - except ValueError: - # This exception case is for nested chunk structures, - # where tree2conlltags will fail with a ValueError: "Tree - # is too deeply nested to be printed in CoNLL format." - correct_tags = guessed_tags = () - self._tags_total += len(correct_tags) - self._tags_correct += sum( - 1 for (t, g) in zip(guessed_tags, correct_tags) if t == g - ) - - def accuracy(self): - """ - Return the overall tag-based accuracy for all text that have - been scored by this ``ChunkScore``, using the IOB (conll2000) - tag encoding. - - :rtype: float - """ - if self._tags_total == 0: - return 1 - return self._tags_correct / self._tags_total - - def precision(self): - """ - Return the overall precision for all texts that have been - scored by this ``ChunkScore``. 
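A short sketch of scoring one guessed chunking against a gold chunking with ChunkScore, assuming the upstream nltk package (the sentences are invented; tagstr2tree is the bracket-notation reader defined later in this module):

from nltk.chunk.util import ChunkScore, tagstr2tree

gold  = tagstr2tree("[ the/DT cat/NN ] sat/VBD on/IN [ the/DT mat/NN ]")
guess = tagstr2tree("[ the/DT cat/NN ] sat/VBD on/IN the/DT [ mat/NN ]")

score = ChunkScore()
score.score(gold, guess)          # call once per sentence; statistics accumulate
print(score.precision(), score.recall(), score.f_measure())
print(score.missed())             # gold chunks that the guess failed to find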
- - :rtype: float - """ - self._updateMeasures() - div = self._tp_num + self._fp_num - if div == 0: - return 0 - else: - return self._tp_num / div - - def recall(self): - """ - Return the overall recall for all texts that have been - scored by this ``ChunkScore``. - - :rtype: float - """ - self._updateMeasures() - div = self._tp_num + self._fn_num - if div == 0: - return 0 - else: - return self._tp_num / div - - def f_measure(self, alpha=0.5): - """ - Return the overall F measure for all texts that have been - scored by this ``ChunkScore``. - - :param alpha: the relative weighting of precision and recall. - Larger alpha biases the score towards the precision value, - while smaller alpha biases the score towards the recall - value. ``alpha`` should have a value in the range [0,1]. - :type alpha: float - :rtype: float - """ - self._updateMeasures() - p = self.precision() - r = self.recall() - if p == 0 or r == 0: # what if alpha is 0 or 1? - return 0 - return 1 / (alpha / p + (1 - alpha) / r) - - def missed(self): - """ - Return the chunks which were included in the - correct chunk structures, but not in the guessed chunk - structures, listed in input order. - - :rtype: list of chunks - """ - self._updateMeasures() - chunks = list(self._fn) - return [c[1] for c in chunks] # discard position information - - def incorrect(self): - """ - Return the chunks which were included in the guessed chunk structures, - but not in the correct chunk structures, listed in input order. - - :rtype: list of chunks - """ - self._updateMeasures() - chunks = list(self._fp) - return [c[1] for c in chunks] # discard position information - - def correct(self): - """ - Return the chunks which were included in the correct - chunk structures, listed in input order. - - :rtype: list of chunks - """ - chunks = list(self._correct) - return [c[1] for c in chunks] # discard position information - - def guessed(self): - """ - Return the chunks which were included in the guessed - chunk structures, listed in input order. - - :rtype: list of chunks - """ - chunks = list(self._guessed) - return [c[1] for c in chunks] # discard position information - - def __len__(self): - self._updateMeasures() - return self._tp_num + self._fn_num - - def __repr__(self): - """ - Return a concise representation of this ``ChunkScoring``. - - :rtype: str - """ - return "" - - def __str__(self): - """ - Return a verbose representation of this ``ChunkScoring``. - This representation includes the precision, recall, and - f-measure scores. For other information about the score, - use the accessor methods (e.g., ``missed()`` and ``incorrect()``). - - :rtype: str - """ - return ( - "ChunkParse score:\n" - + (f" IOB Accuracy: {self.accuracy() * 100:5.1f}%%\n") - + (f" Precision: {self.precision() * 100:5.1f}%%\n") - + (f" Recall: {self.recall() * 100:5.1f}%%\n") - + (f" F-Measure: {self.f_measure() * 100:5.1f}%%") - ) - - -# extract chunks, and assign unique id, the absolute position of -# the first word of the chunk -def _chunksets(t, count, chunk_label): - pos = 0 - chunks = [] - for child in t: - if isinstance(child, Tree): - if re.match(chunk_label, child.label()): - chunks.append(((count, pos), child.freeze())) - pos += len(child.leaves()) - else: - pos += 1 - return set(chunks) - - -def tagstr2tree( - s, chunk_label="NP", root_label="S", sep="/", source_tagset=None, target_tagset=None -): - """ - Divide a string of bracketted tagged text into - chunks and unchunked tokens, and produce a Tree. - Chunks are marked by square brackets (``[...]``). 
Words are - delimited by whitespace, and each word should have the form - ``text/tag``. Words that do not contain a slash are - assigned a ``tag`` of None. - - :param s: The string to be converted - :type s: str - :param chunk_label: The label to use for chunk nodes - :type chunk_label: str - :param root_label: The label to use for the root of the tree - :type root_label: str - :rtype: Tree - """ - - WORD_OR_BRACKET = re.compile(r"\[|\]|[^\[\]\s]+") - - stack = [Tree(root_label, [])] - for match in WORD_OR_BRACKET.finditer(s): - text = match.group() - if text[0] == "[": - if len(stack) != 1: - raise ValueError(f"Unexpected [ at char {match.start():d}") - chunk = Tree(chunk_label, []) - stack[-1].append(chunk) - stack.append(chunk) - elif text[0] == "]": - if len(stack) != 2: - raise ValueError(f"Unexpected ] at char {match.start():d}") - stack.pop() - else: - if sep is None: - stack[-1].append(text) - else: - word, tag = str2tuple(text, sep) - if source_tagset and target_tagset: - tag = map_tag(source_tagset, target_tagset, tag) - stack[-1].append((word, tag)) - - if len(stack) != 1: - raise ValueError(f"Expected ] at char {len(s):d}") - return stack[0] - - -### CONLL - -_LINE_RE = re.compile(r"(\S+)\s+(\S+)\s+([IOB])-?(\S+)?") - - -def conllstr2tree(s, chunk_types=("NP", "PP", "VP"), root_label="S"): - """ - Return a chunk structure for a single sentence - encoded in the given CONLL 2000 style string. - This function converts a CoNLL IOB string into a tree. - It uses the specified chunk types - (defaults to NP, PP and VP), and creates a tree rooted at a node - labeled S (by default). - - :param s: The CoNLL string to be converted. - :type s: str - :param chunk_types: The chunk types to be converted. - :type chunk_types: tuple - :param root_label: The node label to use for the root. - :type root_label: str - :rtype: Tree - """ - - stack = [Tree(root_label, [])] - - for lineno, line in enumerate(s.split("\n")): - if not line.strip(): - continue - - # Decode the line. - match = _LINE_RE.match(line) - if match is None: - raise ValueError(f"Error on line {lineno:d}") - (word, tag, state, chunk_type) = match.groups() - - # If it's a chunk type we don't care about, treat it as O. - if chunk_types is not None and chunk_type not in chunk_types: - state = "O" - - # For "Begin"/"Outside", finish any completed chunks - - # also do so for "Inside" which don't match the previous token. - mismatch_I = state == "I" and chunk_type != stack[-1].label() - if state in "BO" or mismatch_I: - if len(stack) == 2: - stack.pop() - - # For "Begin", start a new chunk. - if state == "B" or mismatch_I: - chunk = Tree(chunk_type, []) - stack[-1].append(chunk) - stack.append(chunk) - - # Add the new word token. - stack[-1].append((word, tag)) - - return stack[0] - - -def tree2conlltags(t): - """ - Return a list of 3-tuples containing ``(word, tag, IOB-tag)``. - Convert a tree to the CoNLL IOB tag format. - - :param t: The tree to be converted. - :type t: Tree - :rtype: list(tuple) - """ - - tags = [] - for child in t: - try: - category = child.label() - prefix = "B-" - for contents in child: - if isinstance(contents, Tree): - raise ValueError( - "Tree is too deeply nested to be printed in CoNLL format" - ) - tags.append((contents[0], contents[1], prefix + category)) - prefix = "I-" - except AttributeError: - tags.append((child[0], child[1], "O")) - return tags - - -def conlltags2tree( - sentence, chunk_types=("NP", "PP", "VP"), root_label="S", strict=False -): - """ - Convert the CoNLL IOB format to a tree. 
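A brief sketch of the IOB round trip provided by tree2conlltags and conlltags2tree, assuming the upstream nltk package (the tree is a toy example):

from nltk.tree import Tree
from nltk.chunk.util import conlltags2tree, tree2conlltags

t = Tree('S', [Tree('NP', [('the', 'DT'), ('cat', 'NN')]), ('sat', 'VBD')])

iob = tree2conlltags(t)
print(iob)   # expected: [('the', 'DT', 'B-NP'), ('cat', 'NN', 'I-NP'), ('sat', 'VBD', 'O')]

# Converting back should recover an equivalent flat chunk tree.
print(conlltags2tree(iob))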
- """ - tree = Tree(root_label, []) - for (word, postag, chunktag) in sentence: - if chunktag is None: - if strict: - raise ValueError("Bad conll tag sequence") - else: - # Treat as O - tree.append((word, postag)) - elif chunktag.startswith("B-"): - tree.append(Tree(chunktag[2:], [(word, postag)])) - elif chunktag.startswith("I-"): - if ( - len(tree) == 0 - or not isinstance(tree[-1], Tree) - or tree[-1].label() != chunktag[2:] - ): - if strict: - raise ValueError("Bad conll tag sequence") - else: - # Treat as B-* - tree.append(Tree(chunktag[2:], [(word, postag)])) - else: - tree[-1].append((word, postag)) - elif chunktag == "O": - tree.append((word, postag)) - else: - raise ValueError(f"Bad conll tag {chunktag!r}") - return tree - - -def tree2conllstr(t): - """ - Return a multiline string where each line contains a word, tag and IOB tag. - Convert a tree to the CoNLL IOB string format - - :param t: The tree to be converted. - :type t: Tree - :rtype: str - """ - lines = [" ".join(token) for token in tree2conlltags(t)] - return "\n".join(lines) - - -### IEER - -_IEER_DOC_RE = re.compile( - r"\s*" - r"(\s*(?P.+?)\s*\s*)?" - r"(\s*(?P.+?)\s*\s*)?" - r"(\s*(?P.+?)\s*\s*)?" - r"\s*" - r"(\s*(?P.+?)\s*\s*)?" - r"(?P.*?)\s*" - r"\s*\s*", - re.DOTALL, -) - -_IEER_TYPE_RE = re.compile(r']*?type="(?P\w+)"') - - -def _ieer_read_text(s, root_label): - stack = [Tree(root_label, [])] - # s will be None if there is no headline in the text - # return the empty list in place of a Tree - if s is None: - return [] - for piece_m in re.finditer(r"<[^>]+>|[^\s<]+", s): - piece = piece_m.group() - try: - if piece.startswith(".... - m = _IEER_DOC_RE.match(s) - if m: - return { - "text": _ieer_read_text(m.group("text"), root_label), - "docno": m.group("docno"), - "doctype": m.group("doctype"), - "date_time": m.group("date_time"), - #'headline': m.group('headline') - # we want to capture NEs in the headline too! - "headline": _ieer_read_text(m.group("headline"), root_label), - } - else: - return _ieer_read_text(s, root_label) - - -def demo(): - - s = "[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./." - import nltk - - t = nltk.chunk.tagstr2tree(s, chunk_label="NP") - t.pprint() - print() - - s = """ -These DT B-NP -research NN I-NP -protocols NNS I-NP -offer VBP B-VP -to TO B-PP -the DT B-NP -patient NN I-NP -not RB O -only RB O -the DT B-NP -very RB I-NP -best JJS I-NP -therapy NN I-NP -which WDT B-NP -we PRP B-NP -have VBP B-VP -established VBN I-VP -today NN B-NP -but CC B-NP -also RB I-NP -the DT B-NP -hope NN I-NP -of IN B-PP -something NN B-NP -still RB B-ADJP -better JJR I-ADJP -. . O -""" - - conll_tree = conllstr2tree(s, chunk_types=("NP", "PP")) - conll_tree.pprint() - - # Demonstrate CoNLL output - print("CoNLL output:") - print(nltk.chunk.tree2conllstr(conll_tree)) - print() - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/classify/__init__.py b/pipeline/nltk/classify/__init__.py deleted file mode 100644 index 238522fd6f7cedce69faf8bfb3384b22cc509cbb..0000000000000000000000000000000000000000 --- a/pipeline/nltk/classify/__init__.py +++ /dev/null @@ -1,101 +0,0 @@ -# Natural Language Toolkit: Classifiers -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# URL: -# For license information, see LICENSE.TXT - -""" -Classes and interfaces for labeling tokens with category labels (or -"class labels"). Typically, labels are represented with strings -(such as ``'health'`` or ``'sports'``). 
Classifiers can be used to -perform a wide range of classification tasks. For example, -classifiers can be used... - -- to classify documents by topic -- to classify ambiguous words by which word sense is intended -- to classify acoustic signals by which phoneme they represent -- to classify sentences by their author - -Features -======== -In order to decide which category label is appropriate for a given -token, classifiers examine one or more 'features' of the token. These -"features" are typically chosen by hand, and indicate which aspects -of the token are relevant to the classification decision. For -example, a document classifier might use a separate feature for each -word, recording how often that word occurred in the document. - -Featuresets -=========== -The features describing a token are encoded using a "featureset", -which is a dictionary that maps from "feature names" to "feature -values". Feature names are unique strings that indicate what aspect -of the token is encoded by the feature. Examples include -``'prevword'``, for a feature whose value is the previous word; and -``'contains-word(library)'`` for a feature that is true when a document -contains the word ``'library'``. Feature values are typically -booleans, numbers, or strings, depending on which feature they -describe. - -Featuresets are typically constructed using a "feature detector" -(also known as a "feature extractor"). A feature detector is a -function that takes a token (and sometimes information about its -context) as its input, and returns a featureset describing that token. -For example, the following feature detector converts a document -(stored as a list of words) to a featureset describing the set of -words included in the document: - - >>> # Define a feature detector function. - >>> def document_features(document): - ... return dict([('contains-word(%s)' % w, True) for w in document]) - -Feature detectors are typically applied to each token before it is fed -to the classifier: - - >>> # Classify each Gutenberg document. - >>> from nltk.corpus import gutenberg - >>> for fileid in gutenberg.fileids(): # doctest: +SKIP - ... doc = gutenberg.words(fileid) # doctest: +SKIP - ... print(fileid, classifier.classify(document_features(doc))) # doctest: +SKIP - -The parameters that a feature detector expects will vary, depending on -the task and the needs of the feature detector. For example, a -feature detector for word sense disambiguation (WSD) might take as its -input a sentence, and the index of a word that should be classified, -and return a featureset for that word. The following feature detector -for WSD includes features describing the left and right contexts of -the target word: - - >>> def wsd_features(sentence, index): - ... featureset = {} - ... for i in range(max(0, index-3), index): - ... featureset['left-context(%s)' % sentence[i]] = True - ... for i in range(index, max(index+3, len(sentence))): - ... featureset['right-context(%s)' % sentence[i]] = True - ... return featureset - -Training Classifiers -==================== -Most classifiers are built by training them on a list of hand-labeled -examples, known as the "training set". Training sets are represented -as lists of ``(featuredict, label)`` tuples. 
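As a concrete illustration of the (featureset, label) training format, a tiny sketch that trains a Naive Bayes classifier on hand-labeled examples; the feature name and data are invented for the example and assume the upstream nltk package:

from nltk.classify import NaiveBayesClassifier

# Each training item is a (featureset dict, label) pair.
train = [
    ({'last_letter': 'a'}, 'female'),
    ({'last_letter': 'k'}, 'male'),
    ({'last_letter': 'e'}, 'female'),
    ({'last_letter': 'o'}, 'male'),
]
classifier = NaiveBayesClassifier.train(train)
print(classifier.classify({'last_letter': 'a'}))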
-""" - -from nltk.classify.api import ClassifierI, MultiClassifierI -from nltk.classify.decisiontree import DecisionTreeClassifier -from nltk.classify.maxent import ( - BinaryMaxentFeatureEncoding, - ConditionalExponentialClassifier, - MaxentClassifier, - TypedMaxentFeatureEncoding, -) -from nltk.classify.megam import call_megam, config_megam -from nltk.classify.naivebayes import NaiveBayesClassifier -from nltk.classify.positivenaivebayes import PositiveNaiveBayesClassifier -from nltk.classify.rte_classify import RTEFeatureExtractor, rte_classifier, rte_features -from nltk.classify.scikitlearn import SklearnClassifier -from nltk.classify.senna import Senna -from nltk.classify.textcat import TextCat -from nltk.classify.util import accuracy, apply_features, log_likelihood -from nltk.classify.weka import WekaClassifier, config_weka diff --git a/pipeline/nltk/classify/__pycache__/__init__.cpython-39.pyc b/pipeline/nltk/classify/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index 5fa782a6ec31c9f79a1a4f0ff30b6b23dc366157..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/classify/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/classify/__pycache__/api.cpython-39.pyc b/pipeline/nltk/classify/__pycache__/api.cpython-39.pyc deleted file mode 100644 index 3483e530221a90c27c2d8cd6f15466230c3bd08f..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/classify/__pycache__/api.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/classify/__pycache__/decisiontree.cpython-39.pyc b/pipeline/nltk/classify/__pycache__/decisiontree.cpython-39.pyc deleted file mode 100644 index a530420f0a363f3e864a9ceb407f8b15cce67ed6..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/classify/__pycache__/decisiontree.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/classify/__pycache__/maxent.cpython-39.pyc b/pipeline/nltk/classify/__pycache__/maxent.cpython-39.pyc deleted file mode 100644 index 8367d6424a9b4166ef17e09193c7e840d97ede25..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/classify/__pycache__/maxent.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/classify/__pycache__/megam.cpython-39.pyc b/pipeline/nltk/classify/__pycache__/megam.cpython-39.pyc deleted file mode 100644 index 51b3ecb9b8475f828102a9fa0f24210c06957aec..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/classify/__pycache__/megam.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/classify/__pycache__/naivebayes.cpython-39.pyc b/pipeline/nltk/classify/__pycache__/naivebayes.cpython-39.pyc deleted file mode 100644 index 5fc8556a57d34eec1960fdc961582cbbef675c34..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/classify/__pycache__/naivebayes.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/classify/__pycache__/positivenaivebayes.cpython-39.pyc b/pipeline/nltk/classify/__pycache__/positivenaivebayes.cpython-39.pyc deleted file mode 100644 index 55bc9afb62b18241e22692f618f8a4a442635e9d..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/classify/__pycache__/positivenaivebayes.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/classify/__pycache__/rte_classify.cpython-39.pyc b/pipeline/nltk/classify/__pycache__/rte_classify.cpython-39.pyc deleted file mode 100644 index 9b8011353be9d36d8be46c548436bc5c32bd8c3d..0000000000000000000000000000000000000000 Binary files 
a/pipeline/nltk/classify/__pycache__/rte_classify.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/classify/__pycache__/scikitlearn.cpython-39.pyc b/pipeline/nltk/classify/__pycache__/scikitlearn.cpython-39.pyc deleted file mode 100644 index cb78d68c858c413b4bc19ba50d68f3d481be0d2d..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/classify/__pycache__/scikitlearn.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/classify/__pycache__/senna.cpython-39.pyc b/pipeline/nltk/classify/__pycache__/senna.cpython-39.pyc deleted file mode 100644 index 22bf08c1889fe44049de242e38baa4f6eb7c3b7e..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/classify/__pycache__/senna.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/classify/__pycache__/svm.cpython-39.pyc b/pipeline/nltk/classify/__pycache__/svm.cpython-39.pyc deleted file mode 100644 index 25c0e448d7a828aa7717ba35ce09afdbae560a69..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/classify/__pycache__/svm.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/classify/__pycache__/tadm.cpython-39.pyc b/pipeline/nltk/classify/__pycache__/tadm.cpython-39.pyc deleted file mode 100644 index 8765bfbb2d0fa6b54e0b222044be1433d6f2699c..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/classify/__pycache__/tadm.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/classify/__pycache__/textcat.cpython-39.pyc b/pipeline/nltk/classify/__pycache__/textcat.cpython-39.pyc deleted file mode 100644 index 94febeb6dc2413118062661969c073b86bb89276..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/classify/__pycache__/textcat.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/classify/__pycache__/util.cpython-39.pyc b/pipeline/nltk/classify/__pycache__/util.cpython-39.pyc deleted file mode 100644 index e61ffdf88b939e29414981a462309fa6931fdc92..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/classify/__pycache__/util.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/classify/__pycache__/weka.cpython-39.pyc b/pipeline/nltk/classify/__pycache__/weka.cpython-39.pyc deleted file mode 100644 index f381b0529aa4658eb2c4c7204004a3c91db4cc5f..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/classify/__pycache__/weka.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/classify/api.py b/pipeline/nltk/classify/api.py deleted file mode 100644 index f2c1e25990ecc91dec7d5dd79a69e9a6d4e9fbec..0000000000000000000000000000000000000000 --- a/pipeline/nltk/classify/api.py +++ /dev/null @@ -1,195 +0,0 @@ -# Natural Language Toolkit: Classifier Interface -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird (minor additions) -# URL: -# For license information, see LICENSE.TXT - -""" -Interfaces for labeling tokens with category labels (or "class labels"). - -``ClassifierI`` is a standard interface for "single-category -classification", in which the set of categories is known, the number -of categories is finite, and each text belongs to exactly one -category. - -``MultiClassifierI`` is a standard interface for "multi-category -classification", which is like single-category classification except -that each text belongs to zero or more categories. 
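A minimal sketch of a ClassifierI implementation, showing the two members a subclass must define; ConstantClassifier is a toy name invented purely for illustration:

from nltk.classify.api import ClassifierI

class ConstantClassifier(ClassifierI):
    """Toy classifier: ignores the features and always returns one label."""

    def __init__(self, label, all_labels):
        self._label = label
        self._labels = list(all_labels)

    def labels(self):
        return self._labels

    def classify(self, featureset):
        return self._label

clf = ConstantClassifier('spam', ['spam', 'ham'])
print(clf.classify({'contains-word(offer)': True}))   # -> 'spam'
print(clf.classify_many([{}, {}]))                    # batch method inherited from ClassifierI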
-""" -from nltk.internals import overridden - -##////////////////////////////////////////////////////// -# { Classification Interfaces -##////////////////////////////////////////////////////// - - -class ClassifierI: - """ - A processing interface for labeling tokens with a single category - label (or "class"). Labels are typically strs or - ints, but can be any immutable type. The set of labels - that the classifier chooses from must be fixed and finite. - - Subclasses must define: - - ``labels()`` - - either ``classify()`` or ``classify_many()`` (or both) - - Subclasses may define: - - either ``prob_classify()`` or ``prob_classify_many()`` (or both) - """ - - def labels(self): - """ - :return: the list of category labels used by this classifier. - :rtype: list of (immutable) - """ - raise NotImplementedError() - - def classify(self, featureset): - """ - :return: the most appropriate label for the given featureset. - :rtype: label - """ - if overridden(self.classify_many): - return self.classify_many([featureset])[0] - else: - raise NotImplementedError() - - def prob_classify(self, featureset): - """ - :return: a probability distribution over labels for the given - featureset. - :rtype: ProbDistI - """ - if overridden(self.prob_classify_many): - return self.prob_classify_many([featureset])[0] - else: - raise NotImplementedError() - - def classify_many(self, featuresets): - """ - Apply ``self.classify()`` to each element of ``featuresets``. I.e.: - - return [self.classify(fs) for fs in featuresets] - - :rtype: list(label) - """ - return [self.classify(fs) for fs in featuresets] - - def prob_classify_many(self, featuresets): - """ - Apply ``self.prob_classify()`` to each element of ``featuresets``. I.e.: - - return [self.prob_classify(fs) for fs in featuresets] - - :rtype: list(ProbDistI) - """ - return [self.prob_classify(fs) for fs in featuresets] - - -class MultiClassifierI: - """ - A processing interface for labeling tokens with zero or more - category labels (or "labels"). Labels are typically strs - or ints, but can be any immutable type. The set of labels - that the multi-classifier chooses from must be fixed and finite. - - Subclasses must define: - - ``labels()`` - - either ``classify()`` or ``classify_many()`` (or both) - - Subclasses may define: - - either ``prob_classify()`` or ``prob_classify_many()`` (or both) - """ - - def labels(self): - """ - :return: the list of category labels used by this classifier. - :rtype: list of (immutable) - """ - raise NotImplementedError() - - def classify(self, featureset): - """ - :return: the most appropriate set of labels for the given featureset. - :rtype: set(label) - """ - if overridden(self.classify_many): - return self.classify_many([featureset])[0] - else: - raise NotImplementedError() - - def prob_classify(self, featureset): - """ - :return: a probability distribution over sets of labels for the - given featureset. - :rtype: ProbDistI - """ - if overridden(self.prob_classify_many): - return self.prob_classify_many([featureset])[0] - else: - raise NotImplementedError() - - def classify_many(self, featuresets): - """ - Apply ``self.classify()`` to each element of ``featuresets``. I.e.: - - return [self.classify(fs) for fs in featuresets] - - :rtype: list(set(label)) - """ - return [self.classify(fs) for fs in featuresets] - - def prob_classify_many(self, featuresets): - """ - Apply ``self.prob_classify()`` to each element of ``featuresets``. 
I.e.: - - return [self.prob_classify(fs) for fs in featuresets] - - :rtype: list(ProbDistI) - """ - return [self.prob_classify(fs) for fs in featuresets] - - -# # [XX] IN PROGRESS: -# class SequenceClassifierI: -# """ -# A processing interface for labeling sequences of tokens with a -# single category label (or "class"). Labels are typically -# strs or ints, but can be any immutable type. The set -# of labels that the classifier chooses from must be fixed and -# finite. -# """ -# def labels(self): -# """ -# :return: the list of category labels used by this classifier. -# :rtype: list of (immutable) -# """ -# raise NotImplementedError() - -# def prob_classify(self, featureset): -# """ -# Return a probability distribution over labels for the given -# featureset. - -# If ``featureset`` is a list of featuresets, then return a -# corresponding list containing the probability distribution -# over labels for each of the given featuresets, where the -# *i*\ th element of this list is the most appropriate label for -# the *i*\ th element of ``featuresets``. -# """ -# raise NotImplementedError() - -# def classify(self, featureset): -# """ -# Return the most appropriate label for the given featureset. - -# If ``featureset`` is a list of featuresets, then return a -# corresponding list containing the most appropriate label for -# each of the given featuresets, where the *i*\ th element of -# this list is the most appropriate label for the *i*\ th element -# of ``featuresets``. -# """ -# raise NotImplementedError() diff --git a/pipeline/nltk/classify/decisiontree.py b/pipeline/nltk/classify/decisiontree.py deleted file mode 100644 index 41d04e8ad8fe15e5ac1f2319f2e28f83f5dcfe7a..0000000000000000000000000000000000000000 --- a/pipeline/nltk/classify/decisiontree.py +++ /dev/null @@ -1,349 +0,0 @@ -# Natural Language Toolkit: Decision Tree Classifiers -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# URL: -# For license information, see LICENSE.TXT - -""" -A classifier model that decides which label to assign to a token on -the basis of a tree structure, where branches correspond to conditions -on feature values, and leaves correspond to label assignments. -""" - -from collections import defaultdict - -from nltk.classify.api import ClassifierI -from nltk.probability import FreqDist, MLEProbDist, entropy - - -class DecisionTreeClassifier(ClassifierI): - def __init__(self, label, feature_name=None, decisions=None, default=None): - """ - :param label: The most likely label for tokens that reach - this node in the decision tree. If this decision tree - has no children, then this label will be assigned to - any token that reaches this decision tree. - :param feature_name: The name of the feature that this - decision tree selects for. - :param decisions: A dictionary mapping from feature values - for the feature identified by ``feature_name`` to - child decision trees. - :param default: The child that will be used if the value of - feature ``feature_name`` does not match any of the keys in - ``decisions``. This is used when constructing binary - decision trees. 
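A minimal training sketch for DecisionTreeClassifier on toy data invented for illustration; the low cutoffs are only there so the tiny training set actually gets split:

from nltk.classify import DecisionTreeClassifier

train = [
    ({'outlook': 'sunny',    'windy': False}, 'no'),
    ({'outlook': 'sunny',    'windy': True},  'no'),
    ({'outlook': 'overcast', 'windy': False}, 'yes'),
    ({'outlook': 'rainy',    'windy': False}, 'yes'),
    ({'outlook': 'rainy',    'windy': True},  'no'),
]
tree = DecisionTreeClassifier.train(train, entropy_cutoff=0.0, support_cutoff=0)
print(tree.pretty_format())
print(tree.classify({'outlook': 'overcast', 'windy': True}))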
- """ - self._label = label - self._fname = feature_name - self._decisions = decisions - self._default = default - - def labels(self): - labels = [self._label] - if self._decisions is not None: - for dt in self._decisions.values(): - labels.extend(dt.labels()) - if self._default is not None: - labels.extend(self._default.labels()) - return list(set(labels)) - - def classify(self, featureset): - # Decision leaf: - if self._fname is None: - return self._label - - # Decision tree: - fval = featureset.get(self._fname) - if fval in self._decisions: - return self._decisions[fval].classify(featureset) - elif self._default is not None: - return self._default.classify(featureset) - else: - return self._label - - def error(self, labeled_featuresets): - errors = 0 - for featureset, label in labeled_featuresets: - if self.classify(featureset) != label: - errors += 1 - return errors / len(labeled_featuresets) - - def pretty_format(self, width=70, prefix="", depth=4): - """ - Return a string containing a pretty-printed version of this - decision tree. Each line in this string corresponds to a - single decision tree node or leaf, and indentation is used to - display the structure of the decision tree. - """ - # [xx] display default!! - if self._fname is None: - n = width - len(prefix) - 15 - return "{}{} {}\n".format(prefix, "." * n, self._label) - s = "" - for i, (fval, result) in enumerate( - sorted( - self._decisions.items(), - key=lambda item: (item[0] in [None, False, True], str(item[0]).lower()), - ) - ): - hdr = f"{prefix}{self._fname}={fval}? " - n = width - 15 - len(hdr) - s += "{}{} {}\n".format(hdr, "." * (n), result._label) - if result._fname is not None and depth > 1: - s += result.pretty_format(width, prefix + " ", depth - 1) - if self._default is not None: - n = width - len(prefix) - 21 - s += "{}else: {} {}\n".format(prefix, "." * n, self._default._label) - if self._default._fname is not None and depth > 1: - s += self._default.pretty_format(width, prefix + " ", depth - 1) - return s - - def pseudocode(self, prefix="", depth=4): - """ - Return a string representation of this decision tree that - expresses the decisions it makes as a nested set of pseudocode - if statements. - """ - if self._fname is None: - return f"{prefix}return {self._label!r}\n" - s = "" - for (fval, result) in sorted( - self._decisions.items(), - key=lambda item: (item[0] in [None, False, True], str(item[0]).lower()), - ): - s += f"{prefix}if {self._fname} == {fval!r}: " - if result._fname is not None and depth > 1: - s += "\n" + result.pseudocode(prefix + " ", depth - 1) - else: - s += f"return {result._label!r}\n" - if self._default is not None: - if len(self._decisions) == 1: - s += "{}if {} != {!r}: ".format( - prefix, self._fname, list(self._decisions.keys())[0] - ) - else: - s += f"{prefix}else: " - if self._default._fname is not None and depth > 1: - s += "\n" + self._default.pseudocode(prefix + " ", depth - 1) - else: - s += f"return {self._default._label!r}\n" - return s - - def __str__(self): - return self.pretty_format() - - @staticmethod - def train( - labeled_featuresets, - entropy_cutoff=0.05, - depth_cutoff=100, - support_cutoff=10, - binary=False, - feature_values=None, - verbose=False, - ): - """ - :param binary: If true, then treat all feature/value pairs as - individual binary features, rather than using a single n-way - branch for each feature. - """ - # Collect a list of all feature names. 
- feature_names = set() - for featureset, label in labeled_featuresets: - for fname in featureset: - feature_names.add(fname) - - # Collect a list of the values each feature can take. - if feature_values is None and binary: - feature_values = defaultdict(set) - for featureset, label in labeled_featuresets: - for fname, fval in featureset.items(): - feature_values[fname].add(fval) - - # Start with a stump. - if not binary: - tree = DecisionTreeClassifier.best_stump( - feature_names, labeled_featuresets, verbose - ) - else: - tree = DecisionTreeClassifier.best_binary_stump( - feature_names, labeled_featuresets, feature_values, verbose - ) - - # Refine the stump. - tree.refine( - labeled_featuresets, - entropy_cutoff, - depth_cutoff - 1, - support_cutoff, - binary, - feature_values, - verbose, - ) - - # Return it - return tree - - @staticmethod - def leaf(labeled_featuresets): - label = FreqDist(label for (featureset, label) in labeled_featuresets).max() - return DecisionTreeClassifier(label) - - @staticmethod - def stump(feature_name, labeled_featuresets): - label = FreqDist(label for (featureset, label) in labeled_featuresets).max() - - # Find the best label for each value. - freqs = defaultdict(FreqDist) # freq(label|value) - for featureset, label in labeled_featuresets: - feature_value = featureset.get(feature_name) - freqs[feature_value][label] += 1 - - decisions = {val: DecisionTreeClassifier(freqs[val].max()) for val in freqs} - return DecisionTreeClassifier(label, feature_name, decisions) - - def refine( - self, - labeled_featuresets, - entropy_cutoff, - depth_cutoff, - support_cutoff, - binary=False, - feature_values=None, - verbose=False, - ): - if len(labeled_featuresets) <= support_cutoff: - return - if self._fname is None: - return - if depth_cutoff <= 0: - return - for fval in self._decisions: - fval_featuresets = [ - (featureset, label) - for (featureset, label) in labeled_featuresets - if featureset.get(self._fname) == fval - ] - - label_freqs = FreqDist(label for (featureset, label) in fval_featuresets) - if entropy(MLEProbDist(label_freqs)) > entropy_cutoff: - self._decisions[fval] = DecisionTreeClassifier.train( - fval_featuresets, - entropy_cutoff, - depth_cutoff, - support_cutoff, - binary, - feature_values, - verbose, - ) - if self._default is not None: - default_featuresets = [ - (featureset, label) - for (featureset, label) in labeled_featuresets - if featureset.get(self._fname) not in self._decisions - ] - label_freqs = FreqDist(label for (featureset, label) in default_featuresets) - if entropy(MLEProbDist(label_freqs)) > entropy_cutoff: - self._default = DecisionTreeClassifier.train( - default_featuresets, - entropy_cutoff, - depth_cutoff, - support_cutoff, - binary, - feature_values, - verbose, - ) - - @staticmethod - def best_stump(feature_names, labeled_featuresets, verbose=False): - best_stump = DecisionTreeClassifier.leaf(labeled_featuresets) - best_error = best_stump.error(labeled_featuresets) - for fname in feature_names: - stump = DecisionTreeClassifier.stump(fname, labeled_featuresets) - stump_error = stump.error(labeled_featuresets) - if stump_error < best_error: - best_error = stump_error - best_stump = stump - if verbose: - print( - "best stump for {:6d} toks uses {:20} err={:6.4f}".format( - len(labeled_featuresets), best_stump._fname, best_error - ) - ) - return best_stump - - @staticmethod - def binary_stump(feature_name, feature_value, labeled_featuresets): - label = FreqDist(label for (featureset, label) in labeled_featuresets).max() - - # Find the 
best label for each value. - pos_fdist = FreqDist() - neg_fdist = FreqDist() - for featureset, label in labeled_featuresets: - if featureset.get(feature_name) == feature_value: - pos_fdist[label] += 1 - else: - neg_fdist[label] += 1 - - decisions = {} - default = label - # But hopefully we have observations! - if pos_fdist.N() > 0: - decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())} - if neg_fdist.N() > 0: - default = DecisionTreeClassifier(neg_fdist.max()) - - return DecisionTreeClassifier(label, feature_name, decisions, default) - - @staticmethod - def best_binary_stump( - feature_names, labeled_featuresets, feature_values, verbose=False - ): - best_stump = DecisionTreeClassifier.leaf(labeled_featuresets) - best_error = best_stump.error(labeled_featuresets) - for fname in feature_names: - for fval in feature_values[fname]: - stump = DecisionTreeClassifier.binary_stump( - fname, fval, labeled_featuresets - ) - stump_error = stump.error(labeled_featuresets) - if stump_error < best_error: - best_error = stump_error - best_stump = stump - if verbose: - if best_stump._decisions: - descr = "{}={}".format( - best_stump._fname, list(best_stump._decisions.keys())[0] - ) - else: - descr = "(default)" - print( - "best stump for {:6d} toks uses {:20} err={:6.4f}".format( - len(labeled_featuresets), descr, best_error - ) - ) - return best_stump - - -##////////////////////////////////////////////////////// -## Demo -##////////////////////////////////////////////////////// - - -def f(x): - return DecisionTreeClassifier.train(x, binary=True, verbose=True) - - -def demo(): - from nltk.classify.util import binary_names_demo_features, names_demo - - classifier = names_demo( - f, binary_names_demo_features # DecisionTreeClassifier.train, - ) - print(classifier.pretty_format(depth=7)) - print(classifier.pseudocode(depth=7)) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/classify/maxent.py b/pipeline/nltk/classify/maxent.py deleted file mode 100644 index e9f66503756a0768ece53179cf3ff8f231c2aab1..0000000000000000000000000000000000000000 --- a/pipeline/nltk/classify/maxent.py +++ /dev/null @@ -1,1569 +0,0 @@ -# Natural Language Toolkit: Maximum Entropy Classifiers -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Dmitry Chichkov (TypedMaxentFeatureEncoding) -# URL: -# For license information, see LICENSE.TXT - -""" -A classifier model based on maximum entropy modeling framework. This -framework considers all of the probability distributions that are -empirically consistent with the training data; and chooses the -distribution with the highest entropy. A probability distribution is -"empirically consistent" with a set of training data if its estimated -frequency with which a class and a feature vector value co-occur is -equal to the actual frequency in the data. - -Terminology: 'feature' -====================== -The term *feature* is usually used to refer to some property of an -unlabeled token. For example, when performing word sense -disambiguation, we might define a ``'prevword'`` feature whose value is -the word preceding the target word. However, in the context of -maxent modeling, the term *feature* is typically used to refer to a -property of a "labeled" token. In order to prevent confusion, we -will introduce two distinct terms to disambiguate these two different -concepts: - - - An "input-feature" is a property of an unlabeled token. - - A "joint-feature" is a property of a labeled token. 
- -In the rest of the ``nltk.classify`` module, the term "features" is -used to refer to what we will call "input-features" in this module. - -In literature that describes and discusses maximum entropy models, -input-features are typically called "contexts", and joint-features -are simply referred to as "features". - -Converting Input-Features to Joint-Features -------------------------------------------- -In maximum entropy models, joint-features are required to have numeric -values. Typically, each input-feature ``input_feat`` is mapped to a -set of joint-features of the form: - -| joint_feat(token, label) = { 1 if input_feat(token) == feat_val -| { and label == some_label -| { -| { 0 otherwise - -For all values of ``feat_val`` and ``some_label``. This mapping is -performed by classes that implement the ``MaxentFeatureEncodingI`` -interface. -""" -try: - import numpy -except ImportError: - pass - -import os -import tempfile -from collections import defaultdict - -from nltk.classify.api import ClassifierI -from nltk.classify.megam import call_megam, parse_megam_weights, write_megam_file -from nltk.classify.tadm import call_tadm, parse_tadm_weights, write_tadm_file -from nltk.classify.util import CutoffChecker, accuracy, log_likelihood -from nltk.data import gzip_open_unicode -from nltk.probability import DictionaryProbDist -from nltk.util import OrderedDict - -__docformat__ = "epytext en" - -###################################################################### -# { Classifier Model -###################################################################### - - -class MaxentClassifier(ClassifierI): - """ - A maximum entropy classifier (also known as a "conditional - exponential classifier"). This classifier is parameterized by a - set of "weights", which are used to combine the joint-features - that are generated from a featureset by an "encoding". In - particular, the encoding maps each ``(featureset, label)`` pair to - a vector. The probability of each label is then computed using - the following equation:: - - dotprod(weights, encode(fs,label)) - prob(fs|label) = --------------------------------------------------- - sum(dotprod(weights, encode(fs,l)) for l in labels) - - Where ``dotprod`` is the dot product:: - - dotprod(a,b) = sum(x*y for (x,y) in zip(a,b)) - """ - - def __init__(self, encoding, weights, logarithmic=True): - """ - Construct a new maxent classifier model. Typically, new - classifier models are created using the ``train()`` method. - - :type encoding: MaxentFeatureEncodingI - :param encoding: An encoding that is used to convert the - featuresets that are given to the ``classify`` method into - joint-feature vectors, which are used by the maxent - classifier model. - - :type weights: list of float - :param weights: The feature weight vector for this classifier. - - :type logarithmic: bool - :param logarithmic: If false, then use non-logarithmic weights. - """ - self._encoding = encoding - self._weights = weights - self._logarithmic = logarithmic - # self._logarithmic = False - assert encoding.length() == len(weights) - - def labels(self): - return self._encoding.labels() - - def set_weights(self, new_weights): - """ - Set the feature weight vector for this classifier. - :param new_weights: The new feature weight vector. - :type new_weights: list of float - """ - self._weights = new_weights - assert self._encoding.length() == len(new_weights) - - def weights(self): - """ - :return: The feature weight vector for this classifier. 
- :rtype: list of float - """ - return self._weights - - def classify(self, featureset): - return self.prob_classify(featureset).max() - - def prob_classify(self, featureset): - prob_dict = {} - for label in self._encoding.labels(): - feature_vector = self._encoding.encode(featureset, label) - - if self._logarithmic: - total = 0.0 - for (f_id, f_val) in feature_vector: - total += self._weights[f_id] * f_val - prob_dict[label] = total - - else: - prod = 1.0 - for (f_id, f_val) in feature_vector: - prod *= self._weights[f_id] ** f_val - prob_dict[label] = prod - - # Normalize the dictionary to give a probability distribution - return DictionaryProbDist(prob_dict, log=self._logarithmic, normalize=True) - - def explain(self, featureset, columns=4): - """ - Print a table showing the effect of each of the features in - the given feature set, and how they combine to determine the - probabilities of each label for that featureset. - """ - descr_width = 50 - TEMPLATE = " %-" + str(descr_width - 2) + "s%s%8.3f" - - pdist = self.prob_classify(featureset) - labels = sorted(pdist.samples(), key=pdist.prob, reverse=True) - labels = labels[:columns] - print( - " Feature".ljust(descr_width) - + "".join("%8s" % (("%s" % l)[:7]) for l in labels) - ) - print(" " + "-" * (descr_width - 2 + 8 * len(labels))) - sums = defaultdict(int) - for i, label in enumerate(labels): - feature_vector = self._encoding.encode(featureset, label) - feature_vector.sort( - key=lambda fid__: abs(self._weights[fid__[0]]), reverse=True - ) - for (f_id, f_val) in feature_vector: - if self._logarithmic: - score = self._weights[f_id] * f_val - else: - score = self._weights[f_id] ** f_val - descr = self._encoding.describe(f_id) - descr = descr.split(" and label is ")[0] # hack - descr += " (%s)" % f_val # hack - if len(descr) > 47: - descr = descr[:44] + "..." - print(TEMPLATE % (descr, i * 8 * " ", score)) - sums[label] += score - print(" " + "-" * (descr_width - 1 + 8 * len(labels))) - print( - " TOTAL:".ljust(descr_width) + "".join("%8.3f" % sums[l] for l in labels) - ) - print( - " PROBS:".ljust(descr_width) - + "".join("%8.3f" % pdist.prob(l) for l in labels) - ) - - def most_informative_features(self, n=10): - """ - Generates the ranked list of informative features from most to least. - """ - if hasattr(self, "_most_informative_features"): - return self._most_informative_features[:n] - else: - self._most_informative_features = sorted( - list(range(len(self._weights))), - key=lambda fid: abs(self._weights[fid]), - reverse=True, - ) - return self._most_informative_features[:n] - - def show_most_informative_features(self, n=10, show="all"): - """ - :param show: all, neg, or pos (for negative-only or positive-only) - :type show: str - :param n: The no. of top features - :type n: int - """ - # Use None the full list of ranked features. - fids = self.most_informative_features(None) - if show == "pos": - fids = [fid for fid in fids if self._weights[fid] > 0] - elif show == "neg": - fids = [fid for fid in fids if self._weights[fid] < 0] - for fid in fids[:n]: - print(f"{self._weights[fid]:8.3f} {self._encoding.describe(fid)}") - - def __repr__(self): - return "" % ( - len(self._encoding.labels()), - self._encoding.length(), - ) - - #: A list of the algorithm names that are accepted for the - #: ``train()`` method's ``algorithm`` parameter. 
- ALGORITHMS = ["GIS", "IIS", "MEGAM", "TADM"] - - @classmethod - def train( - cls, - train_toks, - algorithm=None, - trace=3, - encoding=None, - labels=None, - gaussian_prior_sigma=0, - **cutoffs, - ): - """ - Train a new maxent classifier based on the given corpus of - training samples. This classifier will have its weights - chosen to maximize entropy while remaining empirically - consistent with the training corpus. - - :rtype: MaxentClassifier - :return: The new maxent classifier - - :type train_toks: list - :param train_toks: Training data, represented as a list of - pairs, the first member of which is a featureset, - and the second of which is a classification label. - - :type algorithm: str - :param algorithm: A case-insensitive string, specifying which - algorithm should be used to train the classifier. The - following algorithms are currently available. - - - Iterative Scaling Methods: Generalized Iterative Scaling (``'GIS'``), - Improved Iterative Scaling (``'IIS'``) - - External Libraries (requiring megam): - LM-BFGS algorithm, with training performed by Megam (``'megam'``) - - The default algorithm is ``'IIS'``. - - :type trace: int - :param trace: The level of diagnostic tracing output to produce. - Higher values produce more verbose output. - :type encoding: MaxentFeatureEncodingI - :param encoding: A feature encoding, used to convert featuresets - into feature vectors. If none is specified, then a - ``BinaryMaxentFeatureEncoding`` will be built based on the - features that are attested in the training corpus. - :type labels: list(str) - :param labels: The set of possible labels. If none is given, then - the set of all labels attested in the training data will be - used instead. - :param gaussian_prior_sigma: The sigma value for a gaussian - prior on model weights. Currently, this is supported by - ``megam``. For other algorithms, its value is ignored. - :param cutoffs: Arguments specifying various conditions under - which the training should be halted. (Some of the cutoff - conditions are not supported by some algorithms.) - - - ``max_iter=v``: Terminate after ``v`` iterations. - - ``min_ll=v``: Terminate after the negative average - log-likelihood drops under ``v``. - - ``min_lldelta=v``: Terminate if a single iteration improves - log likelihood by less than ``v``. - """ - if algorithm is None: - algorithm = "iis" - for key in cutoffs: - if key not in ( - "max_iter", - "min_ll", - "min_lldelta", - "max_acc", - "min_accdelta", - "count_cutoff", - "norm", - "explicit", - "bernoulli", - ): - raise TypeError("Unexpected keyword arg %r" % key) - algorithm = algorithm.lower() - if algorithm == "iis": - return train_maxent_classifier_with_iis( - train_toks, trace, encoding, labels, **cutoffs - ) - elif algorithm == "gis": - return train_maxent_classifier_with_gis( - train_toks, trace, encoding, labels, **cutoffs - ) - elif algorithm == "megam": - return train_maxent_classifier_with_megam( - train_toks, trace, encoding, labels, gaussian_prior_sigma, **cutoffs - ) - elif algorithm == "tadm": - kwargs = cutoffs - kwargs["trace"] = trace - kwargs["encoding"] = encoding - kwargs["labels"] = labels - kwargs["gaussian_prior_sigma"] = gaussian_prior_sigma - return TadmMaxentClassifier.train(train_toks, **kwargs) - else: - raise ValueError("Unknown algorithm %s" % algorithm) - - -#: Alias for MaxentClassifier. 
-ConditionalExponentialClassifier = MaxentClassifier - - -###################################################################### -# { Feature Encodings -###################################################################### - - -class MaxentFeatureEncodingI: - """ - A mapping that converts a set of input-feature values to a vector - of joint-feature values, given a label. This conversion is - necessary to translate featuresets into a format that can be used - by maximum entropy models. - - The set of joint-features used by a given encoding is fixed, and - each index in the generated joint-feature vectors corresponds to a - single joint-feature. The length of the generated joint-feature - vectors is therefore constant (for a given encoding). - - Because the joint-feature vectors generated by - ``MaxentFeatureEncodingI`` are typically very sparse, they are - represented as a list of ``(index, value)`` tuples, specifying the - value of each non-zero joint-feature. - - Feature encodings are generally created using the ``train()`` - method, which generates an appropriate encoding based on the - input-feature values and labels that are present in a given - corpus. - """ - - def encode(self, featureset, label): - """ - Given a (featureset, label) pair, return the corresponding - vector of joint-feature values. This vector is represented as - a list of ``(index, value)`` tuples, specifying the value of - each non-zero joint-feature. - - :type featureset: dict - :rtype: list(tuple(int, int)) - """ - raise NotImplementedError() - - def length(self): - """ - :return: The size of the fixed-length joint-feature vectors - that are generated by this encoding. - :rtype: int - """ - raise NotImplementedError() - - def labels(self): - """ - :return: A list of the \"known labels\" -- i.e., all labels - ``l`` such that ``self.encode(fs,l)`` can be a nonzero - joint-feature vector for some value of ``fs``. - :rtype: list - """ - raise NotImplementedError() - - def describe(self, fid): - """ - :return: A string describing the value of the joint-feature - whose index in the generated feature vectors is ``fid``. - :rtype: str - """ - raise NotImplementedError() - - def train(cls, train_toks): - """ - Construct and return new feature encoding, based on a given - training corpus ``train_toks``. - - :type train_toks: list(tuple(dict, str)) - :param train_toks: Training data, represented as a list of - pairs, the first member of which is a feature dictionary, - and the second of which is a classification label. - """ - raise NotImplementedError() - - -class FunctionBackedMaxentFeatureEncoding(MaxentFeatureEncodingI): - """ - A feature encoding that calls a user-supplied function to map a - given featureset/label pair to a sparse joint-feature vector. - """ - - def __init__(self, func, length, labels): - """ - Construct a new feature encoding based on the given function. - - :type func: (callable) - :param func: A function that takes two arguments, a featureset - and a label, and returns the sparse joint feature vector - that encodes them:: - - func(featureset, label) -> feature_vector - - This sparse joint feature vector (``feature_vector``) is a - list of ``(index,value)`` tuples. - - :type length: int - :param length: The size of the fixed-length joint-feature - vectors that are generated by this encoding. 
- - :type labels: list - :param labels: A list of the \"known labels\" for this - encoding -- i.e., all labels ``l`` such that - ``self.encode(fs,l)`` can be a nonzero joint-feature vector - for some value of ``fs``. - """ - self._length = length - self._func = func - self._labels = labels - - def encode(self, featureset, label): - return self._func(featureset, label) - - def length(self): - return self._length - - def labels(self): - return self._labels - - def describe(self, fid): - return "no description available" - - -class BinaryMaxentFeatureEncoding(MaxentFeatureEncodingI): - """ - A feature encoding that generates vectors containing a binary - joint-features of the form: - - | joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label) - | { - | { 0 otherwise - - Where ``fname`` is the name of an input-feature, ``fval`` is a value - for that input-feature, and ``label`` is a label. - - Typically, these features are constructed based on a training - corpus, using the ``train()`` method. This method will create one - feature for each combination of ``fname``, ``fval``, and ``label`` - that occurs at least once in the training corpus. - - The ``unseen_features`` parameter can be used to add "unseen-value - features", which are used whenever an input feature has a value - that was not encountered in the training corpus. These features - have the form: - - | joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname]) - | { and l == label - | { - | { 0 otherwise - - Where ``is_unseen(fname, fval)`` is true if the encoding does not - contain any joint features that are true when ``fs[fname]==fval``. - - The ``alwayson_features`` parameter can be used to add "always-on - features", which have the form:: - - | joint_feat(fs, l) = { 1 if (l == label) - | { - | { 0 otherwise - - These always-on features allow the maxent model to directly model - the prior probabilities of each label. - """ - - def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False): - """ - :param labels: A list of the \"known labels\" for this encoding. - - :param mapping: A dictionary mapping from ``(fname,fval,label)`` - tuples to corresponding joint-feature indexes. These - indexes must be the set of integers from 0...len(mapping). - If ``mapping[fname,fval,label]=id``, then - ``self.encode(..., fname:fval, ..., label)[id]`` is 1; - otherwise, it is 0. - - :param unseen_features: If true, then include unseen value - features in the generated joint-feature vectors. - - :param alwayson_features: If true, then include always-on - features in the generated joint-feature vectors. - """ - if set(mapping.values()) != set(range(len(mapping))): - raise ValueError( - "Mapping values must be exactly the " - "set of integers from 0...len(mapping)" - ) - - self._labels = list(labels) - """A list of attested labels.""" - - self._mapping = mapping - """dict mapping from (fname,fval,label) -> fid""" - - self._length = len(mapping) - """The length of generated joint feature vectors.""" - - self._alwayson = None - """dict mapping from label -> fid""" - - self._unseen = None - """dict mapping from fname -> fid""" - - if alwayson_features: - self._alwayson = { - label: i + self._length for (i, label) in enumerate(labels) - } - self._length += len(self._alwayson) - - if unseen_features: - fnames = {fname for (fname, fval, label) in mapping} - self._unseen = {fname: i + self._length for (i, fname) in enumerate(fnames)} - self._length += len(fnames) - - def encode(self, featureset, label): - # Inherit docs. 
- encoding = [] - - # Convert input-features to joint-features: - for fname, fval in featureset.items(): - # Known feature name & value: - if (fname, fval, label) in self._mapping: - encoding.append((self._mapping[fname, fval, label], 1)) - - # Otherwise, we might want to fire an "unseen-value feature". - elif self._unseen: - # Have we seen this fname/fval combination with any label? - for label2 in self._labels: - if (fname, fval, label2) in self._mapping: - break # we've seen this fname/fval combo - # We haven't -- fire the unseen-value feature - else: - if fname in self._unseen: - encoding.append((self._unseen[fname], 1)) - - # Add always-on features: - if self._alwayson and label in self._alwayson: - encoding.append((self._alwayson[label], 1)) - - return encoding - - def describe(self, f_id): - # Inherit docs. - if not isinstance(f_id, int): - raise TypeError("describe() expected an int") - try: - self._inv_mapping - except AttributeError: - self._inv_mapping = [-1] * len(self._mapping) - for (info, i) in self._mapping.items(): - self._inv_mapping[i] = info - - if f_id < len(self._mapping): - (fname, fval, label) = self._inv_mapping[f_id] - return f"{fname}=={fval!r} and label is {label!r}" - elif self._alwayson and f_id in self._alwayson.values(): - for (label, f_id2) in self._alwayson.items(): - if f_id == f_id2: - return "label is %r" % label - elif self._unseen and f_id in self._unseen.values(): - for (fname, f_id2) in self._unseen.items(): - if f_id == f_id2: - return "%s is unseen" % fname - else: - raise ValueError("Bad feature id") - - def labels(self): - # Inherit docs. - return self._labels - - def length(self): - # Inherit docs. - return self._length - - @classmethod - def train(cls, train_toks, count_cutoff=0, labels=None, **options): - """ - Construct and return new feature encoding, based on a given - training corpus ``train_toks``. See the class description - ``BinaryMaxentFeatureEncoding`` for a description of the - joint-features that will be included in this encoding. - - :type train_toks: list(tuple(dict, str)) - :param train_toks: Training data, represented as a list of - pairs, the first member of which is a feature dictionary, - and the second of which is a classification label. - - :type count_cutoff: int - :param count_cutoff: A cutoff value that is used to discard - rare joint-features. If a joint-feature's value is 1 - fewer than ``count_cutoff`` times in the training corpus, - then that joint-feature is not included in the generated - encoding. - - :type labels: list - :param labels: A list of labels that should be used by the - classifier. If not specified, then the set of labels - attested in ``train_toks`` will be used. - - :param options: Extra parameters for the constructor, such as - ``unseen_features`` and ``alwayson_features``. - """ - mapping = {} # maps (fname, fval, label) -> fid - seen_labels = set() # The set of labels we've encountered - count = defaultdict(int) # maps (fname, fval) -> count - - for (tok, label) in train_toks: - if labels and label not in labels: - raise ValueError("Unexpected label %s" % label) - seen_labels.add(label) - - # Record each of the features. - for (fname, fval) in tok.items(): - - # If a count cutoff is given, then only add a joint - # feature once the corresponding (fname, fval, label) - # tuple exceeds that cutoff. 
- count[fname, fval] += 1 - if count[fname, fval] >= count_cutoff: - if (fname, fval, label) not in mapping: - mapping[fname, fval, label] = len(mapping) - - if labels is None: - labels = seen_labels - return cls(labels, mapping, **options) - - -class GISEncoding(BinaryMaxentFeatureEncoding): - """ - A binary feature encoding which adds one new joint-feature to the - joint-features defined by ``BinaryMaxentFeatureEncoding``: a - correction feature, whose value is chosen to ensure that the - sparse vector always sums to a constant non-negative number. This - new feature is used to ensure two preconditions for the GIS - training algorithm: - - - At least one feature vector index must be nonzero for every - token. - - The feature vector must sum to a constant non-negative number - for every token. - """ - - def __init__( - self, labels, mapping, unseen_features=False, alwayson_features=False, C=None - ): - """ - :param C: The correction constant. The value of the correction - feature is based on this value. In particular, its value is - ``C - sum([v for (f,v) in encoding])``. - :seealso: ``BinaryMaxentFeatureEncoding.__init__`` - """ - BinaryMaxentFeatureEncoding.__init__( - self, labels, mapping, unseen_features, alwayson_features - ) - if C is None: - C = len({fname for (fname, fval, label) in mapping}) + 1 - self._C = C - - @property - def C(self): - """The non-negative constant that all encoded feature vectors - will sum to.""" - return self._C - - def encode(self, featureset, label): - # Get the basic encoding. - encoding = BinaryMaxentFeatureEncoding.encode(self, featureset, label) - base_length = BinaryMaxentFeatureEncoding.length(self) - - # Add a correction feature. - total = sum(v for (f, v) in encoding) - if total >= self._C: - raise ValueError("Correction feature is not high enough!") - encoding.append((base_length, self._C - total)) - - # Return the result - return encoding - - def length(self): - return BinaryMaxentFeatureEncoding.length(self) + 1 - - def describe(self, f_id): - if f_id == BinaryMaxentFeatureEncoding.length(self): - return "Correction feature (%s)" % self._C - else: - return BinaryMaxentFeatureEncoding.describe(self, f_id) - - -class TadmEventMaxentFeatureEncoding(BinaryMaxentFeatureEncoding): - def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False): - self._mapping = OrderedDict(mapping) - self._label_mapping = OrderedDict() - BinaryMaxentFeatureEncoding.__init__( - self, labels, self._mapping, unseen_features, alwayson_features - ) - - def encode(self, featureset, label): - encoding = [] - for feature, value in featureset.items(): - if (feature, label) not in self._mapping: - self._mapping[(feature, label)] = len(self._mapping) - if value not in self._label_mapping: - if not isinstance(value, int): - self._label_mapping[value] = len(self._label_mapping) - else: - self._label_mapping[value] = value - encoding.append( - (self._mapping[(feature, label)], self._label_mapping[value]) - ) - return encoding - - def labels(self): - return self._labels - - def describe(self, fid): - for (feature, label) in self._mapping: - if self._mapping[(feature, label)] == fid: - return (feature, label) - - def length(self): - return len(self._mapping) - - @classmethod - def train(cls, train_toks, count_cutoff=0, labels=None, **options): - mapping = OrderedDict() - if not labels: - labels = [] - - # This gets read twice, so compute the values in case it's lazy. 
- train_toks = list(train_toks) - - for (featureset, label) in train_toks: - if label not in labels: - labels.append(label) - - for (featureset, label) in train_toks: - for label in labels: - for feature in featureset: - if (feature, label) not in mapping: - mapping[(feature, label)] = len(mapping) - - return cls(labels, mapping, **options) - - -class TypedMaxentFeatureEncoding(MaxentFeatureEncodingI): - """ - A feature encoding that generates vectors containing integer, - float and binary joint-features of the form: - - Binary (for string and boolean features): - - | joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label) - | { - | { 0 otherwise - - Value (for integer and float features): - - | joint_feat(fs, l) = { fval if (fs[fname] == type(fval)) - | { and (l == label) - | { - | { not encoded otherwise - - Where ``fname`` is the name of an input-feature, ``fval`` is a value - for that input-feature, and ``label`` is a label. - - Typically, these features are constructed based on a training - corpus, using the ``train()`` method. - - For string and boolean features [type(fval) not in (int, float)] - this method will create one feature for each combination of - ``fname``, ``fval``, and ``label`` that occurs at least once in the - training corpus. - - For integer and float features [type(fval) in (int, float)] this - method will create one feature for each combination of ``fname`` - and ``label`` that occurs at least once in the training corpus. - - For binary features the ``unseen_features`` parameter can be used - to add "unseen-value features", which are used whenever an input - feature has a value that was not encountered in the training - corpus. These features have the form: - - | joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname]) - | { and l == label - | { - | { 0 otherwise - - Where ``is_unseen(fname, fval)`` is true if the encoding does not - contain any joint features that are true when ``fs[fname]==fval``. - - The ``alwayson_features`` parameter can be used to add "always-on - features", which have the form: - - | joint_feat(fs, l) = { 1 if (l == label) - | { - | { 0 otherwise - - These always-on features allow the maxent model to directly model - the prior probabilities of each label. - """ - - def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False): - """ - :param labels: A list of the \"known labels\" for this encoding. - - :param mapping: A dictionary mapping from ``(fname,fval,label)`` - tuples to corresponding joint-feature indexes. These - indexes must be the set of integers from 0...len(mapping). - If ``mapping[fname,fval,label]=id``, then - ``self.encode({..., fname:fval, ...``, label)[id]} is 1; - otherwise, it is 0. - - :param unseen_features: If true, then include unseen value - features in the generated joint-feature vectors. - - :param alwayson_features: If true, then include always-on - features in the generated joint-feature vectors. 
- """ - if set(mapping.values()) != set(range(len(mapping))): - raise ValueError( - "Mapping values must be exactly the " - "set of integers from 0...len(mapping)" - ) - - self._labels = list(labels) - """A list of attested labels.""" - - self._mapping = mapping - """dict mapping from (fname,fval,label) -> fid""" - - self._length = len(mapping) - """The length of generated joint feature vectors.""" - - self._alwayson = None - """dict mapping from label -> fid""" - - self._unseen = None - """dict mapping from fname -> fid""" - - if alwayson_features: - self._alwayson = { - label: i + self._length for (i, label) in enumerate(labels) - } - self._length += len(self._alwayson) - - if unseen_features: - fnames = {fname for (fname, fval, label) in mapping} - self._unseen = {fname: i + self._length for (i, fname) in enumerate(fnames)} - self._length += len(fnames) - - def encode(self, featureset, label): - # Inherit docs. - encoding = [] - - # Convert input-features to joint-features: - for fname, fval in featureset.items(): - if isinstance(fval, (int, float)): - # Known feature name & value: - if (fname, type(fval), label) in self._mapping: - encoding.append((self._mapping[fname, type(fval), label], fval)) - else: - # Known feature name & value: - if (fname, fval, label) in self._mapping: - encoding.append((self._mapping[fname, fval, label], 1)) - - # Otherwise, we might want to fire an "unseen-value feature". - elif self._unseen: - # Have we seen this fname/fval combination with any label? - for label2 in self._labels: - if (fname, fval, label2) in self._mapping: - break # we've seen this fname/fval combo - # We haven't -- fire the unseen-value feature - else: - if fname in self._unseen: - encoding.append((self._unseen[fname], 1)) - - # Add always-on features: - if self._alwayson and label in self._alwayson: - encoding.append((self._alwayson[label], 1)) - - return encoding - - def describe(self, f_id): - # Inherit docs. - if not isinstance(f_id, int): - raise TypeError("describe() expected an int") - try: - self._inv_mapping - except AttributeError: - self._inv_mapping = [-1] * len(self._mapping) - for (info, i) in self._mapping.items(): - self._inv_mapping[i] = info - - if f_id < len(self._mapping): - (fname, fval, label) = self._inv_mapping[f_id] - return f"{fname}=={fval!r} and label is {label!r}" - elif self._alwayson and f_id in self._alwayson.values(): - for (label, f_id2) in self._alwayson.items(): - if f_id == f_id2: - return "label is %r" % label - elif self._unseen and f_id in self._unseen.values(): - for (fname, f_id2) in self._unseen.items(): - if f_id == f_id2: - return "%s is unseen" % fname - else: - raise ValueError("Bad feature id") - - def labels(self): - # Inherit docs. - return self._labels - - def length(self): - # Inherit docs. - return self._length - - @classmethod - def train(cls, train_toks, count_cutoff=0, labels=None, **options): - """ - Construct and return new feature encoding, based on a given - training corpus ``train_toks``. See the class description - ``TypedMaxentFeatureEncoding`` for a description of the - joint-features that will be included in this encoding. - - Note: recognized feature values types are (int, float), over - types are interpreted as regular binary features. - - :type train_toks: list(tuple(dict, str)) - :param train_toks: Training data, represented as a list of - pairs, the first member of which is a feature dictionary, - and the second of which is a classification label. 
- - :type count_cutoff: int - :param count_cutoff: A cutoff value that is used to discard - rare joint-features. If a joint-feature's value is 1 - fewer than ``count_cutoff`` times in the training corpus, - then that joint-feature is not included in the generated - encoding. - - :type labels: list - :param labels: A list of labels that should be used by the - classifier. If not specified, then the set of labels - attested in ``train_toks`` will be used. - - :param options: Extra parameters for the constructor, such as - ``unseen_features`` and ``alwayson_features``. - """ - mapping = {} # maps (fname, fval, label) -> fid - seen_labels = set() # The set of labels we've encountered - count = defaultdict(int) # maps (fname, fval) -> count - - for (tok, label) in train_toks: - if labels and label not in labels: - raise ValueError("Unexpected label %s" % label) - seen_labels.add(label) - - # Record each of the features. - for (fname, fval) in tok.items(): - if type(fval) in (int, float): - fval = type(fval) - # If a count cutoff is given, then only add a joint - # feature once the corresponding (fname, fval, label) - # tuple exceeds that cutoff. - count[fname, fval] += 1 - if count[fname, fval] >= count_cutoff: - if (fname, fval, label) not in mapping: - mapping[fname, fval, label] = len(mapping) - - if labels is None: - labels = seen_labels - return cls(labels, mapping, **options) - - -###################################################################### -# { Classifier Trainer: Generalized Iterative Scaling -###################################################################### - - -def train_maxent_classifier_with_gis( - train_toks, trace=3, encoding=None, labels=None, **cutoffs -): - """ - Train a new ``ConditionalExponentialClassifier``, using the given - training samples, using the Generalized Iterative Scaling - algorithm. This ``ConditionalExponentialClassifier`` will encode - the model that maximizes entropy from all the models that are - empirically consistent with ``train_toks``. - - :see: ``train_maxent_classifier()`` for parameter descriptions. - """ - cutoffs.setdefault("max_iter", 100) - cutoffchecker = CutoffChecker(cutoffs) - - # Construct an encoding from the training data. - if encoding is None: - encoding = GISEncoding.train(train_toks, labels=labels) - - if not hasattr(encoding, "C"): - raise TypeError( - "The GIS algorithm requires an encoding that " - "defines C (e.g., GISEncoding)." - ) - - # Cinv is the inverse of the sum of each joint feature vector. - # This controls the learning rate: higher Cinv (or lower C) gives - # faster learning. - Cinv = 1.0 / encoding.C - - # Count how many times each feature occurs in the training data. - empirical_fcount = calculate_empirical_fcount(train_toks, encoding) - - # Check for any features that are not attested in train_toks. - unattested = set(numpy.nonzero(empirical_fcount == 0)[0]) - - # Build the classifier. Start with weight=0 for each attested - # feature, and weight=-infinity for each unattested feature. - weights = numpy.zeros(len(empirical_fcount), "d") - for fid in unattested: - weights[fid] = numpy.NINF - classifier = ConditionalExponentialClassifier(encoding, weights) - - # Take the log of the empirical fcount. - log_empirical_fcount = numpy.log2(empirical_fcount) - del empirical_fcount - - if trace > 0: - print(" ==> Training (%d iterations)" % cutoffs["max_iter"]) - if trace > 2: - print() - print(" Iteration Log Likelihood Accuracy") - print(" ---------------------------------------") - - # Train the classifier. 
- try: - while True: - if trace > 2: - ll = cutoffchecker.ll or log_likelihood(classifier, train_toks) - acc = cutoffchecker.acc or accuracy(classifier, train_toks) - iternum = cutoffchecker.iter - print(" %9d %14.5f %9.3f" % (iternum, ll, acc)) - - # Use the model to estimate the number of times each - # feature should occur in the training data. - estimated_fcount = calculate_estimated_fcount( - classifier, train_toks, encoding - ) - - # Take the log of estimated fcount (avoid taking log(0).) - for fid in unattested: - estimated_fcount[fid] += 1 - log_estimated_fcount = numpy.log2(estimated_fcount) - del estimated_fcount - - # Update the classifier weights - weights = classifier.weights() - weights += (log_empirical_fcount - log_estimated_fcount) * Cinv - classifier.set_weights(weights) - - # Check the log-likelihood & accuracy cutoffs. - if cutoffchecker.check(classifier, train_toks): - break - - except KeyboardInterrupt: - print(" Training stopped: keyboard interrupt") - except: - raise - - if trace > 2: - ll = log_likelihood(classifier, train_toks) - acc = accuracy(classifier, train_toks) - print(f" Final {ll:14.5f} {acc:9.3f}") - - # Return the classifier. - return classifier - - -def calculate_empirical_fcount(train_toks, encoding): - fcount = numpy.zeros(encoding.length(), "d") - - for tok, label in train_toks: - for (index, val) in encoding.encode(tok, label): - fcount[index] += val - - return fcount - - -def calculate_estimated_fcount(classifier, train_toks, encoding): - fcount = numpy.zeros(encoding.length(), "d") - - for tok, label in train_toks: - pdist = classifier.prob_classify(tok) - for label in pdist.samples(): - prob = pdist.prob(label) - for (fid, fval) in encoding.encode(tok, label): - fcount[fid] += prob * fval - - return fcount - - -###################################################################### -# { Classifier Trainer: Improved Iterative Scaling -###################################################################### - - -def train_maxent_classifier_with_iis( - train_toks, trace=3, encoding=None, labels=None, **cutoffs -): - """ - Train a new ``ConditionalExponentialClassifier``, using the given - training samples, using the Improved Iterative Scaling algorithm. - This ``ConditionalExponentialClassifier`` will encode the model - that maximizes entropy from all the models that are empirically - consistent with ``train_toks``. - - :see: ``train_maxent_classifier()`` for parameter descriptions. - """ - cutoffs.setdefault("max_iter", 100) - cutoffchecker = CutoffChecker(cutoffs) - - # Construct an encoding from the training data. - if encoding is None: - encoding = BinaryMaxentFeatureEncoding.train(train_toks, labels=labels) - - # Count how many times each feature occurs in the training data. - empirical_ffreq = calculate_empirical_fcount(train_toks, encoding) / len(train_toks) - - # Find the nf map, and related variables nfarray and nfident. - # nf is the sum of the features for a given labeled text. - # nfmap compresses this sparse set of values to a dense list. - # nfarray performs the reverse operation. nfident is - # nfarray multiplied by an identity matrix. - nfmap = calculate_nfmap(train_toks, encoding) - nfarray = numpy.array(sorted(nfmap, key=nfmap.__getitem__), "d") - nftranspose = numpy.reshape(nfarray, (len(nfarray), 1)) - - # Check for any features that are not attested in train_toks. - unattested = set(numpy.nonzero(empirical_ffreq == 0)[0]) - - # Build the classifier. 
Start with weight=0 for each attested - # feature, and weight=-infinity for each unattested feature. - weights = numpy.zeros(len(empirical_ffreq), "d") - for fid in unattested: - weights[fid] = numpy.NINF - classifier = ConditionalExponentialClassifier(encoding, weights) - - if trace > 0: - print(" ==> Training (%d iterations)" % cutoffs["max_iter"]) - if trace > 2: - print() - print(" Iteration Log Likelihood Accuracy") - print(" ---------------------------------------") - - # Train the classifier. - try: - while True: - if trace > 2: - ll = cutoffchecker.ll or log_likelihood(classifier, train_toks) - acc = cutoffchecker.acc or accuracy(classifier, train_toks) - iternum = cutoffchecker.iter - print(" %9d %14.5f %9.3f" % (iternum, ll, acc)) - - # Calculate the deltas for this iteration, using Newton's method. - deltas = calculate_deltas( - train_toks, - classifier, - unattested, - empirical_ffreq, - nfmap, - nfarray, - nftranspose, - encoding, - ) - - # Use the deltas to update our weights. - weights = classifier.weights() - weights += deltas - classifier.set_weights(weights) - - # Check the log-likelihood & accuracy cutoffs. - if cutoffchecker.check(classifier, train_toks): - break - - except KeyboardInterrupt: - print(" Training stopped: keyboard interrupt") - except: - raise - - if trace > 2: - ll = log_likelihood(classifier, train_toks) - acc = accuracy(classifier, train_toks) - print(f" Final {ll:14.5f} {acc:9.3f}") - - # Return the classifier. - return classifier - - -def calculate_nfmap(train_toks, encoding): - """ - Construct a map that can be used to compress ``nf`` (which is - typically sparse). - - *nf(feature_vector)* is the sum of the feature values for - *feature_vector*. - - This represents the number of features that are active for a - given labeled text. This method finds all values of *nf(t)* - that are attested for at least one token in the given list of - training tokens; and constructs a dictionary mapping these - attested values to a continuous range *0...N*. For example, - if the only values of *nf()* that were attested were 3, 5, and - 7, then ``_nfmap`` might return the dictionary ``{3:0, 5:1, 7:2}``. - - :return: A map that can be used to compress ``nf`` to a dense - vector. - :rtype: dict(int -> int) - """ - # Map from nf to indices. This allows us to use smaller arrays. - nfset = set() - for tok, _ in train_toks: - for label in encoding.labels(): - nfset.add(sum(val for (id, val) in encoding.encode(tok, label))) - return {nf: i for (i, nf) in enumerate(nfset)} - - -def calculate_deltas( - train_toks, - classifier, - unattested, - ffreq_empirical, - nfmap, - nfarray, - nftranspose, - encoding, -): - r""" - Calculate the update values for the classifier weights for - this iteration of IIS. These update weights are the value of - ``delta`` that solves the equation:: - - ffreq_empirical[i] - = - SUM[fs,l] (classifier.prob_classify(fs).prob(l) * - feature_vector(fs,l)[i] * - exp(delta[i] * nf(feature_vector(fs,l)))) - - Where: - - *(fs,l)* is a (featureset, label) tuple from ``train_toks`` - - *feature_vector(fs,l)* = ``encoding.encode(fs,l)`` - - *nf(vector)* = ``sum([val for (id,val) in vector])`` - - This method uses Newton's method to solve this equation for - *delta[i]*. 
In particular, it starts with a guess of - ``delta[i]`` = 1; and iteratively updates ``delta`` with: - - | delta[i] -= (ffreq_empirical[i] - sum1[i])/(-sum2[i]) - - until convergence, where *sum1* and *sum2* are defined as: - - | sum1[i](delta) = SUM[fs,l] f[i](fs,l,delta) - | sum2[i](delta) = SUM[fs,l] (f[i](fs,l,delta).nf(feature_vector(fs,l))) - | f[i](fs,l,delta) = (classifier.prob_classify(fs).prob(l) . - | feature_vector(fs,l)[i] . - | exp(delta[i] . nf(feature_vector(fs,l)))) - - Note that *sum1* and *sum2* depend on ``delta``; so they need - to be re-computed each iteration. - - The variables ``nfmap``, ``nfarray``, and ``nftranspose`` are - used to generate a dense encoding for *nf(ltext)*. This - allows ``_deltas`` to calculate *sum1* and *sum2* using - matrices, which yields a significant performance improvement. - - :param train_toks: The set of training tokens. - :type train_toks: list(tuple(dict, str)) - :param classifier: The current classifier. - :type classifier: ClassifierI - :param ffreq_empirical: An array containing the empirical - frequency for each feature. The *i*\ th element of this - array is the empirical frequency for feature *i*. - :type ffreq_empirical: sequence of float - :param unattested: An array that is 1 for features that are - not attested in the training data; and 0 for features that - are attested. In other words, ``unattested[i]==0`` iff - ``ffreq_empirical[i]==0``. - :type unattested: sequence of int - :param nfmap: A map that can be used to compress ``nf`` to a dense - vector. - :type nfmap: dict(int -> int) - :param nfarray: An array that can be used to uncompress ``nf`` - from a dense vector. - :type nfarray: array(float) - :param nftranspose: The transpose of ``nfarray`` - :type nftranspose: array(float) - """ - # These parameters control when we decide that we've - # converged. It probably should be possible to set these - # manually, via keyword arguments to train. - NEWTON_CONVERGE = 1e-12 - MAX_NEWTON = 300 - - deltas = numpy.ones(encoding.length(), "d") - - # Precompute the A matrix: - # A[nf][id] = sum ( p(fs) * p(label|fs) * f(fs,label) ) - # over all label,fs s.t. num_features[label,fs]=nf - A = numpy.zeros((len(nfmap), encoding.length()), "d") - - for tok, label in train_toks: - dist = classifier.prob_classify(tok) - - for label in encoding.labels(): - # Generate the feature vector - feature_vector = encoding.encode(tok, label) - # Find the number of active features - nf = sum(val for (id, val) in feature_vector) - # Update the A matrix - for (id, val) in feature_vector: - A[nfmap[nf], id] += dist.prob(label) * val - A /= len(train_toks) - - # Iteratively solve for delta. Use the following variables: - # - nf_delta[x][y] = nfarray[x] * delta[y] - # - exp_nf_delta[x][y] = exp(nf[x] * delta[y]) - # - nf_exp_nf_delta[x][y] = nf[x] * exp(nf[x] * delta[y]) - # - sum1[i][nf] = sum p(fs)p(label|fs)f[i](label,fs) - # exp(delta[i]nf) - # - sum2[i][nf] = sum p(fs)p(label|fs)f[i](label,fs) - # nf exp(delta[i]nf) - for rangenum in range(MAX_NEWTON): - nf_delta = numpy.outer(nfarray, deltas) - exp_nf_delta = 2**nf_delta - nf_exp_nf_delta = nftranspose * exp_nf_delta - sum1 = numpy.sum(exp_nf_delta * A, axis=0) - sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0) - - # Avoid division by zero. - for fid in unattested: - sum2[fid] += 1 - - # Update the deltas. - deltas -= (ffreq_empirical - sum1) / -sum2 - - # We can stop once we converge. 
- n_error = numpy.sum(abs(ffreq_empirical - sum1)) / numpy.sum(abs(deltas)) - if n_error < NEWTON_CONVERGE: - return deltas - - return deltas - - -###################################################################### -# { Classifier Trainer: megam -###################################################################### - -# [xx] possible extension: add support for using implicit file format; -# this would need to put requirements on what encoding is used. But -# we may need this for other maxent classifier trainers that require -# implicit formats anyway. -def train_maxent_classifier_with_megam( - train_toks, trace=3, encoding=None, labels=None, gaussian_prior_sigma=0, **kwargs -): - """ - Train a new ``ConditionalExponentialClassifier``, using the given - training samples, using the external ``megam`` library. This - ``ConditionalExponentialClassifier`` will encode the model that - maximizes entropy from all the models that are empirically - consistent with ``train_toks``. - - :see: ``train_maxent_classifier()`` for parameter descriptions. - :see: ``nltk.classify.megam`` - """ - - explicit = True - bernoulli = True - if "explicit" in kwargs: - explicit = kwargs["explicit"] - if "bernoulli" in kwargs: - bernoulli = kwargs["bernoulli"] - - # Construct an encoding from the training data. - if encoding is None: - # Count cutoff can also be controlled by megam with the -minfc - # option. Not sure where the best place for it is. - count_cutoff = kwargs.get("count_cutoff", 0) - encoding = BinaryMaxentFeatureEncoding.train( - train_toks, count_cutoff, labels=labels, alwayson_features=True - ) - elif labels is not None: - raise ValueError("Specify encoding or labels, not both") - - # Write a training file for megam. - try: - fd, trainfile_name = tempfile.mkstemp(prefix="nltk-") - with open(trainfile_name, "w") as trainfile: - write_megam_file( - train_toks, encoding, trainfile, explicit=explicit, bernoulli=bernoulli - ) - os.close(fd) - except (OSError, ValueError) as e: - raise ValueError("Error while creating megam training file: %s" % e) from e - - # Run megam on the training file. - options = [] - options += ["-nobias", "-repeat", "10"] - if explicit: - options += ["-explicit"] - if not bernoulli: - options += ["-fvals"] - if gaussian_prior_sigma: - # Lambda is just the precision of the Gaussian prior, i.e. it's the - # inverse variance, so the parameter conversion is 1.0/sigma**2. - # See https://users.umiacs.umd.edu/~hal/docs/daume04cg-bfgs.pdf - inv_variance = 1.0 / gaussian_prior_sigma**2 - else: - inv_variance = 0 - options += ["-lambda", "%.2f" % inv_variance, "-tune"] - if trace < 3: - options += ["-quiet"] - if "max_iter" in kwargs: - options += ["-maxi", "%s" % kwargs["max_iter"]] - if "ll_delta" in kwargs: - # [xx] this is actually a perplexity delta, not a log - # likelihood delta - options += ["-dpp", "%s" % abs(kwargs["ll_delta"])] - if hasattr(encoding, "cost"): - options += ["-multilabel"] # each possible la - options += ["multiclass", trainfile_name] - stdout = call_megam(options) - # print('./megam_i686.opt ', ' '.join(options)) - # Delete the training file - try: - os.remove(trainfile_name) - except OSError as e: - print(f"Warning: unable to delete {trainfile_name}: {e}") - - # Parse the generated weight vector. - weights = parse_megam_weights(stdout, encoding.length(), explicit) - - # Convert from base-e to base-2 weights. 
- weights *= numpy.log2(numpy.e) - - # Build the classifier - return MaxentClassifier(encoding, weights) - - -###################################################################### -# { Classifier Trainer: tadm -###################################################################### - - -class TadmMaxentClassifier(MaxentClassifier): - @classmethod - def train(cls, train_toks, **kwargs): - algorithm = kwargs.get("algorithm", "tao_lmvm") - trace = kwargs.get("trace", 3) - encoding = kwargs.get("encoding", None) - labels = kwargs.get("labels", None) - sigma = kwargs.get("gaussian_prior_sigma", 0) - count_cutoff = kwargs.get("count_cutoff", 0) - max_iter = kwargs.get("max_iter") - ll_delta = kwargs.get("min_lldelta") - - # Construct an encoding from the training data. - if not encoding: - encoding = TadmEventMaxentFeatureEncoding.train( - train_toks, count_cutoff, labels=labels - ) - - trainfile_fd, trainfile_name = tempfile.mkstemp( - prefix="nltk-tadm-events-", suffix=".gz" - ) - weightfile_fd, weightfile_name = tempfile.mkstemp(prefix="nltk-tadm-weights-") - - trainfile = gzip_open_unicode(trainfile_name, "w") - write_tadm_file(train_toks, encoding, trainfile) - trainfile.close() - - options = [] - options.extend(["-monitor"]) - options.extend(["-method", algorithm]) - if sigma: - options.extend(["-l2", "%.6f" % sigma**2]) - if max_iter: - options.extend(["-max_it", "%d" % max_iter]) - if ll_delta: - options.extend(["-fatol", "%.6f" % abs(ll_delta)]) - options.extend(["-events_in", trainfile_name]) - options.extend(["-params_out", weightfile_name]) - if trace < 3: - options.extend(["2>&1"]) - else: - options.extend(["-summary"]) - - call_tadm(options) - - with open(weightfile_name) as weightfile: - weights = parse_tadm_weights(weightfile) - - os.remove(trainfile_name) - os.remove(weightfile_name) - - # Convert from base-e to base-2 weights. - weights *= numpy.log2(numpy.e) - - # Build the classifier - return cls(encoding, weights) - - -###################################################################### -# { Demo -###################################################################### -def demo(): - from nltk.classify.util import names_demo - - classifier = names_demo(MaxentClassifier.train) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/classify/megam.py b/pipeline/nltk/classify/megam.py deleted file mode 100644 index ce9690cecad7cb767cc95925fc1b12026200720e..0000000000000000000000000000000000000000 --- a/pipeline/nltk/classify/megam.py +++ /dev/null @@ -1,184 +0,0 @@ -# Natural Language Toolkit: Interface to Megam Classifier -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# URL: -# For license information, see LICENSE.TXT - -""" -A set of functions used to interface with the external megam_ maxent -optimization package. Before megam can be used, you should tell NLTK where it -can find the megam binary, using the ``config_megam()`` function. Typical -usage: - - >>> from nltk.classify import megam - >>> megam.config_megam() # pass path to megam if not found in PATH # doctest: +SKIP - [Found megam: ...] - -Use with MaxentClassifier. Example below, see MaxentClassifier documentation -for details. - - nltk.classify.MaxentClassifier.train(corpus, 'megam') - -.. 
_megam: https://www.umiacs.umd.edu/~hal/megam/index.html -""" -import subprocess - -from nltk.internals import find_binary - -try: - import numpy -except ImportError: - numpy = None - -###################################################################### -# { Configuration -###################################################################### - -_megam_bin = None - - -def config_megam(bin=None): - """ - Configure NLTK's interface to the ``megam`` maxent optimization - package. - - :param bin: The full path to the ``megam`` binary. If not specified, - then nltk will search the system for a ``megam`` binary; and if - one is not found, it will raise a ``LookupError`` exception. - :type bin: str - """ - global _megam_bin - _megam_bin = find_binary( - "megam", - bin, - env_vars=["MEGAM"], - binary_names=["megam.opt", "megam", "megam_686", "megam_i686.opt"], - url="https://www.umiacs.umd.edu/~hal/megam/index.html", - ) - - -###################################################################### -# { Megam Interface Functions -###################################################################### - - -def write_megam_file(train_toks, encoding, stream, bernoulli=True, explicit=True): - """ - Generate an input file for ``megam`` based on the given corpus of - classified tokens. - - :type train_toks: list(tuple(dict, str)) - :param train_toks: Training data, represented as a list of - pairs, the first member of which is a feature dictionary, - and the second of which is a classification label. - - :type encoding: MaxentFeatureEncodingI - :param encoding: A feature encoding, used to convert featuresets - into feature vectors. May optionally implement a cost() method - in order to assign different costs to different class predictions. - - :type stream: stream - :param stream: The stream to which the megam input file should be - written. - - :param bernoulli: If true, then use the 'bernoulli' format. I.e., - all joint features have binary values, and are listed iff they - are true. Otherwise, list feature values explicitly. If - ``bernoulli=False``, then you must call ``megam`` with the - ``-fvals`` option. - - :param explicit: If true, then use the 'explicit' format. I.e., - list the features that would fire for any of the possible - labels, for each token. If ``explicit=True``, then you must - call ``megam`` with the ``-explicit`` option. - """ - # Look up the set of labels. - labels = encoding.labels() - labelnum = {label: i for (i, label) in enumerate(labels)} - - # Write the file, which contains one line per instance. - for featureset, label in train_toks: - # First, the instance number (or, in the weighted multiclass case, the cost of each label). - if hasattr(encoding, "cost"): - stream.write( - ":".join(str(encoding.cost(featureset, label, l)) for l in labels) - ) - else: - stream.write("%d" % labelnum[label]) - - # For implicit file formats, just list the features that fire - # for this instance's actual label. - if not explicit: - _write_megam_features(encoding.encode(featureset, label), stream, bernoulli) - - # For explicit formats, list the features that would fire for - # any of the possible labels. - else: - for l in labels: - stream.write(" #") - _write_megam_features(encoding.encode(featureset, l), stream, bernoulli) - - # End of the instance. - stream.write("\n") - - -def parse_megam_weights(s, features_count, explicit=True): - """ - Given the stdout output generated by ``megam`` when training a - model, return a ``numpy`` array containing the corresponding weight - vector. 
This function does not currently handle bias features. - """ - if numpy is None: - raise ValueError("This function requires that numpy be installed") - assert explicit, "non-explicit not supported yet" - lines = s.strip().split("\n") - weights = numpy.zeros(features_count, "d") - for line in lines: - if line.strip(): - fid, weight = line.split() - weights[int(fid)] = float(weight) - return weights - - -def _write_megam_features(vector, stream, bernoulli): - if not vector: - raise ValueError( - "MEGAM classifier requires the use of an " "always-on feature." - ) - for (fid, fval) in vector: - if bernoulli: - if fval == 1: - stream.write(" %s" % fid) - elif fval != 0: - raise ValueError( - "If bernoulli=True, then all" "features must be binary." - ) - else: - stream.write(f" {fid} {fval}") - - -def call_megam(args): - """ - Call the ``megam`` binary with the given arguments. - """ - if isinstance(args, str): - raise TypeError("args should be a list of strings") - if _megam_bin is None: - config_megam() - - # Call megam via a subprocess - cmd = [_megam_bin] + args - p = subprocess.Popen(cmd, stdout=subprocess.PIPE) - (stdout, stderr) = p.communicate() - - # Check the return code. - if p.returncode != 0: - print() - print(stderr) - raise OSError("megam command failed!") - - if isinstance(stdout, str): - return stdout - else: - return stdout.decode("utf-8") diff --git a/pipeline/nltk/classify/naivebayes.py b/pipeline/nltk/classify/naivebayes.py deleted file mode 100644 index caf4b73dd6a13ddcbc1f078441ec06254be4a6d9..0000000000000000000000000000000000000000 --- a/pipeline/nltk/classify/naivebayes.py +++ /dev/null @@ -1,260 +0,0 @@ -# Natural Language Toolkit: Naive Bayes Classifiers -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# URL: -# For license information, see LICENSE.TXT - -""" -A classifier based on the Naive Bayes algorithm. In order to find the -probability for a label, this algorithm first uses the Bayes rule to -express P(label|features) in terms of P(label) and P(features|label): - -| P(label) * P(features|label) -| P(label|features) = ------------------------------ -| P(features) - -The algorithm then makes the 'naive' assumption that all features are -independent, given the label: - -| P(label) * P(f1|label) * ... * P(fn|label) -| P(label|features) = -------------------------------------------- -| P(features) - -Rather than computing P(features) explicitly, the algorithm just -calculates the numerator for each label, and normalizes them so they -sum to one: - -| P(label) * P(f1|label) * ... * P(fn|label) -| P(label|features) = -------------------------------------------- -| SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) ) -""" - -from collections import defaultdict - -from nltk.classify.api import ClassifierI -from nltk.probability import DictionaryProbDist, ELEProbDist, FreqDist, sum_logs - -##////////////////////////////////////////////////////// -## Naive Bayes Classifier -##////////////////////////////////////////////////////// - - -class NaiveBayesClassifier(ClassifierI): - """ - A Naive Bayes classifier. Naive Bayes classifiers are - paramaterized by two probability distributions: - - - P(label) gives the probability that an input will receive each - label, given no information about the input's features. - - - P(fname=fval|label) gives the probability that a given feature - (fname) will receive a given value (fval), given that the - label (label). 
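A minimal illustrative sketch of the train/classify cycle (the training
pairs below are made up; exact probabilities depend on the estimator
passed to ``train``):

    >>> train = [({'last_letter': 'a'}, 'female'),
    ...          ({'last_letter': 'k'}, 'male')]
    >>> classifier = NaiveBayesClassifier.train(train)
    >>> classifier.classify({'last_letter': 'a'})
    'female'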
- - If the classifier encounters an input with a feature that has - never been seen with any label, then rather than assigning a - probability of 0 to all labels, it will ignore that feature. - - The feature value 'None' is reserved for unseen feature values; - you generally should not use 'None' as a feature value for one of - your own features. - """ - - def __init__(self, label_probdist, feature_probdist): - """ - :param label_probdist: P(label), the probability distribution - over labels. It is expressed as a ``ProbDistI`` whose - samples are labels. I.e., P(label) = - ``label_probdist.prob(label)``. - - :param feature_probdist: P(fname=fval|label), the probability - distribution for feature values, given labels. It is - expressed as a dictionary whose keys are ``(label, fname)`` - pairs and whose values are ``ProbDistI`` objects over feature - values. I.e., P(fname=fval|label) = - ``feature_probdist[label,fname].prob(fval)``. If a given - ``(label,fname)`` is not a key in ``feature_probdist``, then - it is assumed that the corresponding P(fname=fval|label) - is 0 for all values of ``fval``. - """ - self._label_probdist = label_probdist - self._feature_probdist = feature_probdist - self._labels = list(label_probdist.samples()) - - def labels(self): - return self._labels - - def classify(self, featureset): - return self.prob_classify(featureset).max() - - def prob_classify(self, featureset): - # Discard any feature names that we've never seen before. - # Otherwise, we'll just assign a probability of 0 to - # everything. - featureset = featureset.copy() - for fname in list(featureset.keys()): - for label in self._labels: - if (label, fname) in self._feature_probdist: - break - else: - # print('Ignoring unseen feature %s' % fname) - del featureset[fname] - - # Find the log probability of each label, given the features. - # Start with the log probability of the label itself. - logprob = {} - for label in self._labels: - logprob[label] = self._label_probdist.logprob(label) - - # Then add in the log probability of features given labels. - for label in self._labels: - for (fname, fval) in featureset.items(): - if (label, fname) in self._feature_probdist: - feature_probs = self._feature_probdist[label, fname] - logprob[label] += feature_probs.logprob(fval) - else: - # nb: This case will never come up if the - # classifier was created by - # NaiveBayesClassifier.train(). - logprob[label] += sum_logs([]) # = -INF. - - return DictionaryProbDist(logprob, normalize=True, log=True) - - def show_most_informative_features(self, n=10): - # Determine the most relevant features, and display them. - cpdist = self._feature_probdist - print("Most Informative Features") - - for (fname, fval) in self.most_informative_features(n): - - def labelprob(l): - return cpdist[l, fname].prob(fval) - - labels = sorted( - (l for l in self._labels if fval in cpdist[l, fname].samples()), - key=lambda element: (-labelprob(element), element), - reverse=True, - ) - if len(labels) == 1: - continue - l0 = labels[0] - l1 = labels[-1] - if cpdist[l0, fname].prob(fval) == 0: - ratio = "INF" - else: - ratio = "%8.1f" % ( - cpdist[l1, fname].prob(fval) / cpdist[l0, fname].prob(fval) - ) - print( - "%24s = %-14r %6s : %-6s = %s : 1.0" - % (fname, fval, ("%s" % l1)[:6], ("%s" % l0)[:6], ratio) - ) - - def most_informative_features(self, n=100): - """ - Return a list of the 'most informative' features used by this - classifier. 
For the purpose of this function, the - informativeness of a feature ``(fname,fval)`` is equal to the - highest value of P(fname=fval|label), for any label, divided by - the lowest value of P(fname=fval|label), for any label: - - | max[ P(fname=fval|label1) / P(fname=fval|label2) ] - """ - if hasattr(self, "_most_informative_features"): - return self._most_informative_features[:n] - else: - # The set of (fname, fval) pairs used by this classifier. - features = set() - # The max & min probability associated w/ each (fname, fval) - # pair. Maps (fname,fval) -> float. - maxprob = defaultdict(lambda: 0.0) - minprob = defaultdict(lambda: 1.0) - - for (label, fname), probdist in self._feature_probdist.items(): - for fval in probdist.samples(): - feature = (fname, fval) - features.add(feature) - p = probdist.prob(fval) - maxprob[feature] = max(p, maxprob[feature]) - minprob[feature] = min(p, minprob[feature]) - if minprob[feature] == 0: - features.discard(feature) - - # Convert features to a list, & sort it by how informative - # features are. - self._most_informative_features = sorted( - features, - key=lambda feature_: ( - minprob[feature_] / maxprob[feature_], - feature_[0], - feature_[1] in [None, False, True], - str(feature_[1]).lower(), - ), - ) - return self._most_informative_features[:n] - - @classmethod - def train(cls, labeled_featuresets, estimator=ELEProbDist): - """ - :param labeled_featuresets: A list of classified featuresets, - i.e., a list of tuples ``(featureset, label)``. - """ - label_freqdist = FreqDist() - feature_freqdist = defaultdict(FreqDist) - feature_values = defaultdict(set) - fnames = set() - - # Count up how many times each feature value occurred, given - # the label and featurename. - for featureset, label in labeled_featuresets: - label_freqdist[label] += 1 - for fname, fval in featureset.items(): - # Increment freq(fval|label, fname) - feature_freqdist[label, fname][fval] += 1 - # Record that fname can take the value fval. - feature_values[fname].add(fval) - # Keep a list of all feature names. - fnames.add(fname) - - # If a feature didn't have a value given for an instance, then - # we assume that it gets the implicit value 'None.' This loop - # counts up the number of 'missing' feature values for each - # (label,fname) pair, and increments the count of the fval - # 'None' by that amount. - for label in label_freqdist: - num_samples = label_freqdist[label] - for fname in fnames: - count = feature_freqdist[label, fname].N() - # Only add a None key when necessary, i.e. if there are - # any samples with feature 'fname' missing. 
- if num_samples - count > 0: - feature_freqdist[label, fname][None] += num_samples - count - feature_values[fname].add(None) - - # Create the P(label) distribution - label_probdist = estimator(label_freqdist) - - # Create the P(fval|label, fname) distribution - feature_probdist = {} - for ((label, fname), freqdist) in feature_freqdist.items(): - probdist = estimator(freqdist, bins=len(feature_values[fname])) - feature_probdist[label, fname] = probdist - - return cls(label_probdist, feature_probdist) - - -##////////////////////////////////////////////////////// -## Demo -##////////////////////////////////////////////////////// - - -def demo(): - from nltk.classify.util import names_demo - - classifier = names_demo(NaiveBayesClassifier.train) - classifier.show_most_informative_features() - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/classify/positivenaivebayes.py b/pipeline/nltk/classify/positivenaivebayes.py deleted file mode 100644 index 23797f0970848ce9e3617b16dbf54352e5f1523c..0000000000000000000000000000000000000000 --- a/pipeline/nltk/classify/positivenaivebayes.py +++ /dev/null @@ -1,180 +0,0 @@ -# Natural Language Toolkit: Positive Naive Bayes Classifier -# -# Copyright (C) 2012 NLTK Project -# Author: Alessandro Presta -# URL: -# For license information, see LICENSE.TXT - -""" -A variant of the Naive Bayes Classifier that performs binary classification with -partially-labeled training sets. In other words, assume we want to build a classifier -that assigns each example to one of two complementary classes (e.g., male names and -female names). -If we have a training set with labeled examples for both classes, we can use a -standard Naive Bayes Classifier. However, consider the case when we only have labeled -examples for one of the classes, and other, unlabeled, examples. -Then, assuming a prior distribution on the two labels, we can use the unlabeled set -to estimate the frequencies of the various features. - -Let the two possible labels be 1 and 0, and let's say we only have examples labeled 1 -and unlabeled examples. We are also given an estimate of P(1). - -We compute P(feature|1) exactly as in the standard case. - -To compute P(feature|0), we first estimate P(feature) from the unlabeled set (we are -assuming that the unlabeled examples are drawn according to the given prior distribution) -and then express the conditional probability as: - -| P(feature) - P(feature|1) * P(1) -| P(feature|0) = ---------------------------------- -| P(0) - -Example: - - >>> from nltk.classify import PositiveNaiveBayesClassifier - -Some sentences about sports: - - >>> sports_sentences = [ 'The team dominated the game', - ... 'They lost the ball', - ... 'The game was intense', - ... 'The goalkeeper catched the ball', - ... 'The other team controlled the ball' ] - -Mixed topics, including sports: - - >>> various_sentences = [ 'The President did not comment', - ... 'I lost the keys', - ... 'The team won the game', - ... 'Sara has two kids', - ... 'The ball went off the court', - ... 'They had the ball for the whole game', - ... 'The show is over' ] - -The features of a sentence are simply the words it contains: - - >>> def features(sentence): - ... words = sentence.lower().split() - ... 
return dict(('contains(%s)' % w, True) for w in words) - -We use the sports sentences as positive examples, the mixed ones ad unlabeled examples: - - >>> positive_featuresets = map(features, sports_sentences) - >>> unlabeled_featuresets = map(features, various_sentences) - >>> classifier = PositiveNaiveBayesClassifier.train(positive_featuresets, - ... unlabeled_featuresets) - -Is the following sentence about sports? - - >>> classifier.classify(features('The cat is on the table')) - False - -What about this one? - - >>> classifier.classify(features('My team lost the game')) - True -""" - -from collections import defaultdict - -from nltk.classify.naivebayes import NaiveBayesClassifier -from nltk.probability import DictionaryProbDist, ELEProbDist, FreqDist - -##////////////////////////////////////////////////////// -## Positive Naive Bayes Classifier -##////////////////////////////////////////////////////// - - -class PositiveNaiveBayesClassifier(NaiveBayesClassifier): - @staticmethod - def train( - positive_featuresets, - unlabeled_featuresets, - positive_prob_prior=0.5, - estimator=ELEProbDist, - ): - """ - :param positive_featuresets: An iterable of featuresets that are known as positive - examples (i.e., their label is ``True``). - - :param unlabeled_featuresets: An iterable of featuresets whose label is unknown. - - :param positive_prob_prior: A prior estimate of the probability of the label - ``True`` (default 0.5). - """ - positive_feature_freqdist = defaultdict(FreqDist) - unlabeled_feature_freqdist = defaultdict(FreqDist) - feature_values = defaultdict(set) - fnames = set() - - # Count up how many times each feature value occurred in positive examples. - num_positive_examples = 0 - for featureset in positive_featuresets: - for fname, fval in featureset.items(): - positive_feature_freqdist[fname][fval] += 1 - feature_values[fname].add(fval) - fnames.add(fname) - num_positive_examples += 1 - - # Count up how many times each feature value occurred in unlabeled examples. - num_unlabeled_examples = 0 - for featureset in unlabeled_featuresets: - for fname, fval in featureset.items(): - unlabeled_feature_freqdist[fname][fval] += 1 - feature_values[fname].add(fval) - fnames.add(fname) - num_unlabeled_examples += 1 - - # If a feature didn't have a value given for an instance, then we assume that - # it gets the implicit value 'None'. - for fname in fnames: - count = positive_feature_freqdist[fname].N() - positive_feature_freqdist[fname][None] += num_positive_examples - count - feature_values[fname].add(None) - - for fname in fnames: - count = unlabeled_feature_freqdist[fname].N() - unlabeled_feature_freqdist[fname][None] += num_unlabeled_examples - count - feature_values[fname].add(None) - - negative_prob_prior = 1.0 - positive_prob_prior - - # Create the P(label) distribution. - label_probdist = DictionaryProbDist( - {True: positive_prob_prior, False: negative_prob_prior} - ) - - # Create the P(fval|label, fname) distribution. 
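# (For the False label, the loop below inverts the mixture
#  P(fval) = P(1) * P(fval|1) + P(0) * P(fval|0): it estimates P(fval) from
#  the unlabeled set, solves for P(fval|0), clips negative estimates to 0.0,
#  and renormalizes via DictionaryProbDist(..., normalize=True).)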
- feature_probdist = {} - for fname, freqdist in positive_feature_freqdist.items(): - probdist = estimator(freqdist, bins=len(feature_values[fname])) - feature_probdist[True, fname] = probdist - - for fname, freqdist in unlabeled_feature_freqdist.items(): - global_probdist = estimator(freqdist, bins=len(feature_values[fname])) - negative_feature_probs = {} - for fval in feature_values[fname]: - prob = ( - global_probdist.prob(fval) - - positive_prob_prior * feature_probdist[True, fname].prob(fval) - ) / negative_prob_prior - # TODO: We need to add some kind of smoothing here, instead of - # setting negative probabilities to zero and normalizing. - negative_feature_probs[fval] = max(prob, 0.0) - feature_probdist[False, fname] = DictionaryProbDist( - negative_feature_probs, normalize=True - ) - - return PositiveNaiveBayesClassifier(label_probdist, feature_probdist) - - -##////////////////////////////////////////////////////// -## Demo -##////////////////////////////////////////////////////// - - -def demo(): - from nltk.classify.util import partial_names_demo - - classifier = partial_names_demo(PositiveNaiveBayesClassifier.train) - classifier.show_most_informative_features() diff --git a/pipeline/nltk/classify/rte_classify.py b/pipeline/nltk/classify/rte_classify.py deleted file mode 100644 index cbb416633eb65576d3c8241d93e23880e1b3f73b..0000000000000000000000000000000000000000 --- a/pipeline/nltk/classify/rte_classify.py +++ /dev/null @@ -1,183 +0,0 @@ -# Natural Language Toolkit: RTE Classifier -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Ewan Klein -# URL: -# For license information, see LICENSE.TXT - -""" -Simple classifier for RTE corpus. - -It calculates the overlap in words and named entities between text and -hypothesis, and also whether there are words / named entities in the -hypothesis which fail to occur in the text, since this is an indicator that -the hypothesis is more informative than (i.e not entailed by) the text. - -TO DO: better Named Entity classification -TO DO: add lemmatization -""" - -from nltk.classify.maxent import MaxentClassifier -from nltk.classify.util import accuracy -from nltk.tokenize import RegexpTokenizer - - -class RTEFeatureExtractor: - """ - This builds a bag of words for both the text and the hypothesis after - throwing away some stopwords, then calculates overlap and difference. - """ - - def __init__(self, rtepair, stop=True, use_lemmatize=False): - """ - :param rtepair: a ``RTEPair`` from which features should be extracted - :param stop: if ``True``, stopwords are thrown away. - :type stop: bool - """ - self.stop = stop - self.stopwords = { - "a", - "the", - "it", - "they", - "of", - "in", - "to", - "is", - "have", - "are", - "were", - "and", - "very", - ".", - ",", - } - - self.negwords = {"no", "not", "never", "failed", "rejected", "denied"} - # Try to tokenize so that abbreviations, monetary amounts, email - # addresses, URLs are single tokens. 
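# (For example, "Visit https://nltk.org or mail info@nltk.org" keeps the
#  URL and the e-mail address as single tokens under the pattern below.)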
- tokenizer = RegexpTokenizer(r"[\w.@:/]+|\w+|\$[\d.]+") - - # Get the set of word types for text and hypothesis - self.text_tokens = tokenizer.tokenize(rtepair.text) - self.hyp_tokens = tokenizer.tokenize(rtepair.hyp) - self.text_words = set(self.text_tokens) - self.hyp_words = set(self.hyp_tokens) - - if use_lemmatize: - self.text_words = {self._lemmatize(token) for token in self.text_tokens} - self.hyp_words = {self._lemmatize(token) for token in self.hyp_tokens} - - if self.stop: - self.text_words = self.text_words - self.stopwords - self.hyp_words = self.hyp_words - self.stopwords - - self._overlap = self.hyp_words & self.text_words - self._hyp_extra = self.hyp_words - self.text_words - self._txt_extra = self.text_words - self.hyp_words - - def overlap(self, toktype, debug=False): - """ - Compute the overlap between text and hypothesis. - - :param toktype: distinguish Named Entities from ordinary words - :type toktype: 'ne' or 'word' - """ - ne_overlap = {token for token in self._overlap if self._ne(token)} - if toktype == "ne": - if debug: - print("ne overlap", ne_overlap) - return ne_overlap - elif toktype == "word": - if debug: - print("word overlap", self._overlap - ne_overlap) - return self._overlap - ne_overlap - else: - raise ValueError("Type not recognized:'%s'" % toktype) - - def hyp_extra(self, toktype, debug=True): - """ - Compute the extraneous material in the hypothesis. - - :param toktype: distinguish Named Entities from ordinary words - :type toktype: 'ne' or 'word' - """ - ne_extra = {token for token in self._hyp_extra if self._ne(token)} - if toktype == "ne": - return ne_extra - elif toktype == "word": - return self._hyp_extra - ne_extra - else: - raise ValueError("Type not recognized: '%s'" % toktype) - - @staticmethod - def _ne(token): - """ - This just assumes that words in all caps or titles are - named entities. - - :type token: str - """ - if token.istitle() or token.isupper(): - return True - return False - - @staticmethod - def _lemmatize(word): - """ - Use morphy from WordNet to find the base form of verbs. - """ - from nltk.corpus import wordnet as wn - - lemma = wn.morphy(word, pos=wn.VERB) - if lemma is not None: - return lemma - return word - - -def rte_features(rtepair): - extractor = RTEFeatureExtractor(rtepair) - features = {} - features["alwayson"] = True - features["word_overlap"] = len(extractor.overlap("word")) - features["word_hyp_extra"] = len(extractor.hyp_extra("word")) - features["ne_overlap"] = len(extractor.overlap("ne")) - features["ne_hyp_extra"] = len(extractor.hyp_extra("ne")) - features["neg_txt"] = len(extractor.negwords & extractor.text_words) - features["neg_hyp"] = len(extractor.negwords & extractor.hyp_words) - return features - - -def rte_featurize(rte_pairs): - return [(rte_features(pair), pair.value) for pair in rte_pairs] - - -def rte_classifier(algorithm, sample_N=None): - from nltk.corpus import rte as rte_corpus - - train_set = rte_corpus.pairs(["rte1_dev.xml", "rte2_dev.xml", "rte3_dev.xml"]) - test_set = rte_corpus.pairs(["rte1_test.xml", "rte2_test.xml", "rte3_test.xml"]) - - if sample_N is not None: - train_set = train_set[:sample_N] - test_set = test_set[:sample_N] - - featurized_train_set = rte_featurize(train_set) - featurized_test_set = rte_featurize(test_set) - - # Train the classifier - print("Training classifier...") - if algorithm in ["megam"]: # MEGAM based algorithms. 
- clf = MaxentClassifier.train(featurized_train_set, algorithm) - elif algorithm in ["GIS", "IIS"]: # Use default GIS/IIS MaxEnt algorithm - clf = MaxentClassifier.train(featurized_train_set, algorithm) - else: - err_msg = str( - "RTEClassifier only supports these algorithms:\n " - "'megam', 'GIS', 'IIS'.\n" - ) - raise Exception(err_msg) - print("Testing classifier...") - acc = accuracy(clf, featurized_test_set) - print("Accuracy: %6.4f" % acc) - return clf diff --git a/pipeline/nltk/classify/scikitlearn.py b/pipeline/nltk/classify/scikitlearn.py deleted file mode 100644 index c1a35a416e2aebc873dad0559b75f85be3ad8200..0000000000000000000000000000000000000000 --- a/pipeline/nltk/classify/scikitlearn.py +++ /dev/null @@ -1,143 +0,0 @@ -# Natural Language Toolkit: Interface to scikit-learn classifiers -# -# Author: Lars Buitinck -# URL: -# For license information, see LICENSE.TXT -""" -scikit-learn (https://scikit-learn.org) is a machine learning library for -Python. It supports many classification algorithms, including SVMs, -Naive Bayes, logistic regression (MaxEnt) and decision trees. - -This package implements a wrapper around scikit-learn classifiers. To use this -wrapper, construct a scikit-learn estimator object, then use that to construct -a SklearnClassifier. E.g., to wrap a linear SVM with default settings: - ->>> from sklearn.svm import LinearSVC ->>> from nltk.classify.scikitlearn import SklearnClassifier ->>> classif = SklearnClassifier(LinearSVC()) - -A scikit-learn classifier may include preprocessing steps when it's wrapped -in a Pipeline object. The following constructs and wraps a Naive Bayes text -classifier with tf-idf weighting and chi-square feature selection to get the -best 1000 features: - ->>> from sklearn.feature_extraction.text import TfidfTransformer ->>> from sklearn.feature_selection import SelectKBest, chi2 ->>> from sklearn.naive_bayes import MultinomialNB ->>> from sklearn.pipeline import Pipeline ->>> pipeline = Pipeline([('tfidf', TfidfTransformer()), -... ('chi2', SelectKBest(chi2, k=1000)), -... ('nb', MultinomialNB())]) ->>> classif = SklearnClassifier(pipeline) -""" - -from nltk.classify.api import ClassifierI -from nltk.probability import DictionaryProbDist - -try: - from sklearn.feature_extraction import DictVectorizer - from sklearn.preprocessing import LabelEncoder -except ImportError: - pass - -__all__ = ["SklearnClassifier"] - - -class SklearnClassifier(ClassifierI): - """Wrapper for scikit-learn classifiers.""" - - def __init__(self, estimator, dtype=float, sparse=True): - """ - :param estimator: scikit-learn classifier object. - - :param dtype: data type used when building feature array. - scikit-learn estimators work exclusively on numeric data. The - default value should be fine for almost all situations. - - :param sparse: Whether to use sparse matrices internally. - The estimator must support these; not all scikit-learn classifiers - do (see their respective documentation and look for "sparse - matrix"). The default value is True, since most NLP problems - involve sparse feature sets. Setting this to False may take a - great amount of memory. - :type sparse: boolean. - """ - self._clf = estimator - self._encoder = LabelEncoder() - self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse) - - def __repr__(self): - return "" % self._clf - - def classify_many(self, featuresets): - """Classify a batch of samples. - - :param featuresets: An iterable over featuresets, each a dict mapping - strings to either numbers, booleans or strings. 
- :return: The predicted class label for each input sample. - :rtype: list - """ - X = self._vectorizer.transform(featuresets) - classes = self._encoder.classes_ - return [classes[i] for i in self._clf.predict(X)] - - def prob_classify_many(self, featuresets): - """Compute per-class probabilities for a batch of samples. - - :param featuresets: An iterable over featuresets, each a dict mapping - strings to either numbers, booleans or strings. - :rtype: list of ``ProbDistI`` - """ - X = self._vectorizer.transform(featuresets) - y_proba_list = self._clf.predict_proba(X) - return [self._make_probdist(y_proba) for y_proba in y_proba_list] - - def labels(self): - """The class labels used by this classifier. - - :rtype: list - """ - return list(self._encoder.classes_) - - def train(self, labeled_featuresets): - """ - Train (fit) the scikit-learn estimator. - - :param labeled_featuresets: A list of ``(featureset, label)`` - where each ``featureset`` is a dict mapping strings to either - numbers, booleans or strings. - """ - - X, y = list(zip(*labeled_featuresets)) - X = self._vectorizer.fit_transform(X) - y = self._encoder.fit_transform(y) - self._clf.fit(X, y) - - return self - - def _make_probdist(self, y_proba): - classes = self._encoder.classes_ - return DictionaryProbDist({classes[i]: p for i, p in enumerate(y_proba)}) - - -if __name__ == "__main__": - from sklearn.linear_model import LogisticRegression - from sklearn.naive_bayes import BernoulliNB - - from nltk.classify.util import names_demo, names_demo_features - - # Bernoulli Naive Bayes is designed for binary classification. We set the - # binarize option to False since we know we're passing boolean features. - print("scikit-learn Naive Bayes:") - names_demo( - SklearnClassifier(BernoulliNB(binarize=False)).train, - features=names_demo_features, - ) - - # The C parameter on logistic regression (MaxEnt) controls regularization. - # The higher it's set, the less regularized the classifier is. - print("\n\nscikit-learn logistic regression:") - names_demo( - SklearnClassifier(LogisticRegression(C=1000)).train, - features=names_demo_features, - ) diff --git a/pipeline/nltk/classify/senna.py b/pipeline/nltk/classify/senna.py deleted file mode 100644 index a1099ffd668574c636412b1bec9c94fd07865651..0000000000000000000000000000000000000000 --- a/pipeline/nltk/classify/senna.py +++ /dev/null @@ -1,176 +0,0 @@ -# Natural Language Toolkit: Senna Interface -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Rami Al-Rfou' -# URL: -# For license information, see LICENSE.TXT - -""" -A general interface to the SENNA pipeline that supports any of the -operations specified in SUPPORTED_OPERATIONS. - -Applying multiple operations at once has the speed advantage. For example, -Senna will automatically determine POS tags if you are extracting named -entities. Applying both of the operations will cost only the time of -extracting the named entities. - -The SENNA pipeline has a fixed maximum size of the sentences that it can read. -By default it is 1024 token/sentence. If you have larger sentences, changing -the MAX_SENTENCE_SIZE value in SENNA_main.c should be considered and your -system specific binary should be rebuilt. Otherwise this could introduce -misalignment errors. - -The input is: - -- path to the directory that contains SENNA executables. If the path is incorrect, - Senna will automatically search for executable file specified in SENNA environment variable -- List of the operations needed to be performed. 
-- (optionally) the encoding of the input data (default:utf-8) - -Note: Unit tests for this module can be found in test/unit/test_senna.py - ->>> from nltk.classify import Senna ->>> pipeline = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner']) # doctest: +SKIP ->>> sent = 'Dusseldorf is an international business center'.split() ->>> [(token['word'], token['chk'], token['ner'], token['pos']) for token in pipeline.tag(sent)] # doctest: +SKIP -[('Dusseldorf', 'B-NP', 'B-LOC', 'NNP'), ('is', 'B-VP', 'O', 'VBZ'), ('an', 'B-NP', 'O', 'DT'), -('international', 'I-NP', 'O', 'JJ'), ('business', 'I-NP', 'O', 'NN'), ('center', 'I-NP', 'O', 'NN')] -""" - -from os import environ, path, sep -from platform import architecture, system -from subprocess import PIPE, Popen - -from nltk.tag.api import TaggerI - - -class Senna(TaggerI): - - SUPPORTED_OPERATIONS = ["pos", "chk", "ner"] - - def __init__(self, senna_path, operations, encoding="utf-8"): - self._encoding = encoding - self._path = path.normpath(senna_path) + sep - - # Verifies the existence of the executable on the self._path first - # senna_binary_file_1 = self.executable(self._path) - exe_file_1 = self.executable(self._path) - if not path.isfile(exe_file_1): - # Check for the system environment - if "SENNA" in environ: - # self._path = path.join(environ['SENNA'],'') - self._path = path.normpath(environ["SENNA"]) + sep - exe_file_2 = self.executable(self._path) - if not path.isfile(exe_file_2): - raise LookupError( - "Senna executable expected at %s or %s but not found" - % (exe_file_1, exe_file_2) - ) - - self.operations = operations - - def executable(self, base_path): - """ - The function that determines the system specific binary that should be - used in the pipeline. In case, the system is not known the default senna binary will - be used. - """ - os_name = system() - if os_name == "Linux": - bits = architecture()[0] - if bits == "64bit": - return path.join(base_path, "senna-linux64") - return path.join(base_path, "senna-linux32") - if os_name == "Windows": - return path.join(base_path, "senna-win32.exe") - if os_name == "Darwin": - return path.join(base_path, "senna-osx") - return path.join(base_path, "senna") - - def _map(self): - """ - A method that calculates the order of the columns that SENNA pipeline - will output the tags into. This depends on the operations being ordered. - """ - _map = {} - i = 1 - for operation in Senna.SUPPORTED_OPERATIONS: - if operation in self.operations: - _map[operation] = i - i += 1 - return _map - - def tag(self, tokens): - """ - Applies the specified operation(s) on a list of tokens. - """ - return self.tag_sents([tokens])[0] - - def tag_sents(self, sentences): - """ - Applies the tag method over a list of sentences. This method will return a - list of dictionaries. Every dictionary will contain a word with its - calculated annotations/tags. 
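A hedged sketch of the expected call and return shape (requires a local
SENNA installation; the path and tags shown are illustrative only):

    >>> pipeline = Senna('/usr/share/senna-v3.0', ['pos'])  # doctest: +SKIP
    >>> pipeline.tag_sents([['The', 'cat', 'sat']])  # doctest: +SKIP
    [[{'pos': 'DT', 'word': 'The'}, {'pos': 'NN', 'word': 'cat'}, {'pos': 'VBD', 'word': 'sat'}]]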
- """ - encoding = self._encoding - - if not path.isfile(self.executable(self._path)): - raise LookupError( - "Senna executable expected at %s but not found" - % self.executable(self._path) - ) - - # Build the senna command to run the tagger - _senna_cmd = [ - self.executable(self._path), - "-path", - self._path, - "-usrtokens", - "-iobtags", - ] - _senna_cmd.extend(["-" + op for op in self.operations]) - - # Serialize the actual sentences to a temporary string - _input = "\n".join(" ".join(x) for x in sentences) + "\n" - if isinstance(_input, str) and encoding: - _input = _input.encode(encoding) - - # Run the tagger and get the output - p = Popen(_senna_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE) - (stdout, stderr) = p.communicate(input=_input) - senna_output = stdout - - # Check the return code. - if p.returncode != 0: - raise RuntimeError("Senna command failed! Details: %s" % stderr) - - if encoding: - senna_output = stdout.decode(encoding) - - # Output the tagged sentences - map_ = self._map() - tagged_sentences = [[]] - sentence_index = 0 - token_index = 0 - for tagged_word in senna_output.strip().split("\n"): - if not tagged_word: - tagged_sentences.append([]) - sentence_index += 1 - token_index = 0 - continue - tags = tagged_word.split("\t") - result = {} - for tag in map_: - result[tag] = tags[map_[tag]].strip() - try: - result["word"] = sentences[sentence_index][token_index] - except IndexError as e: - raise IndexError( - "Misalignment error occurred at sentence number %d. Possible reason" - " is that the sentence size exceeded the maximum size. Check the " - "documentation of Senna class for more information." - % sentence_index - ) from e - tagged_sentences[-1].append(result) - token_index += 1 - return tagged_sentences diff --git a/pipeline/nltk/classify/svm.py b/pipeline/nltk/classify/svm.py deleted file mode 100644 index 65b852c2748b0df620a4bc4340197ea1026e6728..0000000000000000000000000000000000000000 --- a/pipeline/nltk/classify/svm.py +++ /dev/null @@ -1,17 +0,0 @@ -# Natural Language Toolkit: SVM-based classifier -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Leon Derczynski -# -# URL: -# For license information, see LICENSE.TXT -""" -nltk.classify.svm was deprecated. For classification based -on support vector machines SVMs use nltk.classify.scikitlearn -(or `scikit-learn `_ directly). -""" - - -class SvmClassifier: - def __init__(self, *args, **kwargs): - raise NotImplementedError(__doc__) diff --git a/pipeline/nltk/classify/tadm.py b/pipeline/nltk/classify/tadm.py deleted file mode 100644 index f8eb4b3daa2b7e904856b3fa2b4f16378427715f..0000000000000000000000000000000000000000 --- a/pipeline/nltk/classify/tadm.py +++ /dev/null @@ -1,122 +0,0 @@ -# Natural Language Toolkit: Interface to TADM Classifier -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Joseph Frazee -# URL: -# For license information, see LICENSE.TXT - -import subprocess -import sys - -from nltk.internals import find_binary - -try: - import numpy -except ImportError: - pass - -_tadm_bin = None - - -def config_tadm(bin=None): - global _tadm_bin - _tadm_bin = find_binary( - "tadm", bin, env_vars=["TADM"], binary_names=["tadm"], url="http://tadm.sf.net" - ) - - -def write_tadm_file(train_toks, encoding, stream): - """ - Generate an input file for ``tadm`` based on the given corpus of - classified tokens. 
- - :type train_toks: list(tuple(dict, str)) - :param train_toks: Training data, represented as a list of - pairs, the first member of which is a feature dictionary, - and the second of which is a classification label. - :type encoding: TadmEventMaxentFeatureEncoding - :param encoding: A feature encoding, used to convert featuresets - into feature vectors. - :type stream: stream - :param stream: The stream to which the ``tadm`` input file should be - written. - """ - # See the following for a file format description: - # - # https://sf.net/forum/forum.php?thread_id=1391502&forum_id=473054 - # https://sf.net/forum/forum.php?thread_id=1675097&forum_id=473054 - labels = encoding.labels() - for featureset, label in train_toks: - length_line = "%d\n" % len(labels) - stream.write(length_line) - for known_label in labels: - v = encoding.encode(featureset, known_label) - line = "%d %d %s\n" % ( - int(label == known_label), - len(v), - " ".join("%d %d" % u for u in v), - ) - stream.write(line) - - -def parse_tadm_weights(paramfile): - """ - Given the stdout output generated by ``tadm`` when training a - model, return a ``numpy`` array containing the corresponding weight - vector. - """ - weights = [] - for line in paramfile: - weights.append(float(line.strip())) - return numpy.array(weights, "d") - - -def call_tadm(args): - """ - Call the ``tadm`` binary with the given arguments. - """ - if isinstance(args, str): - raise TypeError("args should be a list of strings") - if _tadm_bin is None: - config_tadm() - - # Call tadm via a subprocess - cmd = [_tadm_bin] + args - p = subprocess.Popen(cmd, stdout=sys.stdout) - (stdout, stderr) = p.communicate() - - # Check the return code. - if p.returncode != 0: - print() - print(stderr) - raise OSError("tadm command failed!") - - -def names_demo(): - from nltk.classify.maxent import TadmMaxentClassifier - from nltk.classify.util import names_demo - - classifier = names_demo(TadmMaxentClassifier.train) - - -def encoding_demo(): - import sys - - from nltk.classify.maxent import TadmEventMaxentFeatureEncoding - - tokens = [ - ({"f0": 1, "f1": 1, "f3": 1}, "A"), - ({"f0": 1, "f2": 1, "f4": 1}, "B"), - ({"f0": 2, "f2": 1, "f3": 1, "f4": 1}, "A"), - ] - encoding = TadmEventMaxentFeatureEncoding.train(tokens) - write_tadm_file(tokens, encoding, sys.stdout) - print() - for i in range(encoding.length()): - print("%s --> %d" % (encoding.describe(i), i)) - print() - - -if __name__ == "__main__": - encoding_demo() - names_demo() diff --git a/pipeline/nltk/classify/textcat.py b/pipeline/nltk/classify/textcat.py deleted file mode 100644 index b8176d3b668d2d233023cd22e7ae3854df8043d5..0000000000000000000000000000000000000000 --- a/pipeline/nltk/classify/textcat.py +++ /dev/null @@ -1,197 +0,0 @@ -# Natural Language Toolkit: Language ID module using TextCat algorithm -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Avital Pekker -# -# URL: -# For license information, see LICENSE.TXT - -""" -A module for language identification using the TextCat algorithm. -An implementation of the text categorization algorithm -presented in Cavnar, W. B. and J. M. Trenkle, -"N-Gram-Based Text Categorization". - -The algorithm takes advantage of Zipf's law and uses -n-gram frequencies to profile languages and text-yet to -be identified-then compares using a distance measure. - -Language n-grams are provided by the "An Crubadan" -project. A corpus reader was created separately to read -those files. 
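A minimal usage sketch (assumes the ``crubadan`` and ``punkt`` NLTK data
packages are installed; the returned ISO 639-3 code is illustrative):

    >>> from nltk.classify.textcat import TextCat  # doctest: +SKIP
    >>> tc = TextCat()  # doctest: +SKIP
    >>> tc.guess_language('El gato está en la mesa')  # doctest: +SKIP
    'spa'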
- -For details regarding the algorithm, see: -https://www.let.rug.nl/~vannoord/TextCat/textcat.pdf - -For details about An Crubadan, see: -https://borel.slu.edu/crubadan/index.html -""" - -from sys import maxsize - -from nltk.util import trigrams - -# Note: this is NOT "re" you're likely used to. The regex module -# is an alternative to the standard re module that supports -# Unicode codepoint properties with the \p{} syntax. -# You may have to "pip install regx" -try: - import regex as re -except ImportError: - re = None -###################################################################### -## Language identification using TextCat -###################################################################### - - -class TextCat: - - _corpus = None - fingerprints = {} - _START_CHAR = "<" - _END_CHAR = ">" - - last_distances = {} - - def __init__(self): - if not re: - raise OSError( - "classify.textcat requires the regex module that " - "supports unicode. Try '$ pip install regex' and " - "see https://pypi.python.org/pypi/regex for " - "further details." - ) - - from nltk.corpus import crubadan - - self._corpus = crubadan - # Load all language ngrams into cache - for lang in self._corpus.langs(): - self._corpus.lang_freq(lang) - - def remove_punctuation(self, text): - """Get rid of punctuation except apostrophes""" - return re.sub(r"[^\P{P}\']+", "", text) - - def profile(self, text): - """Create FreqDist of trigrams within text""" - from nltk import FreqDist, word_tokenize - - clean_text = self.remove_punctuation(text) - tokens = word_tokenize(clean_text) - - fingerprint = FreqDist() - for t in tokens: - token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR) - token_trigrams = ["".join(tri) for tri in token_trigram_tuples] - - for cur_trigram in token_trigrams: - if cur_trigram in fingerprint: - fingerprint[cur_trigram] += 1 - else: - fingerprint[cur_trigram] = 1 - - return fingerprint - - def calc_dist(self, lang, trigram, text_profile): - """Calculate the "out-of-place" measure between the - text and language profile for a single trigram""" - - lang_fd = self._corpus.lang_freq(lang) - dist = 0 - - if trigram in lang_fd: - idx_lang_profile = list(lang_fd.keys()).index(trigram) - idx_text = list(text_profile.keys()).index(trigram) - - # print(idx_lang_profile, ", ", idx_text) - dist = abs(idx_lang_profile - idx_text) - else: - # Arbitrary but should be larger than - # any possible trigram file length - # in terms of total lines - dist = maxsize - - return dist - - def lang_dists(self, text): - """Calculate the "out-of-place" measure between - the text and all languages""" - - distances = {} - profile = self.profile(text) - # For all the languages - for lang in self._corpus._all_lang_freq.keys(): - # Calculate distance metric for every trigram in - # input text to be identified - lang_dist = 0 - for trigram in profile: - lang_dist += self.calc_dist(lang, trigram, profile) - - distances[lang] = lang_dist - - return distances - - def guess_language(self, text): - """Find the language with the min distance - to the text and return its ISO 639-3 code""" - self.last_distances = self.lang_dists(text) - - return min(self.last_distances, key=self.last_distances.get) - #################################################') - - -def demo(): - from nltk.corpus import udhr - - langs = [ - "Kurdish-UTF8", - "Abkhaz-UTF8", - "Farsi_Persian-UTF8", - "Hindi-UTF8", - "Hawaiian-UTF8", - "Russian-UTF8", - "Vietnamese-UTF8", - "Serbian_Srpski-UTF8", - "Esperanto-UTF8", - ] - - friendly = { - "kmr": 
"Northern Kurdish", - "abk": "Abkhazian", - "pes": "Iranian Persian", - "hin": "Hindi", - "haw": "Hawaiian", - "rus": "Russian", - "vie": "Vietnamese", - "srp": "Serbian", - "epo": "Esperanto", - } - - tc = TextCat() - - for cur_lang in langs: - # Get raw data from UDHR corpus - raw_sentences = udhr.sents(cur_lang) - rows = len(raw_sentences) - 1 - cols = list(map(len, raw_sentences)) - - sample = "" - - # Generate a sample text of the language - for i in range(0, rows): - cur_sent = "" - for j in range(0, cols[i]): - cur_sent += " " + raw_sentences[i][j] - - sample += cur_sent - - # Try to detect what it is - print("Language snippet: " + sample[0:140] + "...") - guess = tc.guess_language(sample) - print(f"Language detection: {guess} ({friendly[guess]})") - print("#" * 140) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/classify/util.py b/pipeline/nltk/classify/util.py deleted file mode 100644 index f6ada2c3e30a97debe6c6f03ee5c7be06f5e9d2e..0000000000000000000000000000000000000000 --- a/pipeline/nltk/classify/util.py +++ /dev/null @@ -1,346 +0,0 @@ -# Natural Language Toolkit: Classifier Utility Functions -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird (minor additions) -# URL: -# For license information, see LICENSE.TXT - -""" -Utility functions and classes for classifiers. -""" - -import math - -# from nltk.util import Deprecated -import nltk.classify.util # for accuracy & log_likelihood -from nltk.util import LazyMap - -###################################################################### -# { Helper Functions -###################################################################### - -# alternative name possibility: 'map_featurefunc()'? -# alternative name possibility: 'detect_features()'? -# alternative name possibility: 'map_featuredetect()'? -# or.. just have users use LazyMap directly? -def apply_features(feature_func, toks, labeled=None): - """ - Use the ``LazyMap`` class to construct a lazy list-like - object that is analogous to ``map(feature_func, toks)``. In - particular, if ``labeled=False``, then the returned list-like - object's values are equal to:: - - [feature_func(tok) for tok in toks] - - If ``labeled=True``, then the returned list-like object's values - are equal to:: - - [(feature_func(tok), label) for (tok, label) in toks] - - The primary purpose of this function is to avoid the memory - overhead involved in storing all the featuresets for every token - in a corpus. Instead, these featuresets are constructed lazily, - as-needed. The reduction in memory overhead can be especially - significant when the underlying list of tokens is itself lazy (as - is the case with many corpus readers). - - :param feature_func: The function that will be applied to each - token. It should return a featureset -- i.e., a dict - mapping feature names to feature values. - :param toks: The list of tokens to which ``feature_func`` should be - applied. If ``labeled=True``, then the list elements will be - passed directly to ``feature_func()``. If ``labeled=False``, - then the list elements should be tuples ``(tok,label)``, and - ``tok`` will be passed to ``feature_func()``. - :param labeled: If true, then ``toks`` contains labeled tokens -- - i.e., tuples of the form ``(tok, label)``. (Default: - auto-detect based on types.) 
- """ - if labeled is None: - labeled = toks and isinstance(toks[0], (tuple, list)) - if labeled: - - def lazy_func(labeled_token): - return (feature_func(labeled_token[0]), labeled_token[1]) - - return LazyMap(lazy_func, toks) - else: - return LazyMap(feature_func, toks) - - -def attested_labels(tokens): - """ - :return: A list of all labels that are attested in the given list - of tokens. - :rtype: list of (immutable) - :param tokens: The list of classified tokens from which to extract - labels. A classified token has the form ``(token, label)``. - :type tokens: list - """ - return tuple({label for (tok, label) in tokens}) - - -def log_likelihood(classifier, gold): - results = classifier.prob_classify_many([fs for (fs, l) in gold]) - ll = [pdist.prob(l) for ((fs, l), pdist) in zip(gold, results)] - return math.log(sum(ll) / len(ll)) - - -def accuracy(classifier, gold): - results = classifier.classify_many([fs for (fs, l) in gold]) - correct = [l == r for ((fs, l), r) in zip(gold, results)] - if correct: - return sum(correct) / len(correct) - else: - return 0 - - -class CutoffChecker: - """ - A helper class that implements cutoff checks based on number of - iterations and log likelihood. - - Accuracy cutoffs are also implemented, but they're almost never - a good idea to use. - """ - - def __init__(self, cutoffs): - self.cutoffs = cutoffs.copy() - if "min_ll" in cutoffs: - cutoffs["min_ll"] = -abs(cutoffs["min_ll"]) - if "min_lldelta" in cutoffs: - cutoffs["min_lldelta"] = abs(cutoffs["min_lldelta"]) - self.ll = None - self.acc = None - self.iter = 1 - - def check(self, classifier, train_toks): - cutoffs = self.cutoffs - self.iter += 1 - if "max_iter" in cutoffs and self.iter >= cutoffs["max_iter"]: - return True # iteration cutoff. - - new_ll = nltk.classify.util.log_likelihood(classifier, train_toks) - if math.isnan(new_ll): - return True - - if "min_ll" in cutoffs or "min_lldelta" in cutoffs: - if "min_ll" in cutoffs and new_ll >= cutoffs["min_ll"]: - return True # log likelihood cutoff - if ( - "min_lldelta" in cutoffs - and self.ll - and ((new_ll - self.ll) <= abs(cutoffs["min_lldelta"])) - ): - return True # log likelihood delta cutoff - self.ll = new_ll - - if "max_acc" in cutoffs or "min_accdelta" in cutoffs: - new_acc = nltk.classify.util.log_likelihood(classifier, train_toks) - if "max_acc" in cutoffs and new_acc >= cutoffs["max_acc"]: - return True # log likelihood cutoff - if ( - "min_accdelta" in cutoffs - and self.acc - and ((new_acc - self.acc) <= abs(cutoffs["min_accdelta"])) - ): - return True # log likelihood delta cutoff - self.acc = new_acc - - return False # no cutoff reached. 
- - -###################################################################### -# { Demos -###################################################################### - - -def names_demo_features(name): - features = {} - features["alwayson"] = True - features["startswith"] = name[0].lower() - features["endswith"] = name[-1].lower() - for letter in "abcdefghijklmnopqrstuvwxyz": - features["count(%s)" % letter] = name.lower().count(letter) - features["has(%s)" % letter] = letter in name.lower() - return features - - -def binary_names_demo_features(name): - features = {} - features["alwayson"] = True - features["startswith(vowel)"] = name[0].lower() in "aeiouy" - features["endswith(vowel)"] = name[-1].lower() in "aeiouy" - for letter in "abcdefghijklmnopqrstuvwxyz": - features["count(%s)" % letter] = name.lower().count(letter) - features["has(%s)" % letter] = letter in name.lower() - features["startswith(%s)" % letter] = letter == name[0].lower() - features["endswith(%s)" % letter] = letter == name[-1].lower() - return features - - -def names_demo(trainer, features=names_demo_features): - import random - - from nltk.corpus import names - - # Construct a list of classified names, using the names corpus. - namelist = [(name, "male") for name in names.words("male.txt")] + [ - (name, "female") for name in names.words("female.txt") - ] - - # Randomly split the names into a test & train set. - random.seed(123456) - random.shuffle(namelist) - train = namelist[:5000] - test = namelist[5000:5500] - - # Train up a classifier. - print("Training classifier...") - classifier = trainer([(features(n), g) for (n, g) in train]) - - # Run the classifier on the test data. - print("Testing classifier...") - acc = accuracy(classifier, [(features(n), g) for (n, g) in test]) - print("Accuracy: %6.4f" % acc) - - # For classifiers that can find probabilities, show the log - # likelihood and some sample probability distributions. - try: - test_featuresets = [features(n) for (n, g) in test] - pdists = classifier.prob_classify_many(test_featuresets) - ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)] - print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test))) - print() - print("Unseen Names P(Male) P(Female)\n" + "-" * 40) - for ((name, gender), pdist) in list(zip(test, pdists))[:5]: - if gender == "male": - fmt = " %-15s *%6.4f %6.4f" - else: - fmt = " %-15s %6.4f *%6.4f" - print(fmt % (name, pdist.prob("male"), pdist.prob("female"))) - except NotImplementedError: - pass - - # Return the classifier - return classifier - - -def partial_names_demo(trainer, features=names_demo_features): - import random - - from nltk.corpus import names - - male_names = names.words("male.txt") - female_names = names.words("female.txt") - - random.seed(654321) - random.shuffle(male_names) - random.shuffle(female_names) - - # Create a list of male names to be used as positive-labeled examples for training - positive = map(features, male_names[:2000]) - - # Create a list of male and female names to be used as unlabeled examples - unlabeled = map(features, male_names[2000:2500] + female_names[:500]) - - # Create a test set with correctly-labeled male and female names - test = [(name, True) for name in male_names[2500:2750]] + [ - (name, False) for name in female_names[500:750] - ] - - random.shuffle(test) - - # Train up a classifier. - print("Training classifier...") - classifier = trainer(positive, unlabeled) - - # Run the classifier on the test data. 
- print("Testing classifier...") - acc = accuracy(classifier, [(features(n), m) for (n, m) in test]) - print("Accuracy: %6.4f" % acc) - - # For classifiers that can find probabilities, show the log - # likelihood and some sample probability distributions. - try: - test_featuresets = [features(n) for (n, m) in test] - pdists = classifier.prob_classify_many(test_featuresets) - ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)] - print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test))) - print() - print("Unseen Names P(Male) P(Female)\n" + "-" * 40) - for ((name, is_male), pdist) in zip(test, pdists)[:5]: - if is_male == True: - fmt = " %-15s *%6.4f %6.4f" - else: - fmt = " %-15s %6.4f *%6.4f" - print(fmt % (name, pdist.prob(True), pdist.prob(False))) - except NotImplementedError: - pass - - # Return the classifier - return classifier - - -_inst_cache = {} - - -def wsd_demo(trainer, word, features, n=1000): - import random - - from nltk.corpus import senseval - - # Get the instances. - print("Reading data...") - global _inst_cache - if word not in _inst_cache: - _inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)] - instances = _inst_cache[word][:] - if n > len(instances): - n = len(instances) - senses = list({l for (i, l) in instances}) - print(" Senses: " + " ".join(senses)) - - # Randomly split the names into a test & train set. - print("Splitting into test & train...") - random.seed(123456) - random.shuffle(instances) - train = instances[: int(0.8 * n)] - test = instances[int(0.8 * n) : n] - - # Train up a classifier. - print("Training classifier...") - classifier = trainer([(features(i), l) for (i, l) in train]) - - # Run the classifier on the test data. - print("Testing classifier...") - acc = accuracy(classifier, [(features(i), l) for (i, l) in test]) - print("Accuracy: %6.4f" % acc) - - # For classifiers that can find probabilities, show the log - # likelihood and some sample probability distributions. - try: - test_featuresets = [features(i) for (i, n) in test] - pdists = classifier.prob_classify_many(test_featuresets) - ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)] - print("Avg. log likelihood: %6.4f" % (sum(ll) / len(test))) - except NotImplementedError: - pass - - # Return the classifier - return classifier - - -def check_megam_config(): - """ - Checks whether the MEGAM binary is configured. - """ - try: - _megam_bin - except NameError as e: - err_msg = str( - "Please configure your megam binary first, e.g.\n" - ">>> nltk.config_megam('/usr/bin/local/megam')" - ) - raise NameError(err_msg) from e diff --git a/pipeline/nltk/classify/weka.py b/pipeline/nltk/classify/weka.py deleted file mode 100644 index b02505f0139bc4e0c516d0384b6c9d3224297dbd..0000000000000000000000000000000000000000 --- a/pipeline/nltk/classify/weka.py +++ /dev/null @@ -1,377 +0,0 @@ -# Natural Language Toolkit: Interface to Weka Classsifiers -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# URL: -# For license information, see LICENSE.TXT - -""" -Classifiers that make use of the external 'Weka' package. 
-""" - -import os -import re -import subprocess -import tempfile -import time -import zipfile -from sys import stdin - -from nltk.classify.api import ClassifierI -from nltk.internals import config_java, java -from nltk.probability import DictionaryProbDist - -_weka_classpath = None -_weka_search = [ - ".", - "/usr/share/weka", - "/usr/local/share/weka", - "/usr/lib/weka", - "/usr/local/lib/weka", -] - - -def config_weka(classpath=None): - global _weka_classpath - - # Make sure java's configured first. - config_java() - - if classpath is not None: - _weka_classpath = classpath - - if _weka_classpath is None: - searchpath = _weka_search - if "WEKAHOME" in os.environ: - searchpath.insert(0, os.environ["WEKAHOME"]) - - for path in searchpath: - if os.path.exists(os.path.join(path, "weka.jar")): - _weka_classpath = os.path.join(path, "weka.jar") - version = _check_weka_version(_weka_classpath) - if version: - print(f"[Found Weka: {_weka_classpath} (version {version})]") - else: - print("[Found Weka: %s]" % _weka_classpath) - _check_weka_version(_weka_classpath) - - if _weka_classpath is None: - raise LookupError( - "Unable to find weka.jar! Use config_weka() " - "or set the WEKAHOME environment variable. " - "For more information about Weka, please see " - "https://www.cs.waikato.ac.nz/ml/weka/" - ) - - -def _check_weka_version(jar): - try: - zf = zipfile.ZipFile(jar) - except (SystemExit, KeyboardInterrupt): - raise - except: - return None - try: - try: - return zf.read("weka/core/version.txt") - except KeyError: - return None - finally: - zf.close() - - -class WekaClassifier(ClassifierI): - def __init__(self, formatter, model_filename): - self._formatter = formatter - self._model = model_filename - - def prob_classify_many(self, featuresets): - return self._classify_many(featuresets, ["-p", "0", "-distribution"]) - - def classify_many(self, featuresets): - return self._classify_many(featuresets, ["-p", "0"]) - - def _classify_many(self, featuresets, options): - # Make sure we can find java & weka. - config_weka() - - temp_dir = tempfile.mkdtemp() - try: - # Write the test data file. - test_filename = os.path.join(temp_dir, "test.arff") - self._formatter.write(test_filename, featuresets) - - # Call weka to classify the data. - cmd = [ - "weka.classifiers.bayes.NaiveBayes", - "-l", - self._model, - "-T", - test_filename, - ] + options - (stdout, stderr) = java( - cmd, - classpath=_weka_classpath, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - - # Check if something went wrong: - if stderr and not stdout: - if "Illegal options: -distribution" in stderr: - raise ValueError( - "The installed version of weka does " - "not support probability distribution " - "output." - ) - else: - raise ValueError("Weka failed to generate output:\n%s" % stderr) - - # Parse weka's output. 
- return self.parse_weka_output(stdout.decode(stdin.encoding).split("\n")) - - finally: - for f in os.listdir(temp_dir): - os.remove(os.path.join(temp_dir, f)) - os.rmdir(temp_dir) - - def parse_weka_distribution(self, s): - probs = [float(v) for v in re.split("[*,]+", s) if v.strip()] - probs = dict(zip(self._formatter.labels(), probs)) - return DictionaryProbDist(probs) - - def parse_weka_output(self, lines): - # Strip unwanted text from stdout - for i, line in enumerate(lines): - if line.strip().startswith("inst#"): - lines = lines[i:] - break - - if lines[0].split() == ["inst#", "actual", "predicted", "error", "prediction"]: - return [line.split()[2].split(":")[1] for line in lines[1:] if line.strip()] - elif lines[0].split() == [ - "inst#", - "actual", - "predicted", - "error", - "distribution", - ]: - return [ - self.parse_weka_distribution(line.split()[-1]) - for line in lines[1:] - if line.strip() - ] - - # is this safe:? - elif re.match(r"^0 \w+ [01]\.[0-9]* \?\s*$", lines[0]): - return [line.split()[1] for line in lines if line.strip()] - - else: - for line in lines[:10]: - print(line) - raise ValueError( - "Unhandled output format -- your version " - "of weka may not be supported.\n" - " Header: %s" % lines[0] - ) - - # [xx] full list of classifiers (some may be abstract?): - # ADTree, AODE, BayesNet, ComplementNaiveBayes, ConjunctiveRule, - # DecisionStump, DecisionTable, HyperPipes, IB1, IBk, Id3, J48, - # JRip, KStar, LBR, LeastMedSq, LinearRegression, LMT, Logistic, - # LogisticBase, M5Base, MultilayerPerceptron, - # MultipleClassifiersCombiner, NaiveBayes, NaiveBayesMultinomial, - # NaiveBayesSimple, NBTree, NNge, OneR, PaceRegression, PART, - # PreConstructedLinearModel, Prism, RandomForest, - # RandomizableClassifier, RandomTree, RBFNetwork, REPTree, Ridor, - # RuleNode, SimpleLinearRegression, SimpleLogistic, - # SingleClassifierEnhancer, SMO, SMOreg, UserClassifier, VFI, - # VotedPerceptron, Winnow, ZeroR - - _CLASSIFIER_CLASS = { - "naivebayes": "weka.classifiers.bayes.NaiveBayes", - "C4.5": "weka.classifiers.trees.J48", - "log_regression": "weka.classifiers.functions.Logistic", - "svm": "weka.classifiers.functions.SMO", - "kstar": "weka.classifiers.lazy.KStar", - "ripper": "weka.classifiers.rules.JRip", - } - - @classmethod - def train( - cls, - model_filename, - featuresets, - classifier="naivebayes", - options=[], - quiet=True, - ): - # Make sure we can find java & weka. - config_weka() - - # Build an ARFF formatter. - formatter = ARFF_Formatter.from_train(featuresets) - - temp_dir = tempfile.mkdtemp() - try: - # Write the training data file. - train_filename = os.path.join(temp_dir, "train.arff") - formatter.write(train_filename, featuresets) - - if classifier in cls._CLASSIFIER_CLASS: - javaclass = cls._CLASSIFIER_CLASS[classifier] - elif classifier in cls._CLASSIFIER_CLASS.values(): - javaclass = classifier - else: - raise ValueError("Unknown classifier %s" % classifier) - - # Train the weka model. - cmd = [javaclass, "-d", model_filename, "-t", train_filename] - cmd += list(options) - if quiet: - stdout = subprocess.PIPE - else: - stdout = None - java(cmd, classpath=_weka_classpath, stdout=stdout) - - # Return the new classifier. - return WekaClassifier(formatter, model_filename) - - finally: - for f in os.listdir(temp_dir): - os.remove(os.path.join(temp_dir, f)) - os.rmdir(temp_dir) - - -class ARFF_Formatter: - """ - Converts featuresets and labeled featuresets to ARFF-formatted - strings, appropriate for input into Weka. 
- - Features and classes can be specified manually in the constructor, or may - be determined from data using ``from_train``. - """ - - def __init__(self, labels, features): - """ - :param labels: A list of all class labels that can be generated. - :param features: A list of feature specifications, where - each feature specification is a tuple (fname, ftype); - and ftype is an ARFF type string such as NUMERIC or - STRING. - """ - self._labels = labels - self._features = features - - def format(self, tokens): - """Returns a string representation of ARFF output for the given data.""" - return self.header_section() + self.data_section(tokens) - - def labels(self): - """Returns the list of classes.""" - return list(self._labels) - - def write(self, outfile, tokens): - """Writes ARFF data to a file for the given data.""" - if not hasattr(outfile, "write"): - outfile = open(outfile, "w") - outfile.write(self.format(tokens)) - outfile.close() - - @staticmethod - def from_train(tokens): - """ - Constructs an ARFF_Formatter instance with class labels and feature - types determined from the given data. Handles boolean, numeric and - string (note: not nominal) types. - """ - # Find the set of all attested labels. - labels = {label for (tok, label) in tokens} - - # Determine the types of all features. - features = {} - for tok, label in tokens: - for (fname, fval) in tok.items(): - if issubclass(type(fval), bool): - ftype = "{True, False}" - elif issubclass(type(fval), (int, float, bool)): - ftype = "NUMERIC" - elif issubclass(type(fval), str): - ftype = "STRING" - elif fval is None: - continue # can't tell the type. - else: - raise ValueError("Unsupported value type %r" % ftype) - - if features.get(fname, ftype) != ftype: - raise ValueError("Inconsistent type for %s" % fname) - features[fname] = ftype - features = sorted(features.items()) - - return ARFF_Formatter(labels, features) - - def header_section(self): - """Returns an ARFF header as a string.""" - # Header comment. - s = ( - "% Weka ARFF file\n" - + "% Generated automatically by NLTK\n" - + "%% %s\n\n" % time.ctime() - ) - - # Relation name - s += "@RELATION rel\n\n" - - # Input attribute specifications - for fname, ftype in self._features: - s += "@ATTRIBUTE %-30r %s\n" % (fname, ftype) - - # Label attribute specification - s += "@ATTRIBUTE %-30r {%s}\n" % ("-label-", ",".join(self._labels)) - - return s - - def data_section(self, tokens, labeled=None): - """ - Returns the ARFF data section for the given data. - - :param tokens: a list of featuresets (dicts) or labelled featuresets - which are tuples (featureset, label). - :param labeled: Indicates whether the given tokens are labeled - or not. If None, then the tokens will be assumed to be - labeled if the first token's value is a tuple or list. - """ - # Check if the tokens are labeled or unlabeled. If unlabeled, - # then use 'None' - if labeled is None: - labeled = tokens and isinstance(tokens[0], (tuple, list)) - if not labeled: - tokens = [(tok, None) for tok in tokens] - - # Data section - s = "\n@DATA\n" - for (tok, label) in tokens: - for fname, ftype in self._features: - s += "%s," % self._fmt_arff_val(tok.get(fname)) - s += "%s\n" % self._fmt_arff_val(label) - - return s - - def _fmt_arff_val(self, fval): - if fval is None: - return "?" 
- elif isinstance(fval, (bool, int)): - return "%s" % fval - elif isinstance(fval, float): - return "%r" % fval - else: - return "%r" % fval - - -if __name__ == "__main__": - from nltk.classify.util import binary_names_demo_features, names_demo - - def make_classifier(featuresets): - return WekaClassifier.train("/tmp/name.model", featuresets, "C4.5") - - classifier = names_demo(make_classifier, binary_names_demo_features) diff --git a/pipeline/nltk/cli.py b/pipeline/nltk/cli.py deleted file mode 100644 index 1a36a14f49e6cce0a0655767eddc4d82894f36d6..0000000000000000000000000000000000000000 --- a/pipeline/nltk/cli.py +++ /dev/null @@ -1,55 +0,0 @@ -# Natural Language Toolkit: NLTK Command-Line Interface -# -# Copyright (C) 2001-2023 NLTK Project -# URL: -# For license information, see LICENSE.TXT - - -import click -from tqdm import tqdm - -from nltk import word_tokenize -from nltk.util import parallelize_preprocess - -CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) - - -@click.group(context_settings=CONTEXT_SETTINGS) -@click.version_option() -def cli(): - pass - - -@cli.command("tokenize") -@click.option( - "--language", - "-l", - default="en", - help="The language for the Punkt sentence tokenization.", -) -@click.option( - "--preserve-line", - "-l", - default=True, - is_flag=True, - help="An option to keep the preserve the sentence and not sentence tokenize it.", -) -@click.option("--processes", "-j", default=1, help="No. of processes.") -@click.option("--encoding", "-e", default="utf8", help="Specify encoding of file.") -@click.option( - "--delimiter", "-d", default=" ", help="Specify delimiter to join the tokens." -) -def tokenize_file(language, preserve_line, processes, encoding, delimiter): - """This command tokenizes text stream using nltk.word_tokenize""" - with click.get_text_stream("stdin", encoding=encoding) as fin: - with click.get_text_stream("stdout", encoding=encoding) as fout: - # If it's single process, joblib parallelization is slower, - # so just process line by line normally. - if processes == 1: - for line in tqdm(fin.readlines()): - print(delimiter.join(word_tokenize(line)), end="\n", file=fout) - else: - for outline in parallelize_preprocess( - word_tokenize, fin.readlines(), processes, progress_bar=True - ): - print(delimiter.join(outline), end="\n", file=fout) diff --git a/pipeline/nltk/cluster/__init__.py b/pipeline/nltk/cluster/__init__.py deleted file mode 100644 index 9df093cb0a7964ea43df052ac42fb46b6fbadee0..0000000000000000000000000000000000000000 --- a/pipeline/nltk/cluster/__init__.py +++ /dev/null @@ -1,92 +0,0 @@ -# Natural Language Toolkit: Clusterers -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Trevor Cohn -# URL: -# For license information, see LICENSE.TXT - -""" -This module contains a number of basic clustering algorithms. Clustering -describes the task of discovering groups of similar items with a large -collection. It is also describe as unsupervised machine learning, as the data -from which it learns is unannotated with class information, as is the case for -supervised learning. Annotated data is difficult and expensive to obtain in -the quantities required for the majority of supervised learning algorithms. -This problem, the knowledge acquisition bottleneck, is common to most natural -language processing tasks, thus fueling the need for quality unsupervised -approaches. - -This module contains a k-means clusterer, E-M clusterer and a group average -agglomerative clusterer (GAAC). 
All these clusterers involve finding good -cluster groupings for a set of vectors in multi-dimensional space. - -The K-means clusterer starts with k arbitrary chosen means then allocates each -vector to the cluster with the closest mean. It then recalculates the means of -each cluster as the centroid of the vectors in the cluster. This process -repeats until the cluster memberships stabilise. This is a hill-climbing -algorithm which may converge to a local maximum. Hence the clustering is -often repeated with random initial means and the most commonly occurring -output means are chosen. - -The GAAC clusterer starts with each of the *N* vectors as singleton clusters. -It then iteratively merges pairs of clusters which have the closest centroids. -This continues until there is only one cluster. The order of merges gives rise -to a dendrogram - a tree with the earlier merges lower than later merges. The -membership of a given number of clusters *c*, *1 <= c <= N*, can be found by -cutting the dendrogram at depth *c*. - -The Gaussian EM clusterer models the vectors as being produced by a mixture -of k Gaussian sources. The parameters of these sources (prior probability, -mean and covariance matrix) are then found to maximise the likelihood of the -given data. This is done with the expectation maximisation algorithm. It -starts with k arbitrarily chosen means, priors and covariance matrices. It -then calculates the membership probabilities for each vector in each of the -clusters - this is the 'E' step. The cluster parameters are then updated in -the 'M' step using the maximum likelihood estimate from the cluster membership -probabilities. This process continues until the likelihood of the data does -not significantly increase. - -They all extend the ClusterI interface which defines common operations -available with each clusterer. These operations include: - -- cluster: clusters a sequence of vectors -- classify: assign a vector to a cluster -- classification_probdist: give the probability distribution over cluster memberships - -The current existing classifiers also extend cluster.VectorSpace, an -abstract class which allows for singular value decomposition (SVD) and vector -normalisation. SVD is used to reduce the dimensionality of the vector space in -such a manner as to preserve as much of the variation as possible, by -reparameterising the axes in order of variability and discarding all bar the -first d dimensions. Normalisation ensures that vectors fall in the unit -hypersphere. - -Usage example (see also demo()):: - - from nltk import cluster - from nltk.cluster import euclidean_distance - from numpy import array - - vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0]]] - - # initialise the clusterer (will also assign the vectors to clusters) - clusterer = cluster.KMeansClusterer(2, euclidean_distance) - clusterer.cluster(vectors, True) - - # classify a new vector - print(clusterer.classify(array([3, 3]))) - -Note that the vectors must use numpy array-like -objects. nltk_contrib.unimelb.tacohn.SparseArrays may be used for -efficiency when required. 
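The usage example above covers the k-means clusterer; for completeness, a minimal sketch of the EM clusterer and of classification_probdist(), mirroring the demo() in nltk/cluster/em.py below:

from numpy import array
from nltk import cluster

vectors = [array(f) for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]]]

# Two slightly perturbed initial means; bias keeps the covariances non-singular.
clusterer = cluster.EMClusterer([[4, 2], [4, 2.01]], bias=0.1)
clusters = clusterer.cluster(vectors, True)

# Probability distribution over cluster memberships for a new vector.
pdist = clusterer.classification_probdist(array([2, 2]))
for sample in pdist.samples():
    print(sample, pdist.prob(sample))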
-""" - -from nltk.cluster.em import EMClusterer -from nltk.cluster.gaac import GAAClusterer -from nltk.cluster.kmeans import KMeansClusterer -from nltk.cluster.util import ( - Dendrogram, - VectorSpaceClusterer, - cosine_distance, - euclidean_distance, -) diff --git a/pipeline/nltk/cluster/__pycache__/__init__.cpython-39.pyc b/pipeline/nltk/cluster/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index 7d64052f7dd68e9ade4ccd78dbe8d758ac104149..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/cluster/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/cluster/__pycache__/api.cpython-39.pyc b/pipeline/nltk/cluster/__pycache__/api.cpython-39.pyc deleted file mode 100644 index 58fb8c2dce0f64e65c71ece0954b290e2c89a2fb..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/cluster/__pycache__/api.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/cluster/__pycache__/em.cpython-39.pyc b/pipeline/nltk/cluster/__pycache__/em.cpython-39.pyc deleted file mode 100644 index b69d48ce29642ee58ae8d532e842741a7540c696..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/cluster/__pycache__/em.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/cluster/__pycache__/gaac.cpython-39.pyc b/pipeline/nltk/cluster/__pycache__/gaac.cpython-39.pyc deleted file mode 100644 index 8c690b4a77622bf9fbfb5fda907ac8fe62b33a5c..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/cluster/__pycache__/gaac.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/cluster/__pycache__/kmeans.cpython-39.pyc b/pipeline/nltk/cluster/__pycache__/kmeans.cpython-39.pyc deleted file mode 100644 index f8068307e9e5fd563f0aa812936200f4b802e481..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/cluster/__pycache__/kmeans.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/cluster/__pycache__/util.cpython-39.pyc b/pipeline/nltk/cluster/__pycache__/util.cpython-39.pyc deleted file mode 100644 index 290af51739954e7fe08fe52b3a4e0671c80ac6b6..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/cluster/__pycache__/util.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/cluster/api.py b/pipeline/nltk/cluster/api.py deleted file mode 100644 index 8da588408f83894b512166334197ec43b6899631..0000000000000000000000000000000000000000 --- a/pipeline/nltk/cluster/api.py +++ /dev/null @@ -1,74 +0,0 @@ -# Natural Language Toolkit: Clusterer Interfaces -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Trevor Cohn -# Porting: Steven Bird -# URL: -# For license information, see LICENSE.TXT - -from abc import ABCMeta, abstractmethod - -from nltk.probability import DictionaryProbDist - - -class ClusterI(metaclass=ABCMeta): - """ - Interface covering basic clustering functionality. - """ - - @abstractmethod - def cluster(self, vectors, assign_clusters=False): - """ - Assigns the vectors to clusters, learning the clustering parameters - from the data. Returns a cluster identifier for each vector. - """ - - @abstractmethod - def classify(self, token): - """ - Classifies the token into a cluster, setting the token's CLUSTER - parameter to that cluster identifier. - """ - - def likelihood(self, vector, label): - """ - Returns the likelihood (a float) of the token having the - corresponding cluster. 
- """ - if self.classify(vector) == label: - return 1.0 - else: - return 0.0 - - def classification_probdist(self, vector): - """ - Classifies the token into a cluster, returning - a probability distribution over the cluster identifiers. - """ - likelihoods = {} - sum = 0.0 - for cluster in self.cluster_names(): - likelihoods[cluster] = self.likelihood(vector, cluster) - sum += likelihoods[cluster] - for cluster in self.cluster_names(): - likelihoods[cluster] /= sum - return DictionaryProbDist(likelihoods) - - @abstractmethod - def num_clusters(self): - """ - Returns the number of clusters. - """ - - def cluster_names(self): - """ - Returns the names of the clusters. - :rtype: list - """ - return list(range(self.num_clusters())) - - def cluster_name(self, index): - """ - Returns the names of the cluster at index. - """ - return index diff --git a/pipeline/nltk/cluster/em.py b/pipeline/nltk/cluster/em.py deleted file mode 100644 index cb46fe35700afed79b728336bd1f07c33ed50dcb..0000000000000000000000000000000000000000 --- a/pipeline/nltk/cluster/em.py +++ /dev/null @@ -1,219 +0,0 @@ -# Natural Language Toolkit: Expectation Maximization Clusterer -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Trevor Cohn -# URL: -# For license information, see LICENSE.TXT - -try: - import numpy -except ImportError: - pass - -from nltk.cluster.util import VectorSpaceClusterer - - -class EMClusterer(VectorSpaceClusterer): - """ - The Gaussian EM clusterer models the vectors as being produced by - a mixture of k Gaussian sources. The parameters of these sources - (prior probability, mean and covariance matrix) are then found to - maximise the likelihood of the given data. This is done with the - expectation maximisation algorithm. It starts with k arbitrarily - chosen means, priors and covariance matrices. It then calculates - the membership probabilities for each vector in each of the - clusters; this is the 'E' step. The cluster parameters are then - updated in the 'M' step using the maximum likelihood estimate from - the cluster membership probabilities. This process continues until - the likelihood of the data does not significantly increase. - """ - - def __init__( - self, - initial_means, - priors=None, - covariance_matrices=None, - conv_threshold=1e-6, - bias=0.1, - normalise=False, - svd_dimensions=None, - ): - """ - Creates an EM clusterer with the given starting parameters, - convergence threshold and vector mangling parameters. 
- - :param initial_means: the means of the gaussian cluster centers - :type initial_means: [seq of] numpy array or seq of SparseArray - :param priors: the prior probability for each cluster - :type priors: numpy array or seq of float - :param covariance_matrices: the covariance matrix for each cluster - :type covariance_matrices: [seq of] numpy array - :param conv_threshold: maximum change in likelihood before deemed - convergent - :type conv_threshold: int or float - :param bias: variance bias used to ensure non-singular covariance - matrices - :type bias: float - :param normalise: should vectors be normalised to length 1 - :type normalise: boolean - :param svd_dimensions: number of dimensions to use in reducing vector - dimensionsionality with SVD - :type svd_dimensions: int - """ - VectorSpaceClusterer.__init__(self, normalise, svd_dimensions) - self._means = numpy.array(initial_means, numpy.float64) - self._num_clusters = len(initial_means) - self._conv_threshold = conv_threshold - self._covariance_matrices = covariance_matrices - self._priors = priors - self._bias = bias - - def num_clusters(self): - return self._num_clusters - - def cluster_vectorspace(self, vectors, trace=False): - assert len(vectors) > 0 - - # set the parameters to initial values - dimensions = len(vectors[0]) - means = self._means - priors = self._priors - if not priors: - priors = self._priors = ( - numpy.ones(self._num_clusters, numpy.float64) / self._num_clusters - ) - covariances = self._covariance_matrices - if not covariances: - covariances = self._covariance_matrices = [ - numpy.identity(dimensions, numpy.float64) - for i in range(self._num_clusters) - ] - - # do the E and M steps until the likelihood plateaus - lastl = self._loglikelihood(vectors, priors, means, covariances) - converged = False - - while not converged: - if trace: - print("iteration; loglikelihood", lastl) - # E-step, calculate hidden variables, h[i,j] - h = numpy.zeros((len(vectors), self._num_clusters), numpy.float64) - for i in range(len(vectors)): - for j in range(self._num_clusters): - h[i, j] = priors[j] * self._gaussian( - means[j], covariances[j], vectors[i] - ) - h[i, :] /= sum(h[i, :]) - - # M-step, update parameters - cvm, p, mean - for j in range(self._num_clusters): - covariance_before = covariances[j] - new_covariance = numpy.zeros((dimensions, dimensions), numpy.float64) - new_mean = numpy.zeros(dimensions, numpy.float64) - sum_hj = 0.0 - for i in range(len(vectors)): - delta = vectors[i] - means[j] - new_covariance += h[i, j] * numpy.multiply.outer(delta, delta) - sum_hj += h[i, j] - new_mean += h[i, j] * vectors[i] - covariances[j] = new_covariance / sum_hj - means[j] = new_mean / sum_hj - priors[j] = sum_hj / len(vectors) - - # bias term to stop covariance matrix being singular - covariances[j] += self._bias * numpy.identity(dimensions, numpy.float64) - - # calculate likelihood - FIXME: may be broken - l = self._loglikelihood(vectors, priors, means, covariances) - - # check for convergence - if abs(lastl - l) < self._conv_threshold: - converged = True - lastl = l - - def classify_vectorspace(self, vector): - best = None - for j in range(self._num_clusters): - p = self._priors[j] * self._gaussian( - self._means[j], self._covariance_matrices[j], vector - ) - if not best or p > best[0]: - best = (p, j) - return best[1] - - def likelihood_vectorspace(self, vector, cluster): - cid = self.cluster_names().index(cluster) - return self._priors[cluster] * self._gaussian( - self._means[cluster], self._covariance_matrices[cluster], 
vector - ) - - def _gaussian(self, mean, cvm, x): - m = len(mean) - assert cvm.shape == (m, m), "bad sized covariance matrix, %s" % str(cvm.shape) - try: - det = numpy.linalg.det(cvm) - inv = numpy.linalg.inv(cvm) - a = det**-0.5 * (2 * numpy.pi) ** (-m / 2.0) - dx = x - mean - print(dx, inv) - b = -0.5 * numpy.dot(numpy.dot(dx, inv), dx) - return a * numpy.exp(b) - except OverflowError: - # happens when the exponent is negative infinity - i.e. b = 0 - # i.e. the inverse of cvm is huge (cvm is almost zero) - return 0 - - def _loglikelihood(self, vectors, priors, means, covariances): - llh = 0.0 - for vector in vectors: - p = 0 - for j in range(len(priors)): - p += priors[j] * self._gaussian(means[j], covariances[j], vector) - llh += numpy.log(p) - return llh - - def __repr__(self): - return "" % list(self._means) - - -def demo(): - """ - Non-interactive demonstration of the clusterers with simple 2-D data. - """ - - from nltk import cluster - - # example from figure 14.10, page 519, Manning and Schutze - - vectors = [numpy.array(f) for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]]] - means = [[4, 2], [4, 2.01]] - - clusterer = cluster.EMClusterer(means, bias=0.1) - clusters = clusterer.cluster(vectors, True, trace=True) - - print("Clustered:", vectors) - print("As: ", clusters) - print() - - for c in range(2): - print("Cluster:", c) - print("Prior: ", clusterer._priors[c]) - print("Mean: ", clusterer._means[c]) - print("Covar: ", clusterer._covariance_matrices[c]) - print() - - # classify a new vector - vector = numpy.array([2, 2]) - print("classify(%s):" % vector, end=" ") - print(clusterer.classify(vector)) - - # show the classification probabilities - vector = numpy.array([2, 2]) - print("classification_probdist(%s):" % vector) - pdist = clusterer.classification_probdist(vector) - for sample in pdist.samples(): - print(f"{sample} => {pdist.prob(sample) * 100:.0f}%") - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/cluster/gaac.py b/pipeline/nltk/cluster/gaac.py deleted file mode 100644 index 6fb9e2c51141ba915bf4defe2d8cdeadaa14e6b0..0000000000000000000000000000000000000000 --- a/pipeline/nltk/cluster/gaac.py +++ /dev/null @@ -1,170 +0,0 @@ -# Natural Language Toolkit: Group Average Agglomerative Clusterer -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Trevor Cohn -# URL: -# For license information, see LICENSE.TXT - -try: - import numpy -except ImportError: - pass - -from nltk.cluster.util import Dendrogram, VectorSpaceClusterer, cosine_distance - - -class GAAClusterer(VectorSpaceClusterer): - """ - The Group Average Agglomerative starts with each of the N vectors as singleton - clusters. It then iteratively merges pairs of clusters which have the - closest centroids. This continues until there is only one cluster. The - order of merges gives rise to a dendrogram: a tree with the earlier merges - lower than later merges. The membership of a given number of clusters c, 1 - <= c <= N, can be found by cutting the dendrogram at depth c. - - This clusterer uses the cosine similarity metric only, which allows for - efficient speed-up in the clustering process. 
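A minimal usage sketch of the GAAC clusterer, patterned on the demo() at the bottom of this file; it cuts the dendrogram into four clusters and prints an ASCII rendering of the merge order:

from numpy import array
from nltk.cluster import GAAClusterer

vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

clusterer = GAAClusterer(4)                 # keep merging until 4 clusters remain
clusters = clusterer.cluster(vectors, True)
clusterer.dendrogram().show()               # ASCII dendrogram of the merge order
print(clusterer.classify(array([3, 3])))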
- """ - - def __init__(self, num_clusters=1, normalise=True, svd_dimensions=None): - VectorSpaceClusterer.__init__(self, normalise, svd_dimensions) - self._num_clusters = num_clusters - self._dendrogram = None - self._groups_values = None - - def cluster(self, vectors, assign_clusters=False, trace=False): - # stores the merge order - self._dendrogram = Dendrogram( - [numpy.array(vector, numpy.float64) for vector in vectors] - ) - return VectorSpaceClusterer.cluster(self, vectors, assign_clusters, trace) - - def cluster_vectorspace(self, vectors, trace=False): - # variables describing the initial situation - N = len(vectors) - cluster_len = [1] * N - cluster_count = N - index_map = numpy.arange(N) - - # construct the similarity matrix - dims = (N, N) - dist = numpy.ones(dims, dtype=float) * numpy.inf - for i in range(N): - for j in range(i + 1, N): - dist[i, j] = cosine_distance(vectors[i], vectors[j]) - - while cluster_count > max(self._num_clusters, 1): - i, j = numpy.unravel_index(dist.argmin(), dims) - if trace: - print("merging %d and %d" % (i, j)) - - # update similarities for merging i and j - self._merge_similarities(dist, cluster_len, i, j) - - # remove j - dist[:, j] = numpy.inf - dist[j, :] = numpy.inf - - # merge the clusters - cluster_len[i] = cluster_len[i] + cluster_len[j] - self._dendrogram.merge(index_map[i], index_map[j]) - cluster_count -= 1 - - # update the index map to reflect the indexes if we - # had removed j - index_map[j + 1 :] -= 1 - index_map[j] = N - - self.update_clusters(self._num_clusters) - - def _merge_similarities(self, dist, cluster_len, i, j): - # the new cluster i merged from i and j adopts the average of - # i and j's similarity to each other cluster, weighted by the - # number of points in the clusters i and j - i_weight = cluster_len[i] - j_weight = cluster_len[j] - weight_sum = i_weight + j_weight - - # update for x 0 - if self._should_normalise: - centroid = self._normalise(cluster[0]) - else: - centroid = numpy.array(cluster[0]) - for vector in cluster[1:]: - if self._should_normalise: - centroid += self._normalise(vector) - else: - centroid += vector - centroid /= len(cluster) - self._centroids.append(centroid) - self._num_clusters = len(self._centroids) - - def classify_vectorspace(self, vector): - best = None - for i in range(self._num_clusters): - centroid = self._centroids[i] - dist = cosine_distance(vector, centroid) - if not best or dist < best[0]: - best = (dist, i) - return best[1] - - def dendrogram(self): - """ - :return: The dendrogram representing the current clustering - :rtype: Dendrogram - """ - return self._dendrogram - - def num_clusters(self): - return self._num_clusters - - def __repr__(self): - return "" % self._num_clusters - - -def demo(): - """ - Non-interactive demonstration of the clusterers with simple 2-D data. 
- """ - - from nltk.cluster import GAAClusterer - - # use a set of tokens with 2D indices - vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]] - - # test the GAAC clusterer with 4 clusters - clusterer = GAAClusterer(4) - clusters = clusterer.cluster(vectors, True) - - print("Clusterer:", clusterer) - print("Clustered:", vectors) - print("As:", clusters) - print() - - # show the dendrogram - clusterer.dendrogram().show() - - # classify a new vector - vector = numpy.array([3, 3]) - print("classify(%s):" % vector, end=" ") - print(clusterer.classify(vector)) - print() - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/cluster/kmeans.py b/pipeline/nltk/cluster/kmeans.py deleted file mode 100644 index 6b0d02f7dc0178f5bb1406d7a71a07ae46acaa93..0000000000000000000000000000000000000000 --- a/pipeline/nltk/cluster/kmeans.py +++ /dev/null @@ -1,231 +0,0 @@ -# Natural Language Toolkit: K-Means Clusterer -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Trevor Cohn -# URL: -# For license information, see LICENSE.TXT - -import copy -import random -import sys - -try: - import numpy -except ImportError: - pass - - -from nltk.cluster.util import VectorSpaceClusterer - - -class KMeansClusterer(VectorSpaceClusterer): - """ - The K-means clusterer starts with k arbitrary chosen means then allocates - each vector to the cluster with the closest mean. It then recalculates the - means of each cluster as the centroid of the vectors in the cluster. This - process repeats until the cluster memberships stabilise. This is a - hill-climbing algorithm which may converge to a local maximum. Hence the - clustering is often repeated with random initial means and the most - commonly occurring output means are chosen. - """ - - def __init__( - self, - num_means, - distance, - repeats=1, - conv_test=1e-6, - initial_means=None, - normalise=False, - svd_dimensions=None, - rng=None, - avoid_empty_clusters=False, - ): - - """ - :param num_means: the number of means to use (may use fewer) - :type num_means: int - :param distance: measure of distance between two vectors - :type distance: function taking two vectors and returning a float - :param repeats: number of randomised clustering trials to use - :type repeats: int - :param conv_test: maximum variation in mean differences before - deemed convergent - :type conv_test: number - :param initial_means: set of k initial means - :type initial_means: sequence of vectors - :param normalise: should vectors be normalised to length 1 - :type normalise: boolean - :param svd_dimensions: number of dimensions to use in reducing vector - dimensionsionality with SVD - :type svd_dimensions: int - :param rng: random number generator (or None) - :type rng: Random - :param avoid_empty_clusters: include current centroid in computation - of next one; avoids undefined behavior - when clusters become empty - :type avoid_empty_clusters: boolean - """ - VectorSpaceClusterer.__init__(self, normalise, svd_dimensions) - self._num_means = num_means - self._distance = distance - self._max_difference = conv_test - assert not initial_means or len(initial_means) == num_means - self._means = initial_means - assert repeats >= 1 - assert not (initial_means and repeats > 1) - self._repeats = repeats - self._rng = rng if rng else random.Random() - self._avoid_empty_clusters = avoid_empty_clusters - - def cluster_vectorspace(self, vectors, trace=False): - if self._means and self._repeats > 1: - print("Warning: means will be discarded for subsequent trials") - - 
meanss = [] - for trial in range(self._repeats): - if trace: - print("k-means trial", trial) - if not self._means or trial > 1: - self._means = self._rng.sample(list(vectors), self._num_means) - self._cluster_vectorspace(vectors, trace) - meanss.append(self._means) - - if len(meanss) > 1: - # sort the means first (so that different cluster numbering won't - # effect the distance comparison) - for means in meanss: - means.sort(key=sum) - - # find the set of means that's minimally different from the others - min_difference = min_means = None - for i in range(len(meanss)): - d = 0 - for j in range(len(meanss)): - if i != j: - d += self._sum_distances(meanss[i], meanss[j]) - if min_difference is None or d < min_difference: - min_difference, min_means = d, meanss[i] - - # use the best means - self._means = min_means - - def _cluster_vectorspace(self, vectors, trace=False): - if self._num_means < len(vectors): - # perform k-means clustering - converged = False - while not converged: - # assign the tokens to clusters based on minimum distance to - # the cluster means - clusters = [[] for m in range(self._num_means)] - for vector in vectors: - index = self.classify_vectorspace(vector) - clusters[index].append(vector) - - if trace: - print("iteration") - # for i in range(self._num_means): - # print ' mean', i, 'allocated', len(clusters[i]), 'vectors' - - # recalculate cluster means by computing the centroid of each cluster - new_means = list(map(self._centroid, clusters, self._means)) - - # measure the degree of change from the previous step for convergence - difference = self._sum_distances(self._means, new_means) - if difference < self._max_difference: - converged = True - - # remember the new means - self._means = new_means - - def classify_vectorspace(self, vector): - # finds the closest cluster centroid - # returns that cluster's index - best_distance = best_index = None - for index in range(len(self._means)): - mean = self._means[index] - dist = self._distance(vector, mean) - if best_distance is None or dist < best_distance: - best_index, best_distance = index, dist - return best_index - - def num_clusters(self): - if self._means: - return len(self._means) - else: - return self._num_means - - def means(self): - """ - The means used for clustering. 
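A usage sketch patterned on the demo() below, with a fixed rng added so the repeated random restarts are reproducible (the rng argument is accepted by the constructor above):

import random
from numpy import array
from nltk.cluster import KMeansClusterer, euclidean_distance

vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

# 2 means, 10 random restarts; the most self-consistent set of means is kept.
clusterer = KMeansClusterer(2, euclidean_distance, repeats=10, rng=random.Random(0))
print(clusterer.cluster(vectors, assign_clusters=True))
print(clusterer.means())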
- """ - return self._means - - def _sum_distances(self, vectors1, vectors2): - difference = 0.0 - for u, v in zip(vectors1, vectors2): - difference += self._distance(u, v) - return difference - - def _centroid(self, cluster, mean): - if self._avoid_empty_clusters: - centroid = copy.copy(mean) - for vector in cluster: - centroid += vector - return centroid / (1 + len(cluster)) - else: - if not len(cluster): - sys.stderr.write("Error: no centroid defined for empty cluster.\n") - sys.stderr.write( - "Try setting argument 'avoid_empty_clusters' to True\n" - ) - assert False - centroid = copy.copy(cluster[0]) - for vector in cluster[1:]: - centroid += vector - return centroid / len(cluster) - - def __repr__(self): - return "" % (self._means, self._repeats) - - -################################################################################# - - -def demo(): - # example from figure 14.9, page 517, Manning and Schutze - - from nltk.cluster import KMeansClusterer, euclidean_distance - - vectors = [numpy.array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]] - means = [[4, 3], [5, 5]] - - clusterer = KMeansClusterer(2, euclidean_distance, initial_means=means) - clusters = clusterer.cluster(vectors, True, trace=True) - - print("Clustered:", vectors) - print("As:", clusters) - print("Means:", clusterer.means()) - print() - - vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]] - - # test k-means using the euclidean distance metric, 2 means and repeat - # clustering 10 times with random seeds - - clusterer = KMeansClusterer(2, euclidean_distance, repeats=10) - clusters = clusterer.cluster(vectors, True) - print("Clustered:", vectors) - print("As:", clusters) - print("Means:", clusterer.means()) - print() - - # classify a new vector - vector = numpy.array([3, 3]) - print("classify(%s):" % vector, end=" ") - print(clusterer.classify(vector)) - print() - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/cluster/util.py b/pipeline/nltk/cluster/util.py deleted file mode 100644 index 8b8ed5e9f0b97be7ce80eef87d36fdbf8c59bdfb..0000000000000000000000000000000000000000 --- a/pipeline/nltk/cluster/util.py +++ /dev/null @@ -1,300 +0,0 @@ -# Natural Language Toolkit: Clusterer Utilities -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Trevor Cohn -# Contributor: J Richard Snape -# URL: -# For license information, see LICENSE.TXT -import copy -from abc import abstractmethod -from math import sqrt -from sys import stdout - -try: - import numpy -except ImportError: - pass - -from nltk.cluster.api import ClusterI - - -class VectorSpaceClusterer(ClusterI): - """ - Abstract clusterer which takes tokens and maps them into a vector space. - Optionally performs singular value decomposition to reduce the - dimensionality. 
- """ - - def __init__(self, normalise=False, svd_dimensions=None): - """ - :param normalise: should vectors be normalised to length 1 - :type normalise: boolean - :param svd_dimensions: number of dimensions to use in reducing vector - dimensionsionality with SVD - :type svd_dimensions: int - """ - self._Tt = None - self._should_normalise = normalise - self._svd_dimensions = svd_dimensions - - def cluster(self, vectors, assign_clusters=False, trace=False): - assert len(vectors) > 0 - - # normalise the vectors - if self._should_normalise: - vectors = list(map(self._normalise, vectors)) - - # use SVD to reduce the dimensionality - if self._svd_dimensions and self._svd_dimensions < len(vectors[0]): - [u, d, vt] = numpy.linalg.svd(numpy.transpose(numpy.array(vectors))) - S = d[: self._svd_dimensions] * numpy.identity( - self._svd_dimensions, numpy.float64 - ) - T = u[:, : self._svd_dimensions] - Dt = vt[: self._svd_dimensions, :] - vectors = numpy.transpose(numpy.dot(S, Dt)) - self._Tt = numpy.transpose(T) - - # call abstract method to cluster the vectors - self.cluster_vectorspace(vectors, trace) - - # assign the vectors to clusters - if assign_clusters: - return [self.classify(vector) for vector in vectors] - - @abstractmethod - def cluster_vectorspace(self, vectors, trace): - """ - Finds the clusters using the given set of vectors. - """ - - def classify(self, vector): - if self._should_normalise: - vector = self._normalise(vector) - if self._Tt is not None: - vector = numpy.dot(self._Tt, vector) - cluster = self.classify_vectorspace(vector) - return self.cluster_name(cluster) - - @abstractmethod - def classify_vectorspace(self, vector): - """ - Returns the index of the appropriate cluster for the vector. - """ - - def likelihood(self, vector, label): - if self._should_normalise: - vector = self._normalise(vector) - if self._Tt is not None: - vector = numpy.dot(self._Tt, vector) - return self.likelihood_vectorspace(vector, label) - - def likelihood_vectorspace(self, vector, cluster): - """ - Returns the likelihood of the vector belonging to the cluster. - """ - predicted = self.classify_vectorspace(vector) - return 1.0 if cluster == predicted else 0.0 - - def vector(self, vector): - """ - Returns the vector after normalisation and dimensionality reduction - """ - if self._should_normalise: - vector = self._normalise(vector) - if self._Tt is not None: - vector = numpy.dot(self._Tt, vector) - return vector - - def _normalise(self, vector): - """ - Normalises the vector to unit length. - """ - return vector / sqrt(numpy.dot(vector, vector)) - - -def euclidean_distance(u, v): - """ - Returns the euclidean distance between vectors u and v. This is equivalent - to the length of the vector (u - v). - """ - diff = u - v - return sqrt(numpy.dot(diff, diff)) - - -def cosine_distance(u, v): - """ - Returns 1 minus the cosine of the angle between vectors v and u. This is - equal to ``1 - (u.v / |u||v|)``. 
- """ - return 1 - (numpy.dot(u, v) / (sqrt(numpy.dot(u, u)) * sqrt(numpy.dot(v, v)))) - - -class _DendrogramNode: - """Tree node of a dendrogram.""" - - def __init__(self, value, *children): - self._value = value - self._children = children - - def leaves(self, values=True): - if self._children: - leaves = [] - for child in self._children: - leaves.extend(child.leaves(values)) - return leaves - elif values: - return [self._value] - else: - return [self] - - def groups(self, n): - queue = [(self._value, self)] - - while len(queue) < n: - priority, node = queue.pop() - if not node._children: - queue.push((priority, node)) - break - for child in node._children: - if child._children: - queue.append((child._value, child)) - else: - queue.append((0, child)) - # makes the earliest merges at the start, latest at the end - queue.sort() - - groups = [] - for priority, node in queue: - groups.append(node.leaves()) - return groups - - def __lt__(self, comparator): - return cosine_distance(self._value, comparator._value) < 0 - - -class Dendrogram: - """ - Represents a dendrogram, a tree with a specified branching order. This - must be initialised with the leaf items, then iteratively call merge for - each branch. This class constructs a tree representing the order of calls - to the merge function. - """ - - def __init__(self, items=[]): - """ - :param items: the items at the leaves of the dendrogram - :type items: sequence of (any) - """ - self._items = [_DendrogramNode(item) for item in items] - self._original_items = copy.copy(self._items) - self._merge = 1 - - def merge(self, *indices): - """ - Merges nodes at given indices in the dendrogram. The nodes will be - combined which then replaces the first node specified. All other nodes - involved in the merge will be removed. - - :param indices: indices of the items to merge (at least two) - :type indices: seq of int - """ - assert len(indices) >= 2 - node = _DendrogramNode(self._merge, *(self._items[i] for i in indices)) - self._merge += 1 - self._items[indices[0]] = node - for i in indices[1:]: - del self._items[i] - - def groups(self, n): - """ - Finds the n-groups of items (leaves) reachable from a cut at depth n. - :param n: number of groups - :type n: int - """ - if len(self._items) > 1: - root = _DendrogramNode(self._merge, *self._items) - else: - root = self._items[0] - return root.groups(n) - - def show(self, leaf_labels=[]): - """ - Print the dendrogram in ASCII art to standard out. 
- - :param leaf_labels: an optional list of strings to use for labeling the - leaves - :type leaf_labels: list - """ - - # ASCII rendering characters - JOIN, HLINK, VLINK = "+", "-", "|" - - # find the root (or create one) - if len(self._items) > 1: - root = _DendrogramNode(self._merge, *self._items) - else: - root = self._items[0] - leaves = self._original_items - - if leaf_labels: - last_row = leaf_labels - else: - last_row = ["%s" % leaf._value for leaf in leaves] - - # find the bottom row and the best cell width - width = max(map(len, last_row)) + 1 - lhalf = width // 2 - rhalf = int(width - lhalf - 1) - - # display functions - def format(centre, left=" ", right=" "): - return f"{lhalf * left}{centre}{right * rhalf}" - - def display(str): - stdout.write(str) - - # for each merge, top down - queue = [(root._value, root)] - verticals = [format(" ") for leaf in leaves] - while queue: - priority, node = queue.pop() - child_left_leaf = list(map(lambda c: c.leaves(False)[0], node._children)) - indices = list(map(leaves.index, child_left_leaf)) - if child_left_leaf: - min_idx = min(indices) - max_idx = max(indices) - for i in range(len(leaves)): - if leaves[i] in child_left_leaf: - if i == min_idx: - display(format(JOIN, " ", HLINK)) - elif i == max_idx: - display(format(JOIN, HLINK, " ")) - else: - display(format(JOIN, HLINK, HLINK)) - verticals[i] = format(VLINK) - elif min_idx <= i <= max_idx: - display(format(HLINK, HLINK, HLINK)) - else: - display(verticals[i]) - display("\n") - for child in node._children: - if child._children: - queue.append((child._value, child)) - queue.sort() - - for vertical in verticals: - display(vertical) - display("\n") - - # finally, display the last line - display("".join(item.center(width) for item in last_row)) - display("\n") - - def __repr__(self): - if len(self._items) > 1: - root = _DendrogramNode(self._merge, *self._items) - else: - root = self._items[0] - leaves = root.leaves(False) - return "" % len(leaves) diff --git a/pipeline/nltk/collections.py b/pipeline/nltk/collections.py deleted file mode 100644 index 89ade62b665a4b51e63d49e26ef4ce41001efcd1..0000000000000000000000000000000000000000 --- a/pipeline/nltk/collections.py +++ /dev/null @@ -1,661 +0,0 @@ -# Natural Language Toolkit: Collections -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# URL: -# For license information, see LICENSE.TXT - -import bisect - -# this unused import is for python 2.7 -from collections import Counter, defaultdict, deque -from functools import total_ordering -from itertools import chain, islice - -from nltk.internals import raise_unorderable_types, slice_bounds - -########################################################################## -# Ordered Dictionary -########################################################################## - - -class OrderedDict(dict): - def __init__(self, data=None, **kwargs): - self._keys = self.keys(data, kwargs.get("keys")) - self._default_factory = kwargs.get("default_factory") - if data is None: - dict.__init__(self) - else: - dict.__init__(self, data) - - def __delitem__(self, key): - dict.__delitem__(self, key) - self._keys.remove(key) - - def __getitem__(self, key): - try: - return dict.__getitem__(self, key) - except KeyError: - return self.__missing__(key) - - def __iter__(self): - return (key for key in self.keys()) - - def __missing__(self, key): - if not self._default_factory and key not in self._keys: - raise KeyError() - return self._default_factory() - - def __setitem__(self, key, item): - 
dict.__setitem__(self, key, item) - if key not in self._keys: - self._keys.append(key) - - def clear(self): - dict.clear(self) - self._keys.clear() - - def copy(self): - d = dict.copy(self) - d._keys = self._keys - return d - - def items(self): - # returns iterator under python 3 and list under python 2 - return zip(self.keys(), self.values()) - - def keys(self, data=None, keys=None): - if data: - if keys: - assert isinstance(keys, list) - assert len(data) == len(keys) - return keys - else: - assert ( - isinstance(data, dict) - or isinstance(data, OrderedDict) - or isinstance(data, list) - ) - if isinstance(data, dict) or isinstance(data, OrderedDict): - return data.keys() - elif isinstance(data, list): - return [key for (key, value) in data] - elif "_keys" in self.__dict__: - return self._keys - else: - return [] - - def popitem(self): - if not self._keys: - raise KeyError() - - key = self._keys.pop() - value = self[key] - del self[key] - return (key, value) - - def setdefault(self, key, failobj=None): - dict.setdefault(self, key, failobj) - if key not in self._keys: - self._keys.append(key) - - def update(self, data): - dict.update(self, data) - for key in self.keys(data): - if key not in self._keys: - self._keys.append(key) - - def values(self): - # returns iterator under python 3 - return map(self.get, self._keys) - - -###################################################################### -# Lazy Sequences -###################################################################### - - -@total_ordering -class AbstractLazySequence: - """ - An abstract base class for read-only sequences whose values are - computed as needed. Lazy sequences act like tuples -- they can be - indexed, sliced, and iterated over; but they may not be modified. - - The most common application of lazy sequences in NLTK is for - corpus view objects, which provide access to the contents of a - corpus without loading the entire corpus into memory, by loading - pieces of the corpus from disk as needed. - - The result of modifying a mutable element of a lazy sequence is - undefined. In particular, the modifications made to the element - may or may not persist, depending on whether and when the lazy - sequence caches that element's value or reconstructs it from - scratch. - - Subclasses are required to define two methods: ``__len__()`` - and ``iterate_from()``. - """ - - def __len__(self): - """ - Return the number of tokens in the corpus file underlying this - corpus view. - """ - raise NotImplementedError("should be implemented by subclass") - - def iterate_from(self, start): - """ - Return an iterator that generates the tokens in the corpus - file underlying this corpus view, starting at the token number - ``start``. If ``start>=len(self)``, then this iterator will - generate no tokens. - """ - raise NotImplementedError("should be implemented by subclass") - - def __getitem__(self, i): - """ - Return the *i* th token in the corpus file underlying this - corpus view. Negative indices and spans are both supported. - """ - if isinstance(i, slice): - start, stop = slice_bounds(self, i) - return LazySubsequence(self, start, stop) - else: - # Handle negative indices - if i < 0: - i += len(self) - if i < 0: - raise IndexError("index out of range") - # Use iterate_from to extract it. 
- try: - return next(self.iterate_from(i)) - except StopIteration as e: - raise IndexError("index out of range") from e - - def __iter__(self): - """Return an iterator that generates the tokens in the corpus - file underlying this corpus view.""" - return self.iterate_from(0) - - def count(self, value): - """Return the number of times this list contains ``value``.""" - return sum(1 for elt in self if elt == value) - - def index(self, value, start=None, stop=None): - """Return the index of the first occurrence of ``value`` in this - list that is greater than or equal to ``start`` and less than - ``stop``. Negative start and stop values are treated like negative - slice bounds -- i.e., they count from the end of the list.""" - start, stop = slice_bounds(self, slice(start, stop)) - for i, elt in enumerate(islice(self, start, stop)): - if elt == value: - return i + start - raise ValueError("index(x): x not in list") - - def __contains__(self, value): - """Return true if this list contains ``value``.""" - return bool(self.count(value)) - - def __add__(self, other): - """Return a list concatenating self with other.""" - return LazyConcatenation([self, other]) - - def __radd__(self, other): - """Return a list concatenating other with self.""" - return LazyConcatenation([other, self]) - - def __mul__(self, count): - """Return a list concatenating self with itself ``count`` times.""" - return LazyConcatenation([self] * count) - - def __rmul__(self, count): - """Return a list concatenating self with itself ``count`` times.""" - return LazyConcatenation([self] * count) - - _MAX_REPR_SIZE = 60 - - def __repr__(self): - """ - Return a string representation for this corpus view that is - similar to a list's representation; but if it would be more - than 60 characters long, it is truncated. - """ - pieces = [] - length = 5 - for elt in self: - pieces.append(repr(elt)) - length += len(pieces[-1]) + 2 - if length > self._MAX_REPR_SIZE and len(pieces) > 2: - return "[%s, ...]" % ", ".join(pieces[:-1]) - return "[%s]" % ", ".join(pieces) - - def __eq__(self, other): - return type(self) == type(other) and list(self) == list(other) - - def __ne__(self, other): - return not self == other - - def __lt__(self, other): - if type(other) != type(self): - raise_unorderable_types("<", self, other) - return list(self) < list(other) - - def __hash__(self): - """ - :raise ValueError: Corpus view objects are unhashable. - """ - raise ValueError("%s objects are unhashable" % self.__class__.__name__) - - -class LazySubsequence(AbstractLazySequence): - """ - A subsequence produced by slicing a lazy sequence. This slice - keeps a reference to its source sequence, and generates its values - by looking them up in the source sequence. - """ - - MIN_SIZE = 100 - """ - The minimum size for which lazy slices should be created. If - ``LazySubsequence()`` is called with a subsequence that is - shorter than ``MIN_SIZE``, then a tuple will be returned instead. - """ - - def __new__(cls, source, start, stop): - """ - Construct a new slice from a given underlying sequence. The - ``start`` and ``stop`` indices should be absolute indices -- - i.e., they should not be negative (for indexing from the back - of a list) or greater than the length of ``source``. - """ - # If the slice is small enough, just use a tuple. 
- if stop - start < cls.MIN_SIZE: - return list(islice(source.iterate_from(start), stop - start)) - else: - return object.__new__(cls) - - def __init__(self, source, start, stop): - self._source = source - self._start = start - self._stop = stop - - def __len__(self): - return self._stop - self._start - - def iterate_from(self, start): - return islice( - self._source.iterate_from(start + self._start), max(0, len(self) - start) - ) - - -class LazyConcatenation(AbstractLazySequence): - """ - A lazy sequence formed by concatenating a list of lists. This - underlying list of lists may itself be lazy. ``LazyConcatenation`` - maintains an index that it uses to keep track of the relationship - between offsets in the concatenated lists and offsets in the - sublists. - """ - - def __init__(self, list_of_lists): - self._list = list_of_lists - self._offsets = [0] - - def __len__(self): - if len(self._offsets) <= len(self._list): - for _ in self.iterate_from(self._offsets[-1]): - pass - return self._offsets[-1] - - def iterate_from(self, start_index): - if start_index < self._offsets[-1]: - sublist_index = bisect.bisect_right(self._offsets, start_index) - 1 - else: - sublist_index = len(self._offsets) - 1 - - index = self._offsets[sublist_index] - - # Construct an iterator over the sublists. - if isinstance(self._list, AbstractLazySequence): - sublist_iter = self._list.iterate_from(sublist_index) - else: - sublist_iter = islice(self._list, sublist_index, None) - - for sublist in sublist_iter: - if sublist_index == (len(self._offsets) - 1): - assert ( - index + len(sublist) >= self._offsets[-1] - ), "offsets not monotonic increasing!" - self._offsets.append(index + len(sublist)) - else: - assert self._offsets[sublist_index + 1] == index + len( - sublist - ), "inconsistent list value (num elts)" - - yield from sublist[max(0, start_index - index) :] - - index += len(sublist) - sublist_index += 1 - - -class LazyMap(AbstractLazySequence): - """ - A lazy sequence whose elements are formed by applying a given - function to each element in one or more underlying lists. The - function is applied lazily -- i.e., when you read a value from the - list, ``LazyMap`` will calculate that value by applying its - function to the underlying lists' value(s). ``LazyMap`` is - essentially a lazy version of the Python primitive function - ``map``. In particular, the following two expressions are - equivalent: - - >>> from nltk.collections import LazyMap - >>> function = str - >>> sequence = [1,2,3] - >>> map(function, sequence) # doctest: +SKIP - ['1', '2', '3'] - >>> list(LazyMap(function, sequence)) - ['1', '2', '3'] - - Like the Python ``map`` primitive, if the source lists do not have - equal size, then the value None will be supplied for the - 'missing' elements. - - Lazy maps can be useful for conserving memory, in cases where - individual values take up a lot of space. This is especially true - if the underlying list's values are constructed lazily, as is the - case with many corpus readers. - - A typical example of a use case for this class is performing - feature detection on the tokens in a corpus. Since featuresets - are encoded as dictionaries, which can take up a lot of memory, - using a ``LazyMap`` can significantly reduce memory usage when - training and running classifiers. - """ - - def __init__(self, function, *lists, **config): - """ - :param function: The function that should be applied to - elements of ``lists``. It should take as many arguments - as there are ``lists``. 
- :param lists: The underlying lists. - :param cache_size: Determines the size of the cache used - by this lazy map. (default=5) - """ - if not lists: - raise TypeError("LazyMap requires at least two args") - - self._lists = lists - self._func = function - self._cache_size = config.get("cache_size", 5) - self._cache = {} if self._cache_size > 0 else None - - # If you just take bool() of sum() here _all_lazy will be true just - # in case n >= 1 list is an AbstractLazySequence. Presumably this - # isn't what's intended. - self._all_lazy = sum( - isinstance(lst, AbstractLazySequence) for lst in lists - ) == len(lists) - - def iterate_from(self, index): - # Special case: one lazy sublist - if len(self._lists) == 1 and self._all_lazy: - for value in self._lists[0].iterate_from(index): - yield self._func(value) - return - - # Special case: one non-lazy sublist - elif len(self._lists) == 1: - while True: - try: - yield self._func(self._lists[0][index]) - except IndexError: - return - index += 1 - - # Special case: n lazy sublists - elif self._all_lazy: - iterators = [lst.iterate_from(index) for lst in self._lists] - while True: - elements = [] - for iterator in iterators: - try: - elements.append(next(iterator)) - except: # FIXME: What is this except really catching? StopIteration? - elements.append(None) - if elements == [None] * len(self._lists): - return - yield self._func(*elements) - index += 1 - - # general case - else: - while True: - try: - elements = [lst[index] for lst in self._lists] - except IndexError: - elements = [None] * len(self._lists) - for i, lst in enumerate(self._lists): - try: - elements[i] = lst[index] - except IndexError: - pass - if elements == [None] * len(self._lists): - return - yield self._func(*elements) - index += 1 - - def __getitem__(self, index): - if isinstance(index, slice): - sliced_lists = [lst[index] for lst in self._lists] - return LazyMap(self._func, *sliced_lists) - else: - # Handle negative indices - if index < 0: - index += len(self) - if index < 0: - raise IndexError("index out of range") - # Check the cache - if self._cache is not None and index in self._cache: - return self._cache[index] - # Calculate the value - try: - val = next(self.iterate_from(index)) - except StopIteration as e: - raise IndexError("index out of range") from e - # Update the cache - if self._cache is not None: - if len(self._cache) > self._cache_size: - self._cache.popitem() # discard random entry - self._cache[index] = val - # Return the value - return val - - def __len__(self): - return max(len(lst) for lst in self._lists) - - -class LazyZip(LazyMap): - """ - A lazy sequence whose elements are tuples, each containing the i-th - element from each of the argument sequences. The returned list is - truncated in length to the length of the shortest argument sequence. The - tuples are constructed lazily -- i.e., when you read a value from the - list, ``LazyZip`` will calculate that value by forming a tuple from - the i-th element of each of the argument sequences. - - ``LazyZip`` is essentially a lazy version of the Python primitive function - ``zip``. 
In particular, an evaluated LazyZip is equivalent to a zip: - - >>> from nltk.collections import LazyZip - >>> sequence1, sequence2 = [1, 2, 3], ['a', 'b', 'c'] - >>> zip(sequence1, sequence2) # doctest: +SKIP - [(1, 'a'), (2, 'b'), (3, 'c')] - >>> list(LazyZip(sequence1, sequence2)) - [(1, 'a'), (2, 'b'), (3, 'c')] - >>> sequences = [sequence1, sequence2, [6,7,8,9]] - >>> list(zip(*sequences)) == list(LazyZip(*sequences)) - True - - Lazy zips can be useful for conserving memory in cases where the argument - sequences are particularly long. - - A typical example of a use case for this class is combining long sequences - of gold standard and predicted values in a classification or tagging task - in order to calculate accuracy. By constructing tuples lazily and - avoiding the creation of an additional long sequence, memory usage can be - significantly reduced. - """ - - def __init__(self, *lists): - """ - :param lists: the underlying lists - :type lists: list(list) - """ - LazyMap.__init__(self, lambda *elts: elts, *lists) - - def iterate_from(self, index): - iterator = LazyMap.iterate_from(self, index) - while index < len(self): - yield next(iterator) - index += 1 - return - - def __len__(self): - return min(len(lst) for lst in self._lists) - - -class LazyEnumerate(LazyZip): - """ - A lazy sequence whose elements are tuples, each containing a count (from - zero) and a value yielded by underlying sequence. ``LazyEnumerate`` is - useful for obtaining an indexed list. The tuples are constructed lazily - -- i.e., when you read a value from the list, ``LazyEnumerate`` will - calculate that value by forming a tuple from the count of the i-th - element and the i-th element of the underlying sequence. - - ``LazyEnumerate`` is essentially a lazy version of the Python primitive - function ``enumerate``. In particular, the following two expressions are - equivalent: - - >>> from nltk.collections import LazyEnumerate - >>> sequence = ['first', 'second', 'third'] - >>> list(enumerate(sequence)) - [(0, 'first'), (1, 'second'), (2, 'third')] - >>> list(LazyEnumerate(sequence)) - [(0, 'first'), (1, 'second'), (2, 'third')] - - Lazy enumerations can be useful for conserving memory in cases where the - argument sequences are particularly long. - - A typical example of a use case for this class is obtaining an indexed - list for a long sequence of values. By constructing tuples lazily and - avoiding the creation of an additional long sequence, memory usage can be - significantly reduced. - """ - - def __init__(self, lst): - """ - :param lst: the underlying list - :type lst: list - """ - LazyZip.__init__(self, range(len(lst)), lst) - - -class LazyIteratorList(AbstractLazySequence): - """ - Wraps an iterator, loading its elements on demand - and making them subscriptable. - __repr__ displays only the first few elements. 
- """ - - def __init__(self, it, known_len=None): - self._it = it - self._len = known_len - self._cache = [] - - def __len__(self): - if self._len: - return self._len - for _ in self.iterate_from(len(self._cache)): - pass - self._len = len(self._cache) - return self._len - - def iterate_from(self, start): - """Create a new iterator over this list starting at the given offset.""" - while len(self._cache) < start: - v = next(self._it) - self._cache.append(v) - i = start - while i < len(self._cache): - yield self._cache[i] - i += 1 - try: - while True: - v = next(self._it) - self._cache.append(v) - yield v - except StopIteration: - pass - - def __add__(self, other): - """Return a list concatenating self with other.""" - return type(self)(chain(self, other)) - - def __radd__(self, other): - """Return a list concatenating other with self.""" - return type(self)(chain(other, self)) - - -###################################################################### -# Trie Implementation -###################################################################### -class Trie(dict): - """A Trie implementation for strings""" - - LEAF = True - - def __init__(self, strings=None): - """Builds a Trie object, which is built around a ``dict`` - - If ``strings`` is provided, it will add the ``strings``, which - consist of a ``list`` of ``strings``, to the Trie. - Otherwise, it'll construct an empty Trie. - - :param strings: List of strings to insert into the trie - (Default is ``None``) - :type strings: list(str) - - """ - super().__init__() - if strings: - for string in strings: - self.insert(string) - - def insert(self, string): - """Inserts ``string`` into the Trie - - :param string: String to insert into the trie - :type string: str - - :Example: - - >>> from nltk.collections import Trie - >>> trie = Trie(["abc", "def"]) - >>> expected = {'a': {'b': {'c': {True: None}}}, \ - 'd': {'e': {'f': {True: None}}}} - >>> trie == expected - True - - """ - if len(string): - self[string[0]].insert(string[1:]) - else: - # mark the string is complete - self[Trie.LEAF] = None - - def __missing__(self, key): - self[key] = Trie() - return self[key] diff --git a/pipeline/nltk/collocations.py b/pipeline/nltk/collocations.py deleted file mode 100644 index 2a1fd83ad38e861f0e8db96c24871d40c4ee185e..0000000000000000000000000000000000000000 --- a/pipeline/nltk/collocations.py +++ /dev/null @@ -1,412 +0,0 @@ -# Natural Language Toolkit: Collocations and Association Measures -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Joel Nothman -# URL: -# For license information, see LICENSE.TXT -# -""" -Tools to identify collocations --- words that often appear consecutively ---- within corpora. They may also be used to find other associations between -word occurrences. -See Manning and Schutze ch. 5 at https://nlp.stanford.edu/fsnlp/promo/colloc.pdf -and the Text::NSP Perl package at http://ngram.sourceforge.net - -Finding collocations requires first calculating the frequencies of words and -their appearance in the context of other words. Often the collection of words -will then requiring filtering to only retain useful content terms. Each ngram -of words may then be scored according to some association measure, in order -to determine the relative likelihood of each ngram being a collocation. - -The ``BigramCollocationFinder`` and ``TrigramCollocationFinder`` classes provide -these functionalities, dependent on being provided a function which scores a -ngram given appropriate frequency counts. 
A number of standard association -measures are provided in bigram_measures and trigram_measures. -""" - -# Possible TODOs: -# - consider the distinction between f(x,_) and f(x) and whether our -# approximation is good enough for fragmented data, and mention it -# - add a n-gram collocation finder with measures which only utilise n-gram -# and unigram counts (raw_freq, pmi, student_t) - -import itertools as _itertools - -# these two unused imports are referenced in collocations.doctest -from nltk.metrics import ( - BigramAssocMeasures, - ContingencyMeasures, - QuadgramAssocMeasures, - TrigramAssocMeasures, -) -from nltk.metrics.spearman import ranks_from_scores, spearman_correlation -from nltk.probability import FreqDist -from nltk.util import ngrams - - -class AbstractCollocationFinder: - """ - An abstract base class for collocation finders whose purpose is to - collect collocation candidate frequencies, filter and rank them. - - As a minimum, collocation finders require the frequencies of each - word in a corpus, and the joint frequency of word tuples. This data - should be provided through nltk.probability.FreqDist objects or an - identical interface. - """ - - def __init__(self, word_fd, ngram_fd): - self.word_fd = word_fd - self.N = word_fd.N() - self.ngram_fd = ngram_fd - - @classmethod - def _build_new_documents( - cls, documents, window_size, pad_left=False, pad_right=False, pad_symbol=None - ): - """ - Pad the document with the place holder according to the window_size - """ - padding = (pad_symbol,) * (window_size - 1) - if pad_right: - return _itertools.chain.from_iterable( - _itertools.chain(doc, padding) for doc in documents - ) - if pad_left: - return _itertools.chain.from_iterable( - _itertools.chain(padding, doc) for doc in documents - ) - - @classmethod - def from_documents(cls, documents): - """Constructs a collocation finder given a collection of documents, - each of which is a list (or iterable) of tokens. - """ - # return cls.from_words(_itertools.chain(*documents)) - return cls.from_words( - cls._build_new_documents(documents, cls.default_ws, pad_right=True) - ) - - @staticmethod - def _ngram_freqdist(words, n): - return FreqDist(tuple(words[i : i + n]) for i in range(len(words) - 1)) - - def _apply_filter(self, fn=lambda ngram, freq: False): - """Generic filter removes ngrams from the frequency distribution - if the function returns True when passed an ngram tuple. - """ - tmp_ngram = FreqDist() - for ngram, freq in self.ngram_fd.items(): - if not fn(ngram, freq): - tmp_ngram[ngram] = freq - self.ngram_fd = tmp_ngram - - def apply_freq_filter(self, min_freq): - """Removes candidate ngrams which have frequency less than min_freq.""" - self._apply_filter(lambda ng, freq: freq < min_freq) - - def apply_ngram_filter(self, fn): - """Removes candidate ngrams (w1, w2, ...) where fn(w1, w2, ...) - evaluates to True. - """ - self._apply_filter(lambda ng, f: fn(*ng)) - - def apply_word_filter(self, fn): - """Removes candidate ngrams (w1, w2, ...) where any of (fn(w1), fn(w2), - ...) evaluates to True. - """ - self._apply_filter(lambda ng, f: any(fn(w) for w in ng)) - - def _score_ngrams(self, score_fn): - """Generates of (ngram, score) pairs as determined by the scoring - function provided. 
- """ - for tup in self.ngram_fd: - score = self.score_ngram(score_fn, *tup) - if score is not None: - yield tup, score - - def score_ngrams(self, score_fn): - """Returns a sequence of (ngram, score) pairs ordered from highest to - lowest score, as determined by the scoring function provided. - """ - return sorted(self._score_ngrams(score_fn), key=lambda t: (-t[1], t[0])) - - def nbest(self, score_fn, n): - """Returns the top n ngrams when scored by the given function.""" - return [p for p, s in self.score_ngrams(score_fn)[:n]] - - def above_score(self, score_fn, min_score): - """Returns a sequence of ngrams, ordered by decreasing score, whose - scores each exceed the given minimum score. - """ - for ngram, score in self.score_ngrams(score_fn): - if score > min_score: - yield ngram - else: - break - - -class BigramCollocationFinder(AbstractCollocationFinder): - """A tool for the finding and ranking of bigram collocations or other - association measures. It is often useful to use from_words() rather than - constructing an instance directly. - """ - - default_ws = 2 - - def __init__(self, word_fd, bigram_fd, window_size=2): - """Construct a BigramCollocationFinder, given FreqDists for - appearances of words and (possibly non-contiguous) bigrams. - """ - AbstractCollocationFinder.__init__(self, word_fd, bigram_fd) - self.window_size = window_size - - @classmethod - def from_words(cls, words, window_size=2): - """Construct a BigramCollocationFinder for all bigrams in the given - sequence. When window_size > 2, count non-contiguous bigrams, in the - style of Church and Hanks's (1990) association ratio. - """ - wfd = FreqDist() - bfd = FreqDist() - - if window_size < 2: - raise ValueError("Specify window_size at least 2") - - for window in ngrams(words, window_size, pad_right=True): - w1 = window[0] - if w1 is None: - continue - wfd[w1] += 1 - for w2 in window[1:]: - if w2 is not None: - bfd[(w1, w2)] += 1 - return cls(wfd, bfd, window_size=window_size) - - def score_ngram(self, score_fn, w1, w2): - """Returns the score for a given bigram using the given scoring - function. Following Church and Hanks (1990), counts are scaled by - a factor of 1/(window_size - 1). - """ - n_all = self.N - n_ii = self.ngram_fd[(w1, w2)] / (self.window_size - 1.0) - if not n_ii: - return - n_ix = self.word_fd[w1] - n_xi = self.word_fd[w2] - return score_fn(n_ii, (n_ix, n_xi), n_all) - - -class TrigramCollocationFinder(AbstractCollocationFinder): - """A tool for the finding and ranking of trigram collocations or other - association measures. It is often useful to use from_words() rather than - constructing an instance directly. - """ - - default_ws = 3 - - def __init__(self, word_fd, bigram_fd, wildcard_fd, trigram_fd): - """Construct a TrigramCollocationFinder, given FreqDists for - appearances of words, bigrams, two words with any word between them, - and trigrams. - """ - AbstractCollocationFinder.__init__(self, word_fd, trigram_fd) - self.wildcard_fd = wildcard_fd - self.bigram_fd = bigram_fd - - @classmethod - def from_words(cls, words, window_size=3): - """Construct a TrigramCollocationFinder for all trigrams in the given - sequence. 
- """ - if window_size < 3: - raise ValueError("Specify window_size at least 3") - - wfd = FreqDist() - wildfd = FreqDist() - bfd = FreqDist() - tfd = FreqDist() - for window in ngrams(words, window_size, pad_right=True): - w1 = window[0] - if w1 is None: - continue - for w2, w3 in _itertools.combinations(window[1:], 2): - wfd[w1] += 1 - if w2 is None: - continue - bfd[(w1, w2)] += 1 - if w3 is None: - continue - wildfd[(w1, w3)] += 1 - tfd[(w1, w2, w3)] += 1 - return cls(wfd, bfd, wildfd, tfd) - - def bigram_finder(self): - """Constructs a bigram collocation finder with the bigram and unigram - data from this finder. Note that this does not include any filtering - applied to this finder. - """ - return BigramCollocationFinder(self.word_fd, self.bigram_fd) - - def score_ngram(self, score_fn, w1, w2, w3): - """Returns the score for a given trigram using the given scoring - function. - """ - n_all = self.N - n_iii = self.ngram_fd[(w1, w2, w3)] - if not n_iii: - return - n_iix = self.bigram_fd[(w1, w2)] - n_ixi = self.wildcard_fd[(w1, w3)] - n_xii = self.bigram_fd[(w2, w3)] - n_ixx = self.word_fd[w1] - n_xix = self.word_fd[w2] - n_xxi = self.word_fd[w3] - return score_fn(n_iii, (n_iix, n_ixi, n_xii), (n_ixx, n_xix, n_xxi), n_all) - - -class QuadgramCollocationFinder(AbstractCollocationFinder): - """A tool for the finding and ranking of quadgram collocations or other association measures. - It is often useful to use from_words() rather than constructing an instance directly. - """ - - default_ws = 4 - - def __init__(self, word_fd, quadgram_fd, ii, iii, ixi, ixxi, iixi, ixii): - """Construct a QuadgramCollocationFinder, given FreqDists for appearances of words, - bigrams, trigrams, two words with one word and two words between them, three words - with a word between them in both variations. 
- """ - AbstractCollocationFinder.__init__(self, word_fd, quadgram_fd) - self.iii = iii - self.ii = ii - self.ixi = ixi - self.ixxi = ixxi - self.iixi = iixi - self.ixii = ixii - - @classmethod - def from_words(cls, words, window_size=4): - if window_size < 4: - raise ValueError("Specify window_size at least 4") - ixxx = FreqDist() - iiii = FreqDist() - ii = FreqDist() - iii = FreqDist() - ixi = FreqDist() - ixxi = FreqDist() - iixi = FreqDist() - ixii = FreqDist() - - for window in ngrams(words, window_size, pad_right=True): - w1 = window[0] - if w1 is None: - continue - for w2, w3, w4 in _itertools.combinations(window[1:], 3): - ixxx[w1] += 1 - if w2 is None: - continue - ii[(w1, w2)] += 1 - if w3 is None: - continue - iii[(w1, w2, w3)] += 1 - ixi[(w1, w3)] += 1 - if w4 is None: - continue - iiii[(w1, w2, w3, w4)] += 1 - ixxi[(w1, w4)] += 1 - ixii[(w1, w3, w4)] += 1 - iixi[(w1, w2, w4)] += 1 - - return cls(ixxx, iiii, ii, iii, ixi, ixxi, iixi, ixii) - - def score_ngram(self, score_fn, w1, w2, w3, w4): - n_all = self.N - n_iiii = self.ngram_fd[(w1, w2, w3, w4)] - if not n_iiii: - return - n_iiix = self.iii[(w1, w2, w3)] - n_xiii = self.iii[(w2, w3, w4)] - n_iixi = self.iixi[(w1, w2, w4)] - n_ixii = self.ixii[(w1, w3, w4)] - - n_iixx = self.ii[(w1, w2)] - n_xxii = self.ii[(w3, w4)] - n_xiix = self.ii[(w2, w3)] - n_ixix = self.ixi[(w1, w3)] - n_ixxi = self.ixxi[(w1, w4)] - n_xixi = self.ixi[(w2, w4)] - - n_ixxx = self.word_fd[w1] - n_xixx = self.word_fd[w2] - n_xxix = self.word_fd[w3] - n_xxxi = self.word_fd[w4] - return score_fn( - n_iiii, - (n_iiix, n_iixi, n_ixii, n_xiii), - (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix), - (n_ixxx, n_xixx, n_xxix, n_xxxi), - n_all, - ) - - -def demo(scorer=None, compare_scorer=None): - """Finds bigram collocations in the files of the WebText corpus.""" - from nltk.metrics import ( - BigramAssocMeasures, - ranks_from_scores, - spearman_correlation, - ) - - if scorer is None: - scorer = BigramAssocMeasures.likelihood_ratio - if compare_scorer is None: - compare_scorer = BigramAssocMeasures.raw_freq - - from nltk.corpus import stopwords, webtext - - ignored_words = stopwords.words("english") - word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words - - for file in webtext.fileids(): - words = [word.lower() for word in webtext.words(file)] - - cf = BigramCollocationFinder.from_words(words) - cf.apply_freq_filter(3) - cf.apply_word_filter(word_filter) - - corr = spearman_correlation( - ranks_from_scores(cf.score_ngrams(scorer)), - ranks_from_scores(cf.score_ngrams(compare_scorer)), - ) - print(file) - print("\t", [" ".join(tup) for tup in cf.nbest(scorer, 15)]) - print(f"\t Correlation to {compare_scorer.__name__}: {corr:0.4f}") - - -# Slows down loading too much -# bigram_measures = BigramAssocMeasures() -# trigram_measures = TrigramAssocMeasures() - -if __name__ == "__main__": - import sys - - from nltk.metrics import BigramAssocMeasures - - try: - scorer = eval("BigramAssocMeasures." + sys.argv[1]) - except IndexError: - scorer = None - try: - compare_scorer = eval("BigramAssocMeasures." 
+ sys.argv[2]) - except IndexError: - compare_scorer = None - - demo(scorer, compare_scorer) - -__all__ = [ - "BigramCollocationFinder", - "TrigramCollocationFinder", - "QuadgramCollocationFinder", -] diff --git a/pipeline/nltk/compat.py b/pipeline/nltk/compat.py deleted file mode 100644 index ceedc3992530e4e523dc9d479c26fbb43c918280..0000000000000000000000000000000000000000 --- a/pipeline/nltk/compat.py +++ /dev/null @@ -1,43 +0,0 @@ -# Natural Language Toolkit: Compatibility -# -# Copyright (C) 2001-2023 NLTK Project -# -# URL: -# For license information, see LICENSE.TXT - -import os -from functools import wraps - -# ======= Compatibility for datasets that care about Python versions ======== - -# The following datasets have a /PY3 subdirectory containing -# a full copy of the data which has been re-encoded or repickled. -DATA_UPDATES = [ - ("chunkers", "maxent_ne_chunker"), - ("help", "tagsets"), - ("taggers", "maxent_treebank_pos_tagger"), - ("tokenizers", "punkt"), -] - -_PY3_DATA_UPDATES = [os.path.join(*path_list) for path_list in DATA_UPDATES] - - -def add_py3_data(path): - for item in _PY3_DATA_UPDATES: - if item in str(path) and "/PY3" not in str(path): - pos = path.index(item) + len(item) - if path[pos : pos + 4] == ".zip": - pos += 4 - path = path[:pos] + "/PY3" + path[pos:] - break - return path - - -# for use in adding /PY3 to the second (filename) argument -# of the file pointers in data.py -def py3_data(init_func): - def _decorator(*args, **kwargs): - args = (args[0], add_py3_data(args[1])) + args[2:] - return init_func(*args, **kwargs) - - return wraps(init_func)(_decorator) diff --git a/pipeline/nltk/corpus/__init__.py b/pipeline/nltk/corpus/__init__.py deleted file mode 100644 index 67f565aaa85618c0268c75cd4b1524829712909c..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/__init__.py +++ /dev/null @@ -1,529 +0,0 @@ -# Natural Language Toolkit: Corpus Readers -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# URL: -# For license information, see LICENSE.TXT - -# TODO this docstring isn't up-to-date! -""" -NLTK corpus readers. The modules in this package provide functions -that can be used to read corpus files in a variety of formats. These -functions can be used to read both the corpus files that are -distributed in the NLTK corpus package, and corpus files that are part -of external corpora. - -Available Corpora -================= - -Please see https://www.nltk.org/nltk_data/ for a complete list. -Install corpora using nltk.download(). - -Corpus Reader Functions -======================= -Each corpus module defines one or more "corpus reader functions", -which can be used to read documents from that corpus. These functions -take an argument, ``item``, which is used to indicate which document -should be read from the corpus: - -- If ``item`` is one of the unique identifiers listed in the corpus - module's ``items`` variable, then the corresponding document will - be loaded from the NLTK corpus package. -- If ``item`` is a filename, then that file will be read. - -Additionally, corpus reader functions can be given lists of item -names; in which case, they will return a concatenation of the -corresponding documents. - -Corpus reader functions are named based on the type of information -they return. 
Some common examples, and their return types, are: - -- words(): list of str -- sents(): list of (list of str) -- paras(): list of (list of (list of str)) -- tagged_words(): list of (str,str) tuple -- tagged_sents(): list of (list of (str,str)) -- tagged_paras(): list of (list of (list of (str,str))) -- chunked_sents(): list of (Tree w/ (str,str) leaves) -- parsed_sents(): list of (Tree with str leaves) -- parsed_paras(): list of (list of (Tree with str leaves)) -- xml(): A single xml ElementTree -- raw(): unprocessed corpus contents - -For example, to read a list of the words in the Brown Corpus, use -``nltk.corpus.brown.words()``: - - >>> from nltk.corpus import brown - >>> print(", ".join(brown.words())) # doctest: +ELLIPSIS - The, Fulton, County, Grand, Jury, said, ... - -""" - -import re - -from nltk.corpus.reader import * -from nltk.corpus.util import LazyCorpusLoader -from nltk.tokenize import RegexpTokenizer - -abc: PlaintextCorpusReader = LazyCorpusLoader( - "abc", - PlaintextCorpusReader, - r"(?!\.).*\.txt", - encoding=[("science", "latin_1"), ("rural", "utf8")], -) -alpino: AlpinoCorpusReader = LazyCorpusLoader( - "alpino", AlpinoCorpusReader, tagset="alpino" -) -bcp47: BCP47CorpusReader = LazyCorpusLoader( - "bcp47", BCP47CorpusReader, r"(cldr|iana)/*" -) -brown: CategorizedTaggedCorpusReader = LazyCorpusLoader( - "brown", - CategorizedTaggedCorpusReader, - r"c[a-z]\d\d", - cat_file="cats.txt", - tagset="brown", - encoding="ascii", -) -cess_cat: BracketParseCorpusReader = LazyCorpusLoader( - "cess_cat", - BracketParseCorpusReader, - r"(?!\.).*\.tbf", - tagset="unknown", - encoding="ISO-8859-15", -) -cess_esp: BracketParseCorpusReader = LazyCorpusLoader( - "cess_esp", - BracketParseCorpusReader, - r"(?!\.).*\.tbf", - tagset="unknown", - encoding="ISO-8859-15", -) -cmudict: CMUDictCorpusReader = LazyCorpusLoader( - "cmudict", CMUDictCorpusReader, ["cmudict"] -) -comtrans: AlignedCorpusReader = LazyCorpusLoader( - "comtrans", AlignedCorpusReader, r"(?!\.).*\.txt" -) -comparative_sentences: ComparativeSentencesCorpusReader = LazyCorpusLoader( - "comparative_sentences", - ComparativeSentencesCorpusReader, - r"labeledSentences\.txt", - encoding="latin-1", -) -conll2000: ConllChunkCorpusReader = LazyCorpusLoader( - "conll2000", - ConllChunkCorpusReader, - ["train.txt", "test.txt"], - ("NP", "VP", "PP"), - tagset="wsj", - encoding="ascii", -) -conll2002: ConllChunkCorpusReader = LazyCorpusLoader( - "conll2002", - ConllChunkCorpusReader, - r".*\.(test|train).*", - ("LOC", "PER", "ORG", "MISC"), - encoding="utf-8", -) -conll2007: DependencyCorpusReader = LazyCorpusLoader( - "conll2007", - DependencyCorpusReader, - r".*\.(test|train).*", - encoding=[("eus", "ISO-8859-2"), ("esp", "utf8")], -) -crubadan: CrubadanCorpusReader = LazyCorpusLoader( - "crubadan", CrubadanCorpusReader, r".*\.txt" -) -dependency_treebank: DependencyCorpusReader = LazyCorpusLoader( - "dependency_treebank", DependencyCorpusReader, r".*\.dp", encoding="ascii" -) -extended_omw: CorpusReader = LazyCorpusLoader( - "extended_omw", CorpusReader, r".*/wn-[a-z\-]*\.tab", encoding="utf8" -) -floresta: BracketParseCorpusReader = LazyCorpusLoader( - "floresta", - BracketParseCorpusReader, - r"(?!\.).*\.ptb", - "#", - tagset="unknown", - encoding="ISO-8859-15", -) -framenet15: FramenetCorpusReader = LazyCorpusLoader( - "framenet_v15", - FramenetCorpusReader, - [ - "frRelation.xml", - "frameIndex.xml", - "fulltextIndex.xml", - "luIndex.xml", - "semTypes.xml", - ], -) -framenet: FramenetCorpusReader = LazyCorpusLoader( - 
"framenet_v17", - FramenetCorpusReader, - [ - "frRelation.xml", - "frameIndex.xml", - "fulltextIndex.xml", - "luIndex.xml", - "semTypes.xml", - ], -) -gazetteers: WordListCorpusReader = LazyCorpusLoader( - "gazetteers", WordListCorpusReader, r"(?!LICENSE|\.).*\.txt", encoding="ISO-8859-2" -) -genesis: PlaintextCorpusReader = LazyCorpusLoader( - "genesis", - PlaintextCorpusReader, - r"(?!\.).*\.txt", - encoding=[ - ("finnish|french|german", "latin_1"), - ("swedish", "cp865"), - (".*", "utf_8"), - ], -) -gutenberg: PlaintextCorpusReader = LazyCorpusLoader( - "gutenberg", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1" -) -ieer: IEERCorpusReader = LazyCorpusLoader("ieer", IEERCorpusReader, r"(?!README|\.).*") -inaugural: PlaintextCorpusReader = LazyCorpusLoader( - "inaugural", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="latin1" -) -# [XX] This should probably just use TaggedCorpusReader: -indian: IndianCorpusReader = LazyCorpusLoader( - "indian", IndianCorpusReader, r"(?!\.).*\.pos", tagset="unknown", encoding="utf8" -) - -jeita: ChasenCorpusReader = LazyCorpusLoader( - "jeita", ChasenCorpusReader, r".*\.chasen", encoding="utf-8" -) -knbc: KNBCorpusReader = LazyCorpusLoader( - "knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp" -) -lin_thesaurus: LinThesaurusCorpusReader = LazyCorpusLoader( - "lin_thesaurus", LinThesaurusCorpusReader, r".*\.lsp" -) -mac_morpho: MacMorphoCorpusReader = LazyCorpusLoader( - "mac_morpho", - MacMorphoCorpusReader, - r"(?!\.).*\.txt", - tagset="unknown", - encoding="latin-1", -) -machado: PortugueseCategorizedPlaintextCorpusReader = LazyCorpusLoader( - "machado", - PortugueseCategorizedPlaintextCorpusReader, - r"(?!\.).*\.txt", - cat_pattern=r"([a-z]*)/.*", - encoding="latin-1", -) -masc_tagged: CategorizedTaggedCorpusReader = LazyCorpusLoader( - "masc_tagged", - CategorizedTaggedCorpusReader, - r"(spoken|written)/.*\.txt", - cat_file="categories.txt", - tagset="wsj", - encoding="utf-8", - sep="_", -) -movie_reviews: CategorizedPlaintextCorpusReader = LazyCorpusLoader( - "movie_reviews", - CategorizedPlaintextCorpusReader, - r"(?!\.).*\.txt", - cat_pattern=r"(neg|pos)/.*", - encoding="ascii", -) -multext_east: MTECorpusReader = LazyCorpusLoader( - "mte_teip5", MTECorpusReader, r"(oana).*\.xml", encoding="utf-8" -) -names: WordListCorpusReader = LazyCorpusLoader( - "names", WordListCorpusReader, r"(?!\.).*\.txt", encoding="ascii" -) -nps_chat: NPSChatCorpusReader = LazyCorpusLoader( - "nps_chat", NPSChatCorpusReader, r"(?!README|\.).*\.xml", tagset="wsj" -) -opinion_lexicon: OpinionLexiconCorpusReader = LazyCorpusLoader( - "opinion_lexicon", - OpinionLexiconCorpusReader, - r"(\w+)\-words\.txt", - encoding="ISO-8859-2", -) -ppattach: PPAttachmentCorpusReader = LazyCorpusLoader( - "ppattach", PPAttachmentCorpusReader, ["training", "test", "devset"] -) -product_reviews_1: ReviewsCorpusReader = LazyCorpusLoader( - "product_reviews_1", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8" -) -product_reviews_2: ReviewsCorpusReader = LazyCorpusLoader( - "product_reviews_2", ReviewsCorpusReader, r"^(?!Readme).*\.txt", encoding="utf8" -) -pros_cons: ProsConsCorpusReader = LazyCorpusLoader( - "pros_cons", - ProsConsCorpusReader, - r"Integrated(Cons|Pros)\.txt", - cat_pattern=r"Integrated(Cons|Pros)\.txt", - encoding="ISO-8859-2", -) -ptb: CategorizedBracketParseCorpusReader = ( - LazyCorpusLoader( # Penn Treebank v3: WSJ and Brown portions - "ptb", - CategorizedBracketParseCorpusReader, - r"(WSJ/\d\d/WSJ_\d\d|BROWN/C[A-Z]/C[A-Z])\d\d.MRG", 
- cat_file="allcats.txt", - tagset="wsj", - ) -) -qc: StringCategoryCorpusReader = LazyCorpusLoader( - "qc", StringCategoryCorpusReader, ["train.txt", "test.txt"], encoding="ISO-8859-2" -) -reuters: CategorizedPlaintextCorpusReader = LazyCorpusLoader( - "reuters", - CategorizedPlaintextCorpusReader, - "(training|test).*", - cat_file="cats.txt", - encoding="ISO-8859-2", -) -rte: RTECorpusReader = LazyCorpusLoader("rte", RTECorpusReader, r"(?!\.).*\.xml") -senseval: SensevalCorpusReader = LazyCorpusLoader( - "senseval", SensevalCorpusReader, r"(?!\.).*\.pos" -) -sentence_polarity: CategorizedSentencesCorpusReader = LazyCorpusLoader( - "sentence_polarity", - CategorizedSentencesCorpusReader, - r"rt-polarity\.(neg|pos)", - cat_pattern=r"rt-polarity\.(neg|pos)", - encoding="utf-8", -) -sentiwordnet: SentiWordNetCorpusReader = LazyCorpusLoader( - "sentiwordnet", SentiWordNetCorpusReader, "SentiWordNet_3.0.0.txt", encoding="utf-8" -) -shakespeare: XMLCorpusReader = LazyCorpusLoader( - "shakespeare", XMLCorpusReader, r"(?!\.).*\.xml" -) -sinica_treebank: SinicaTreebankCorpusReader = LazyCorpusLoader( - "sinica_treebank", - SinicaTreebankCorpusReader, - ["parsed"], - tagset="unknown", - encoding="utf-8", -) -state_union: PlaintextCorpusReader = LazyCorpusLoader( - "state_union", PlaintextCorpusReader, r"(?!\.).*\.txt", encoding="ISO-8859-2" -) -stopwords: WordListCorpusReader = LazyCorpusLoader( - "stopwords", WordListCorpusReader, r"(?!README|\.).*", encoding="utf8" -) -subjectivity: CategorizedSentencesCorpusReader = LazyCorpusLoader( - "subjectivity", - CategorizedSentencesCorpusReader, - r"(quote.tok.gt9|plot.tok.gt9)\.5000", - cat_map={"quote.tok.gt9.5000": ["subj"], "plot.tok.gt9.5000": ["obj"]}, - encoding="latin-1", -) -swadesh: SwadeshCorpusReader = LazyCorpusLoader( - "swadesh", SwadeshCorpusReader, r"(?!README|\.).*", encoding="utf8" -) -swadesh110: PanlexSwadeshCorpusReader = LazyCorpusLoader( - "panlex_swadesh", PanlexSwadeshCorpusReader, r"swadesh110/.*\.txt", encoding="utf8" -) -swadesh207: PanlexSwadeshCorpusReader = LazyCorpusLoader( - "panlex_swadesh", PanlexSwadeshCorpusReader, r"swadesh207/.*\.txt", encoding="utf8" -) -switchboard: SwitchboardCorpusReader = LazyCorpusLoader( - "switchboard", SwitchboardCorpusReader, tagset="wsj" -) -timit: TimitCorpusReader = LazyCorpusLoader("timit", TimitCorpusReader) -timit_tagged: TimitTaggedCorpusReader = LazyCorpusLoader( - "timit", TimitTaggedCorpusReader, r".+\.tags", tagset="wsj", encoding="ascii" -) -toolbox: ToolboxCorpusReader = LazyCorpusLoader( - "toolbox", ToolboxCorpusReader, r"(?!.*(README|\.)).*\.(dic|txt)" -) -treebank: BracketParseCorpusReader = LazyCorpusLoader( - "treebank/combined", - BracketParseCorpusReader, - r"wsj_.*\.mrg", - tagset="wsj", - encoding="ascii", -) -treebank_chunk: ChunkedCorpusReader = LazyCorpusLoader( - "treebank/tagged", - ChunkedCorpusReader, - r"wsj_.*\.pos", - sent_tokenizer=RegexpTokenizer(r"(?<=/\.)\s*(?![^\[]*\])", gaps=True), - para_block_reader=tagged_treebank_para_block_reader, - tagset="wsj", - encoding="ascii", -) -treebank_raw: PlaintextCorpusReader = LazyCorpusLoader( - "treebank/raw", PlaintextCorpusReader, r"wsj_.*", encoding="ISO-8859-2" -) -twitter_samples: TwitterCorpusReader = LazyCorpusLoader( - "twitter_samples", TwitterCorpusReader, r".*\.json" -) -udhr: UdhrCorpusReader = LazyCorpusLoader("udhr", UdhrCorpusReader) -udhr2: PlaintextCorpusReader = LazyCorpusLoader( - "udhr2", PlaintextCorpusReader, r".*\.txt", encoding="utf8" -) -universal_treebanks: ConllCorpusReader = 
LazyCorpusLoader( - "universal_treebanks_v20", - ConllCorpusReader, - r".*\.conll", - columntypes=( - "ignore", - "words", - "ignore", - "ignore", - "pos", - "ignore", - "ignore", - "ignore", - "ignore", - "ignore", - ), -) -verbnet: VerbnetCorpusReader = LazyCorpusLoader( - "verbnet", VerbnetCorpusReader, r"(?!\.).*\.xml" -) -webtext: PlaintextCorpusReader = LazyCorpusLoader( - "webtext", PlaintextCorpusReader, r"(?!README|\.).*\.txt", encoding="ISO-8859-2" -) -wordnet: WordNetCorpusReader = LazyCorpusLoader( - "wordnet", - WordNetCorpusReader, - LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"), -) -wordnet31: WordNetCorpusReader = LazyCorpusLoader( - "wordnet31", - WordNetCorpusReader, - LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"), -) -wordnet2021: WordNetCorpusReader = LazyCorpusLoader( - "wordnet2021", - WordNetCorpusReader, - LazyCorpusLoader("omw-1.4", CorpusReader, r".*/wn-data-.*\.tab", encoding="utf8"), -) -wordnet_ic: WordNetICCorpusReader = LazyCorpusLoader( - "wordnet_ic", WordNetICCorpusReader, r".*\.dat" -) -words: WordListCorpusReader = LazyCorpusLoader( - "words", WordListCorpusReader, r"(?!README|\.).*", encoding="ascii" -) - -# defined after treebank -propbank: PropbankCorpusReader = LazyCorpusLoader( - "propbank", - PropbankCorpusReader, - "prop.txt", - r"frames/.*\.xml", - "verbs.txt", - lambda filename: re.sub(r"^wsj/\d\d/", "", filename), - treebank, -) # Must be defined *after* treebank corpus. -nombank: NombankCorpusReader = LazyCorpusLoader( - "nombank.1.0", - NombankCorpusReader, - "nombank.1.0", - r"frames/.*\.xml", - "nombank.1.0.words", - lambda filename: re.sub(r"^wsj/\d\d/", "", filename), - treebank, -) # Must be defined *after* treebank corpus. -propbank_ptb: PropbankCorpusReader = LazyCorpusLoader( - "propbank", - PropbankCorpusReader, - "prop.txt", - r"frames/.*\.xml", - "verbs.txt", - lambda filename: filename.upper(), - ptb, -) # Must be defined *after* ptb corpus. -nombank_ptb: NombankCorpusReader = LazyCorpusLoader( - "nombank.1.0", - NombankCorpusReader, - "nombank.1.0", - r"frames/.*\.xml", - "nombank.1.0.words", - lambda filename: filename.upper(), - ptb, -) # Must be defined *after* ptb corpus. -semcor: SemcorCorpusReader = LazyCorpusLoader( - "semcor", SemcorCorpusReader, r"brown./tagfiles/br-.*\.xml", wordnet -) # Must be defined *after* wordnet corpus. 
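[Editorial aside, not part of the deleted file.] Every name declared above is a LazyCorpusLoader proxy, so importing this corpus package reads no corpus data; roughly speaking, the first real attribute access locates the data under nltk_data/ and replaces the proxy with an instance of the named reader class. That is also why propbank, nombank and semcor are declared after treebank, ptb and wordnet: their loaders take the earlier (still lazy) reader objects as constructor arguments. A minimal sketch of this lazy behaviour, assuming the upstream nltk package and the relevant corpus data packages are installed (names and sample output are illustrative):

from nltk.corpus import treebank   # a LazyCorpusLoader; no data is read at import time

# The first attribute access triggers the lazy load: the corpus root is found
# under nltk_data/ and a real BracketParseCorpusReader takes the proxy's place.
print(treebank.fileids()[:3])      # e.g. ['wsj_0001.mrg', 'wsj_0002.mrg', 'wsj_0003.mrg']
print(treebank.words()[:5])        # e.g. ['Pierre', 'Vinken', ',', '61', 'years']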
- -nonbreaking_prefixes: NonbreakingPrefixesCorpusReader = LazyCorpusLoader( - "nonbreaking_prefixes", - NonbreakingPrefixesCorpusReader, - r"(?!README|\.).*", - encoding="utf8", -) -perluniprops: UnicharsCorpusReader = LazyCorpusLoader( - "perluniprops", - UnicharsCorpusReader, - r"(?!README|\.).*", - nltk_data_subdir="misc", - encoding="utf8", -) - -# mwa_ppdb = LazyCorpusLoader( -# 'mwa_ppdb', MWAPPDBCorpusReader, r'(?!README|\.).*', nltk_data_subdir='misc', encoding='utf8') - -# See https://github.com/nltk/nltk/issues/1579 -# and https://github.com/nltk/nltk/issues/1716 -# -# pl196x = LazyCorpusLoader( -# 'pl196x', Pl196xCorpusReader, r'[a-z]-.*\.xml', -# cat_file='cats.txt', textid_file='textids.txt', encoding='utf8') -# -# ipipan = LazyCorpusLoader( -# 'ipipan', IPIPANCorpusReader, r'(?!\.).*morph\.xml') -# -# nkjp = LazyCorpusLoader( -# 'nkjp', NKJPCorpusReader, r'', encoding='utf8') -# -# panlex_lite = LazyCorpusLoader( -# 'panlex_lite', PanLexLiteCorpusReader) -# -# ycoe = LazyCorpusLoader( -# 'ycoe', YCOECorpusReader) -# -# corpus not available with NLTK; these lines caused help(nltk.corpus) to break -# hebrew_treebank = LazyCorpusLoader( -# 'hebrew_treebank', BracketParseCorpusReader, r'.*\.txt') - -# FIXME: override any imported demo from various corpora, see https://github.com/nltk/nltk/issues/2116 -def demo(): - # This is out-of-date: - abc.demo() - brown.demo() - # chat80.demo() - cmudict.demo() - conll2000.demo() - conll2002.demo() - genesis.demo() - gutenberg.demo() - ieer.demo() - inaugural.demo() - indian.demo() - names.demo() - ppattach.demo() - senseval.demo() - shakespeare.demo() - sinica_treebank.demo() - state_union.demo() - stopwords.demo() - timit.demo() - toolbox.demo() - treebank.demo() - udhr.demo() - webtext.demo() - words.demo() - - -# ycoe.demo() - -if __name__ == "__main__": - # demo() - pass diff --git a/pipeline/nltk/corpus/__pycache__/__init__.cpython-39.pyc b/pipeline/nltk/corpus/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index f041a5bbaa9cc017d68df306bd0a9dbcf77d1cb8..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/__pycache__/europarl_raw.cpython-39.pyc b/pipeline/nltk/corpus/__pycache__/europarl_raw.cpython-39.pyc deleted file mode 100644 index d002d571a92d2d44dee426f4048bfe8dfff72ab2..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/__pycache__/europarl_raw.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/__pycache__/util.cpython-39.pyc b/pipeline/nltk/corpus/__pycache__/util.cpython-39.pyc deleted file mode 100644 index 5a50220bfaca77c7a5793a0c6b9cf8befc033b5b..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/__pycache__/util.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/europarl_raw.py b/pipeline/nltk/corpus/europarl_raw.py deleted file mode 100644 index 2a32ecc86f7b7671445effc2801870c3fc10f295..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/europarl_raw.py +++ /dev/null @@ -1,56 +0,0 @@ -# Natural Language Toolkit: Europarl Corpus Readers -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Nitin Madnani -# URL: -# For license information, see LICENSE.TXT - -import re - -from nltk.corpus.reader import * -from nltk.corpus.util import LazyCorpusLoader - -# Create a new corpus reader instance for each European language -danish: EuroparlCorpusReader = LazyCorpusLoader( - "europarl_raw/danish", 
EuroparlCorpusReader, r"ep-.*\.da", encoding="utf-8" -) - -dutch: EuroparlCorpusReader = LazyCorpusLoader( - "europarl_raw/dutch", EuroparlCorpusReader, r"ep-.*\.nl", encoding="utf-8" -) - -english: EuroparlCorpusReader = LazyCorpusLoader( - "europarl_raw/english", EuroparlCorpusReader, r"ep-.*\.en", encoding="utf-8" -) - -finnish: EuroparlCorpusReader = LazyCorpusLoader( - "europarl_raw/finnish", EuroparlCorpusReader, r"ep-.*\.fi", encoding="utf-8" -) - -french: EuroparlCorpusReader = LazyCorpusLoader( - "europarl_raw/french", EuroparlCorpusReader, r"ep-.*\.fr", encoding="utf-8" -) - -german: EuroparlCorpusReader = LazyCorpusLoader( - "europarl_raw/german", EuroparlCorpusReader, r"ep-.*\.de", encoding="utf-8" -) - -greek: EuroparlCorpusReader = LazyCorpusLoader( - "europarl_raw/greek", EuroparlCorpusReader, r"ep-.*\.el", encoding="utf-8" -) - -italian: EuroparlCorpusReader = LazyCorpusLoader( - "europarl_raw/italian", EuroparlCorpusReader, r"ep-.*\.it", encoding="utf-8" -) - -portuguese: EuroparlCorpusReader = LazyCorpusLoader( - "europarl_raw/portuguese", EuroparlCorpusReader, r"ep-.*\.pt", encoding="utf-8" -) - -spanish: EuroparlCorpusReader = LazyCorpusLoader( - "europarl_raw/spanish", EuroparlCorpusReader, r"ep-.*\.es", encoding="utf-8" -) - -swedish: EuroparlCorpusReader = LazyCorpusLoader( - "europarl_raw/swedish", EuroparlCorpusReader, r"ep-.*\.sv", encoding="utf-8" -) diff --git a/pipeline/nltk/corpus/reader/__init__.py b/pipeline/nltk/corpus/reader/__init__.py deleted file mode 100644 index a5274f09dde2db30aa213800647e19a7d8201981..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/__init__.py +++ /dev/null @@ -1,186 +0,0 @@ -# Natural Language Toolkit: Corpus Readers -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# Edward Loper -# URL: -# For license information, see LICENSE.TXT - -""" -NLTK corpus readers. The modules in this package provide functions -that can be used to read corpus fileids in a variety of formats. These -functions can be used to read both the corpus fileids that are -distributed in the NLTK corpus package, and corpus fileids that are part -of external corpora. - -Corpus Reader Functions -======================= -Each corpus module defines one or more "corpus reader functions", -which can be used to read documents from that corpus. These functions -take an argument, ``item``, which is used to indicate which document -should be read from the corpus: - -- If ``item`` is one of the unique identifiers listed in the corpus - module's ``items`` variable, then the corresponding document will - be loaded from the NLTK corpus package. -- If ``item`` is a fileid, then that file will be read. - -Additionally, corpus reader functions can be given lists of item -names; in which case, they will return a concatenation of the -corresponding documents. - -Corpus reader functions are named based on the type of information -they return. 
Some common examples, and their return types, are: - -- words(): list of str -- sents(): list of (list of str) -- paras(): list of (list of (list of str)) -- tagged_words(): list of (str,str) tuple -- tagged_sents(): list of (list of (str,str)) -- tagged_paras(): list of (list of (list of (str,str))) -- chunked_sents(): list of (Tree w/ (str,str) leaves) -- parsed_sents(): list of (Tree with str leaves) -- parsed_paras(): list of (list of (Tree with str leaves)) -- xml(): A single xml ElementTree -- raw(): unprocessed corpus contents - -For example, to read a list of the words in the Brown Corpus, use -``nltk.corpus.brown.words()``: - - >>> from nltk.corpus import brown - >>> print(", ".join(brown.words()[:6])) # only first 6 words - The, Fulton, County, Grand, Jury, said - -isort:skip_file -""" - -from nltk.corpus.reader.plaintext import * -from nltk.corpus.reader.util import * -from nltk.corpus.reader.api import * -from nltk.corpus.reader.tagged import * -from nltk.corpus.reader.cmudict import * -from nltk.corpus.reader.conll import * -from nltk.corpus.reader.chunked import * -from nltk.corpus.reader.wordlist import * -from nltk.corpus.reader.xmldocs import * -from nltk.corpus.reader.ppattach import * -from nltk.corpus.reader.senseval import * -from nltk.corpus.reader.ieer import * -from nltk.corpus.reader.sinica_treebank import * -from nltk.corpus.reader.bracket_parse import * -from nltk.corpus.reader.indian import * -from nltk.corpus.reader.toolbox import * -from nltk.corpus.reader.timit import * -from nltk.corpus.reader.ycoe import * -from nltk.corpus.reader.rte import * -from nltk.corpus.reader.string_category import * -from nltk.corpus.reader.propbank import * -from nltk.corpus.reader.verbnet import * -from nltk.corpus.reader.bnc import * -from nltk.corpus.reader.nps_chat import * -from nltk.corpus.reader.wordnet import * -from nltk.corpus.reader.switchboard import * -from nltk.corpus.reader.dependency import * -from nltk.corpus.reader.nombank import * -from nltk.corpus.reader.ipipan import * -from nltk.corpus.reader.pl196x import * -from nltk.corpus.reader.knbc import * -from nltk.corpus.reader.chasen import * -from nltk.corpus.reader.childes import * -from nltk.corpus.reader.aligned import * -from nltk.corpus.reader.lin import * -from nltk.corpus.reader.semcor import * -from nltk.corpus.reader.framenet import * -from nltk.corpus.reader.udhr import * -from nltk.corpus.reader.bnc import * -from nltk.corpus.reader.sentiwordnet import * -from nltk.corpus.reader.twitter import * -from nltk.corpus.reader.nkjp import * -from nltk.corpus.reader.crubadan import * -from nltk.corpus.reader.mte import * -from nltk.corpus.reader.reviews import * -from nltk.corpus.reader.opinion_lexicon import * -from nltk.corpus.reader.pros_cons import * -from nltk.corpus.reader.categorized_sents import * -from nltk.corpus.reader.comparative_sents import * -from nltk.corpus.reader.panlex_lite import * -from nltk.corpus.reader.panlex_swadesh import * -from nltk.corpus.reader.bcp47 import * - -# Make sure that nltk.corpus.reader.bracket_parse gives the module, not -# the function bracket_parse() defined in nltk.tree: -from nltk.corpus.reader import bracket_parse - -__all__ = [ - "CorpusReader", - "CategorizedCorpusReader", - "PlaintextCorpusReader", - "find_corpus_fileids", - "TaggedCorpusReader", - "CMUDictCorpusReader", - "ConllChunkCorpusReader", - "WordListCorpusReader", - "PPAttachmentCorpusReader", - "SensevalCorpusReader", - "IEERCorpusReader", - "ChunkedCorpusReader", - "SinicaTreebankCorpusReader", - 
"BracketParseCorpusReader", - "IndianCorpusReader", - "ToolboxCorpusReader", - "TimitCorpusReader", - "YCOECorpusReader", - "MacMorphoCorpusReader", - "SyntaxCorpusReader", - "AlpinoCorpusReader", - "RTECorpusReader", - "StringCategoryCorpusReader", - "EuroparlCorpusReader", - "CategorizedBracketParseCorpusReader", - "CategorizedTaggedCorpusReader", - "CategorizedPlaintextCorpusReader", - "PortugueseCategorizedPlaintextCorpusReader", - "tagged_treebank_para_block_reader", - "PropbankCorpusReader", - "VerbnetCorpusReader", - "BNCCorpusReader", - "ConllCorpusReader", - "XMLCorpusReader", - "NPSChatCorpusReader", - "SwadeshCorpusReader", - "WordNetCorpusReader", - "WordNetICCorpusReader", - "SwitchboardCorpusReader", - "DependencyCorpusReader", - "NombankCorpusReader", - "IPIPANCorpusReader", - "Pl196xCorpusReader", - "TEICorpusView", - "KNBCorpusReader", - "ChasenCorpusReader", - "CHILDESCorpusReader", - "AlignedCorpusReader", - "TimitTaggedCorpusReader", - "LinThesaurusCorpusReader", - "SemcorCorpusReader", - "FramenetCorpusReader", - "UdhrCorpusReader", - "BNCCorpusReader", - "SentiWordNetCorpusReader", - "SentiSynset", - "TwitterCorpusReader", - "NKJPCorpusReader", - "CrubadanCorpusReader", - "MTECorpusReader", - "ReviewsCorpusReader", - "OpinionLexiconCorpusReader", - "ProsConsCorpusReader", - "CategorizedSentencesCorpusReader", - "ComparativeSentencesCorpusReader", - "PanLexLiteCorpusReader", - "NonbreakingPrefixesCorpusReader", - "UnicharsCorpusReader", - "MWAPPDBCorpusReader", - "PanlexSwadeshCorpusReader", - "BCP47CorpusReader", -] diff --git a/pipeline/nltk/corpus/reader/__pycache__/__init__.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index ac98fd7c9acfafaf204bbb82ac1f63454ad07b0f..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/aligned.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/aligned.cpython-39.pyc deleted file mode 100644 index 614e68877c1bd98790666b8d43320e8a80816ecc..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/aligned.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/api.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/api.cpython-39.pyc deleted file mode 100644 index 4f654596fa76f0b2f56e00ed7f3967836f1cca6d..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/api.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/bcp47.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/bcp47.cpython-39.pyc deleted file mode 100644 index b4aca8c7e3b25b6ffc094adb48bc17b427e488fe..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/bcp47.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/bnc.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/bnc.cpython-39.pyc deleted file mode 100644 index e99dfaea482c0c52da0c059e9f4f54d163e47fd4..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/bnc.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/bracket_parse.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/bracket_parse.cpython-39.pyc deleted file mode 100644 index 
c57966795fad97655ad15c6f42a1d1d8830b2d32..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/bracket_parse.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/categorized_sents.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/categorized_sents.cpython-39.pyc deleted file mode 100644 index dd374dbbc208caba05270a3cd7739cef8cce5f30..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/categorized_sents.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/chasen.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/chasen.cpython-39.pyc deleted file mode 100644 index 705627f99ad12d9520378306616b00b7a1a6437c..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/chasen.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/childes.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/childes.cpython-39.pyc deleted file mode 100644 index aeef4606d20b7d0068ef470eee0f63ff5c49d600..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/childes.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/chunked.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/chunked.cpython-39.pyc deleted file mode 100644 index 862473a89e9b4d5cc65809cfd11273002c7746a5..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/chunked.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/cmudict.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/cmudict.cpython-39.pyc deleted file mode 100644 index b11e13886f8c9971a172aa1080748ea628cea6ff..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/cmudict.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/comparative_sents.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/comparative_sents.cpython-39.pyc deleted file mode 100644 index a69ef17e4ed50ff73f79c1ee9348a13045541c56..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/comparative_sents.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/conll.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/conll.cpython-39.pyc deleted file mode 100644 index 29aa6110181292d5c31efca148ca2dbd4cd0dd7f..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/conll.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/crubadan.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/crubadan.cpython-39.pyc deleted file mode 100644 index cac15ed6273af9ec2a765e19f99696677f61502d..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/crubadan.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/dependency.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/dependency.cpython-39.pyc deleted file mode 100644 index e255094ff77cc64d4cc4dbed3166d69c5bb3e115..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/dependency.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/framenet.cpython-39.pyc 
b/pipeline/nltk/corpus/reader/__pycache__/framenet.cpython-39.pyc deleted file mode 100644 index 54635fe89f40e50729d6b2246cbc6b8ed46f8f6b..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/framenet.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/ieer.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/ieer.cpython-39.pyc deleted file mode 100644 index d30a62a0c8950c5abbfada7d44789dd450c9ef83..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/ieer.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/indian.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/indian.cpython-39.pyc deleted file mode 100644 index 13ec302bff545671072123224eb45064a189fcea..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/indian.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/ipipan.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/ipipan.cpython-39.pyc deleted file mode 100644 index 7065cf781009ec960717a5341cd65681ccd64def..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/ipipan.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/knbc.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/knbc.cpython-39.pyc deleted file mode 100644 index c1caa2298d486384cb5e21f9dd658416d6913a1b..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/knbc.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/lin.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/lin.cpython-39.pyc deleted file mode 100644 index 2d290f9c827727ae1c9941cab5fa4b6bd4e1cef0..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/lin.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/markdown.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/markdown.cpython-39.pyc deleted file mode 100644 index 72ca397777f803cb1804ff08a43c0468abb90828..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/markdown.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/mte.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/mte.cpython-39.pyc deleted file mode 100644 index 09be21f368f054b017cdbbb12ccc301349eb565a..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/mte.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/nkjp.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/nkjp.cpython-39.pyc deleted file mode 100644 index 68f5d35e61bf8dac39c39eef0912953204dbc1f3..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/nkjp.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/nombank.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/nombank.cpython-39.pyc deleted file mode 100644 index 286715f7dba2716bcc69bd0cd9405769b9d6f5fa..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/nombank.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/nps_chat.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/nps_chat.cpython-39.pyc 
deleted file mode 100644 index c4f4ccaf1ef2923d5dd5bca0cdd5e40d9f8f800a..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/nps_chat.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/opinion_lexicon.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/opinion_lexicon.cpython-39.pyc deleted file mode 100644 index bdc61dc7a8f079f0110a706b7c080d4ed1d47c97..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/opinion_lexicon.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/panlex_lite.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/panlex_lite.cpython-39.pyc deleted file mode 100644 index 59388b42301cb9f88a26d670a761acc030997147..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/panlex_lite.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/panlex_swadesh.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/panlex_swadesh.cpython-39.pyc deleted file mode 100644 index 304a758cbd512ebf68dfef6c9216880bf69da6b5..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/panlex_swadesh.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/pl196x.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/pl196x.cpython-39.pyc deleted file mode 100644 index 10112b68af2930825b53c2b182cb149cea30e75b..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/pl196x.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/plaintext.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/plaintext.cpython-39.pyc deleted file mode 100644 index 592e6ed8655a28c9ede6aa98fdfd05de75793ed8..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/plaintext.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/ppattach.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/ppattach.cpython-39.pyc deleted file mode 100644 index 297cd44f67a50c64a863dd1c009f8e5c2274e01c..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/ppattach.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/propbank.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/propbank.cpython-39.pyc deleted file mode 100644 index 37f120347ca51ffceb66922fd8650e2f54ef81bd..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/propbank.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/pros_cons.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/pros_cons.cpython-39.pyc deleted file mode 100644 index 3be2bbbc0e893d89d91a71247d21e67b09510051..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/pros_cons.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/reviews.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/reviews.cpython-39.pyc deleted file mode 100644 index 83ede1dbc502f30e9602ca12c58118b0b91e9cd4..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/reviews.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/rte.cpython-39.pyc 
b/pipeline/nltk/corpus/reader/__pycache__/rte.cpython-39.pyc deleted file mode 100644 index aa983e5a32ba30308615170fb0adcd1338e5db2f..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/rte.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/semcor.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/semcor.cpython-39.pyc deleted file mode 100644 index b0ba0e4e2dbd3eb558fc3084aa032256ef1c171a..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/semcor.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/senseval.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/senseval.cpython-39.pyc deleted file mode 100644 index 6b006b41827d484fa581a9c689b33d3c88e9c65c..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/senseval.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/sentiwordnet.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/sentiwordnet.cpython-39.pyc deleted file mode 100644 index 5401ff1dcb8162d985929845f77d822bacc11ab4..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/sentiwordnet.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/sinica_treebank.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/sinica_treebank.cpython-39.pyc deleted file mode 100644 index fbf95db1acfc4cbfe1a9b090531a31846dadb20f..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/sinica_treebank.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/string_category.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/string_category.cpython-39.pyc deleted file mode 100644 index 2fe0f56d5e952c0a63c11bad1a37662924f78142..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/string_category.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/switchboard.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/switchboard.cpython-39.pyc deleted file mode 100644 index ad7d6ae1dfad4af7f38a78b91458513644eb06ab..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/switchboard.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/tagged.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/tagged.cpython-39.pyc deleted file mode 100644 index 14156faa5c44375847d16746b50c5ad24dacf0ad..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/tagged.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/timit.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/timit.cpython-39.pyc deleted file mode 100644 index 2f1b65851464d77a664f5f9428e6067c8da8e14a..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/timit.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/toolbox.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/toolbox.cpython-39.pyc deleted file mode 100644 index f27dc8816353c270cf3b7bab3a359b83e973aa6e..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/toolbox.cpython-39.pyc and /dev/null differ diff --git 
a/pipeline/nltk/corpus/reader/__pycache__/twitter.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/twitter.cpython-39.pyc deleted file mode 100644 index 2a5939e04662760fd86939ae5813998148e3bfc6..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/twitter.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/udhr.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/udhr.cpython-39.pyc deleted file mode 100644 index 3640f1e7b8d55718a347d95ae1135b24a216c90a..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/udhr.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/util.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/util.cpython-39.pyc deleted file mode 100644 index d4cf9b1aaf81f2c8b30a87a432baa7173f5d586e..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/util.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/verbnet.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/verbnet.cpython-39.pyc deleted file mode 100644 index 350538a57a8918bd14d841b58c9374b17bf83cfb..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/verbnet.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/wordlist.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/wordlist.cpython-39.pyc deleted file mode 100644 index 5fcebd0e683c77a165bfbe2d41c4888b5e624ed2..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/wordlist.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/wordnet.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/wordnet.cpython-39.pyc deleted file mode 100644 index b68597931e6061a0afc1367abdcad89f8f83102c..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/wordnet.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/xmldocs.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/xmldocs.cpython-39.pyc deleted file mode 100644 index c15acd3a4efd9814608a5018cf7939cf2c69c9ee..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/xmldocs.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/__pycache__/ycoe.cpython-39.pyc b/pipeline/nltk/corpus/reader/__pycache__/ycoe.cpython-39.pyc deleted file mode 100644 index 6f106dacbbd295069c1789ebac23f6ef4ab71efe..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/corpus/reader/__pycache__/ycoe.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/corpus/reader/aligned.py b/pipeline/nltk/corpus/reader/aligned.py deleted file mode 100644 index 93caf6233b5d1ee4d66eff0009a0d73fceb67904..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/aligned.py +++ /dev/null @@ -1,154 +0,0 @@ -# Natural Language Toolkit: Aligned Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# URL: -# Author: Steven Bird -# For license information, see LICENSE.TXT - -from nltk.corpus.reader.api import CorpusReader -from nltk.corpus.reader.util import ( - StreamBackedCorpusView, - concat, - read_alignedsent_block, -) -from nltk.tokenize import RegexpTokenizer, WhitespaceTokenizer -from nltk.translate import AlignedSent, Alignment - - -class 
AlignedCorpusReader(CorpusReader): - """ - Reader for corpora of word-aligned sentences. Tokens are assumed - to be separated by whitespace. Sentences begin on separate lines. - """ - - def __init__( - self, - root, - fileids, - sep="/", - word_tokenizer=WhitespaceTokenizer(), - sent_tokenizer=RegexpTokenizer("\n", gaps=True), - alignedsent_block_reader=read_alignedsent_block, - encoding="latin1", - ): - """ - Construct a new Aligned Corpus reader for a set of documents - located at the given root directory. Example usage: - - >>> root = '/...path to corpus.../' - >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP - - :param root: The root directory for this corpus. - :param fileids: A list or regexp specifying the fileids in this corpus. - """ - CorpusReader.__init__(self, root, fileids, encoding) - self._sep = sep - self._word_tokenizer = word_tokenizer - self._sent_tokenizer = sent_tokenizer - self._alignedsent_block_reader = alignedsent_block_reader - - def words(self, fileids=None): - """ - :return: the given file(s) as a list of words - and punctuation symbols. - :rtype: list(str) - """ - return concat( - [ - AlignedSentCorpusView( - fileid, - enc, - False, - False, - self._word_tokenizer, - self._sent_tokenizer, - self._alignedsent_block_reader, - ) - for (fileid, enc) in self.abspaths(fileids, True) - ] - ) - - def sents(self, fileids=None): - """ - :return: the given file(s) as a list of - sentences or utterances, each encoded as a list of word - strings. - :rtype: list(list(str)) - """ - return concat( - [ - AlignedSentCorpusView( - fileid, - enc, - False, - True, - self._word_tokenizer, - self._sent_tokenizer, - self._alignedsent_block_reader, - ) - for (fileid, enc) in self.abspaths(fileids, True) - ] - ) - - def aligned_sents(self, fileids=None): - """ - :return: the given file(s) as a list of AlignedSent objects. - :rtype: list(AlignedSent) - """ - return concat( - [ - AlignedSentCorpusView( - fileid, - enc, - True, - True, - self._word_tokenizer, - self._sent_tokenizer, - self._alignedsent_block_reader, - ) - for (fileid, enc) in self.abspaths(fileids, True) - ] - ) - - -class AlignedSentCorpusView(StreamBackedCorpusView): - """ - A specialized corpus view for aligned sentences. - ``AlignedSentCorpusView`` objects are typically created by - ``AlignedCorpusReader`` (not directly by nltk users). 
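The aligned-corpus classes removed above mirror upstream NLTK. As a hedged sketch of how that reader family is used there (not part of this diff), NLTK's bundled comtrans sample is loaded through AlignedCorpusReader and yields AlignedSent objects:

# assumes the upstream nltk package plus nltk.download("comtrans")
from nltk.corpus import comtrans

als = comtrans.aligned_sents()[0]
print(als.words)       # source-language tokens
print(als.mots)        # target-language tokens
print(als.alignment)   # word-alignment pairs, e.g. 0-0 1-1 ...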
- """ - - def __init__( - self, - corpus_file, - encoding, - aligned, - group_by_sent, - word_tokenizer, - sent_tokenizer, - alignedsent_block_reader, - ): - self._aligned = aligned - self._group_by_sent = group_by_sent - self._word_tokenizer = word_tokenizer - self._sent_tokenizer = sent_tokenizer - self._alignedsent_block_reader = alignedsent_block_reader - StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding) - - def read_block(self, stream): - block = [ - self._word_tokenizer.tokenize(sent_str) - for alignedsent_str in self._alignedsent_block_reader(stream) - for sent_str in self._sent_tokenizer.tokenize(alignedsent_str) - ] - if self._aligned: - block[2] = Alignment.fromstring( - " ".join(block[2]) - ) # kludge; we shouldn't have tokenized the alignment string - block = [AlignedSent(*block)] - elif self._group_by_sent: - block = [block[0]] - else: - block = block[0] - - return block diff --git a/pipeline/nltk/corpus/reader/api.py b/pipeline/nltk/corpus/reader/api.py deleted file mode 100644 index cbe80d902ff8daa5b2a94fcccbc5b050f2d36324..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/api.py +++ /dev/null @@ -1,516 +0,0 @@ -# Natural Language Toolkit: API for Corpus Readers -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# Edward Loper -# URL: -# For license information, see LICENSE.TXT - -""" -API for corpus readers. -""" - -import os -import re -from collections import defaultdict -from itertools import chain - -from nltk.corpus.reader.util import * -from nltk.data import FileSystemPathPointer, PathPointer, ZipFilePathPointer - - -class CorpusReader: - """ - A base class for "corpus reader" classes, each of which can be - used to read a specific corpus format. Each individual corpus - reader instance is used to read a specific corpus, consisting of - one or more files under a common root directory. Each file is - identified by its ``file identifier``, which is the relative path - to the file from the root directory. - - A separate subclass is defined for each corpus format. These - subclasses define one or more methods that provide 'views' on the - corpus contents, such as ``words()`` (for a list of words) and - ``parsed_sents()`` (for a list of parsed sentences). Called with - no arguments, these methods will return the contents of the entire - corpus. For most corpora, these methods define one or more - selection arguments, such as ``fileids`` or ``categories``, which can - be used to select which portion of the corpus should be returned. - """ - - def __init__(self, root, fileids, encoding="utf8", tagset=None): - """ - :type root: PathPointer or str - :param root: A path pointer identifying the root directory for - this corpus. If a string is specified, then it will be - converted to a ``PathPointer`` automatically. - :param fileids: A list of the files that make up this corpus. - This list can either be specified explicitly, as a list of - strings; or implicitly, as a regular expression over file - paths. The absolute path for each file will be constructed - by joining the reader's root to each file name. - :param encoding: The default unicode encoding for the files - that make up the corpus. The value of ``encoding`` can be any - of the following: - - - A string: ``encoding`` is the encoding name for all files. - - A dictionary: ``encoding[file_id]`` is the encoding - name for the file whose identifier is ``file_id``. 
If - ``file_id`` is not in ``encoding``, then the file - contents will be processed using non-unicode byte strings. - - A list: ``encoding`` should be a list of ``(regexp, encoding)`` - tuples. The encoding for a file whose identifier is ``file_id`` - will be the ``encoding`` value for the first tuple whose - ``regexp`` matches the ``file_id``. If no tuple's ``regexp`` - matches the ``file_id``, the file contents will be processed - using non-unicode byte strings. - - None: the file contents of all files will be - processed using non-unicode byte strings. - :param tagset: The name of the tagset used by this corpus, to be used - for normalizing or converting the POS tags returned by the - ``tagged_...()`` methods. - """ - # Convert the root to a path pointer, if necessary. - if isinstance(root, str) and not isinstance(root, PathPointer): - m = re.match(r"(.*\.zip)/?(.*)$|", root) - zipfile, zipentry = m.groups() - if zipfile: - root = ZipFilePathPointer(zipfile, zipentry) - else: - root = FileSystemPathPointer(root) - elif not isinstance(root, PathPointer): - raise TypeError("CorpusReader: expected a string or a PathPointer") - - # If `fileids` is a regexp, then expand it. - if isinstance(fileids, str): - fileids = find_corpus_fileids(root, fileids) - - self._fileids = fileids - """A list of the relative paths for the fileids that make up - this corpus.""" - - self._root = root - """The root directory for this corpus.""" - - self._readme = "README" - self._license = "LICENSE" - self._citation = "citation.bib" - - # If encoding was specified as a list of regexps, then convert - # it to a dictionary. - if isinstance(encoding, list): - encoding_dict = {} - for fileid in self._fileids: - for x in encoding: - (regexp, enc) = x - if re.match(regexp, fileid): - encoding_dict[fileid] = enc - break - encoding = encoding_dict - - self._encoding = encoding - """The default unicode encoding for the fileids that make up - this corpus. If ``encoding`` is None, then the file - contents are processed using byte strings.""" - self._tagset = tagset - - def __repr__(self): - if isinstance(self._root, ZipFilePathPointer): - path = f"{self._root.zipfile.filename}/{self._root.entry}" - else: - path = "%s" % self._root.path - return f"<{self.__class__.__name__} in {path!r}>" - - def ensure_loaded(self): - """ - Load this corpus (if it has not already been loaded). This is - used by LazyCorpusLoader as a simple method that can be used to - make sure a corpus is loaded -- e.g., in case a user wants to - do help(some_corpus). - """ - pass # no need to actually do anything. - - def readme(self): - """ - Return the contents of the corpus README file, if it exists. - """ - with self.open(self._readme) as f: - return f.read() - - def license(self): - """ - Return the contents of the corpus LICENSE file, if it exists. - """ - with self.open(self._license) as f: - return f.read() - - def citation(self): - """ - Return the contents of the corpus citation.bib file, if it exists. - """ - with self.open(self._citation) as f: - return f.read() - - def fileids(self): - """ - Return a list of file identifiers for the fileids that make up - this corpus. - """ - return self._fileids - - def abspath(self, fileid): - """ - Return the absolute path for the given file. - - :type fileid: str - :param fileid: The file identifier for the file whose path - should be returned. 
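For reference, a hedged sketch of the per-file encoding forms documented above, using upstream NLTK's PlaintextCorpusReader; the corpus root and patterns below are hypothetical:

from nltk.corpus.reader import PlaintextCorpusReader

reader = PlaintextCorpusReader(
    "/path/to/corpus",          # hypothetical root directory
    r".*\.txt",                 # fileids given as a regexp
    encoding=[
        (r"utf8/.*", "utf-8"),  # first tuple whose regexp matches wins
        (r".*", "latin1"),      # fallback for every other fileid
    ],
)
print(reader.fileids())
print(reader.encoding(reader.fileids()[0]))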
- :rtype: PathPointer - """ - return self._root.join(fileid) - - def abspaths(self, fileids=None, include_encoding=False, include_fileid=False): - """ - Return a list of the absolute paths for all fileids in this corpus; - or for the given list of fileids, if specified. - - :type fileids: None or str or list - :param fileids: Specifies the set of fileids for which paths should - be returned. Can be None, for all fileids; a list of - file identifiers, for a specified set of fileids; or a single - file identifier, for a single file. Note that the return - value is always a list of paths, even if ``fileids`` is a - single file identifier. - - :param include_encoding: If true, then return a list of - ``(path_pointer, encoding)`` tuples. - - :rtype: list(PathPointer) - """ - if fileids is None: - fileids = self._fileids - elif isinstance(fileids, str): - fileids = [fileids] - - paths = [self._root.join(f) for f in fileids] - - if include_encoding and include_fileid: - return list(zip(paths, [self.encoding(f) for f in fileids], fileids)) - elif include_fileid: - return list(zip(paths, fileids)) - elif include_encoding: - return list(zip(paths, [self.encoding(f) for f in fileids])) - else: - return paths - - def raw(self, fileids=None): - """ - :param fileids: A list specifying the fileids that should be used. - :return: the given file(s) as a single string. - :rtype: str - """ - if fileids is None: - fileids = self._fileids - elif isinstance(fileids, str): - fileids = [fileids] - contents = [] - for f in fileids: - with self.open(f) as fp: - contents.append(fp.read()) - return concat(contents) - - def open(self, file): - """ - Return an open stream that can be used to read the given file. - If the file's encoding is not None, then the stream will - automatically decode the file's contents into unicode. - - :param file: The file identifier of the file to read. - """ - encoding = self.encoding(file) - stream = self._root.join(file).open(encoding) - return stream - - def encoding(self, file): - """ - Return the unicode encoding for the given corpus file, if known. - If the encoding is unknown, or if the given file should be - processed using byte strings (str), then return None. - """ - if isinstance(self._encoding, dict): - return self._encoding.get(file) - else: - return self._encoding - - def _get_root(self): - return self._root - - root = property( - _get_root, - doc=""" - The directory where this corpus is stored. - - :type: PathPointer""", - ) - - -###################################################################### -# { Corpora containing categorized items -###################################################################### - - -class CategorizedCorpusReader: - """ - A mixin class used to aid in the implementation of corpus readers - for categorized corpora. This class defines the method - ``categories()``, which returns a list of the categories for the - corpus or for a specified set of fileids; and overrides ``fileids()`` - to take a ``categories`` argument, restricting the set of fileids to - be returned. - - Subclasses are expected to: - - - Call ``__init__()`` to set up the mapping. - - - Override all view methods to accept a ``categories`` parameter, - which can be used *instead* of the ``fileids`` parameter, to - select which fileids should be included in the returned view. 
- """ - - def __init__(self, kwargs): - """ - Initialize this mapping based on keyword arguments, as - follows: - - - cat_pattern: A regular expression pattern used to find the - category for each file identifier. The pattern will be - applied to each file identifier, and the first matching - group will be used as the category label for that file. - - - cat_map: A dictionary, mapping from file identifiers to - category labels. - - - cat_file: The name of a file that contains the mapping - from file identifiers to categories. The argument - ``cat_delimiter`` can be used to specify a delimiter. - - The corresponding argument will be deleted from ``kwargs``. If - more than one argument is specified, an exception will be - raised. - """ - self._f2c = None #: file-to-category mapping - self._c2f = None #: category-to-file mapping - - self._pattern = None #: regexp specifying the mapping - self._map = None #: dict specifying the mapping - self._file = None #: fileid of file containing the mapping - self._delimiter = None #: delimiter for ``self._file`` - - if "cat_pattern" in kwargs: - self._pattern = kwargs["cat_pattern"] - del kwargs["cat_pattern"] - elif "cat_map" in kwargs: - self._map = kwargs["cat_map"] - del kwargs["cat_map"] - elif "cat_file" in kwargs: - self._file = kwargs["cat_file"] - del kwargs["cat_file"] - if "cat_delimiter" in kwargs: - self._delimiter = kwargs["cat_delimiter"] - del kwargs["cat_delimiter"] - else: - raise ValueError( - "Expected keyword argument cat_pattern or " "cat_map or cat_file." - ) - - if "cat_pattern" in kwargs or "cat_map" in kwargs or "cat_file" in kwargs: - raise ValueError( - "Specify exactly one of: cat_pattern, " "cat_map, cat_file." - ) - - def _init(self): - self._f2c = defaultdict(set) - self._c2f = defaultdict(set) - - if self._pattern is not None: - for file_id in self._fileids: - category = re.match(self._pattern, file_id).group(1) - self._add(file_id, category) - - elif self._map is not None: - for (file_id, categories) in self._map.items(): - for category in categories: - self._add(file_id, category) - - elif self._file is not None: - with self.open(self._file) as f: - for line in f.readlines(): - line = line.strip() - file_id, categories = line.split(self._delimiter, 1) - if file_id not in self.fileids(): - raise ValueError( - "In category mapping file %s: %s " - "not found" % (self._file, file_id) - ) - for category in categories.split(self._delimiter): - self._add(file_id, category) - - def _add(self, file_id, category): - self._f2c[file_id].add(category) - self._c2f[category].add(file_id) - - def categories(self, fileids=None): - """ - Return a list of the categories that are defined for this corpus, - or for the file(s) if it is given. - """ - if self._f2c is None: - self._init() - if fileids is None: - return sorted(self._c2f) - if isinstance(fileids, str): - fileids = [fileids] - return sorted(set.union(*(self._f2c[d] for d in fileids))) - - def fileids(self, categories=None): - """ - Return a list of file identifiers for the files that make up - this corpus, or that make up the given category(s) if specified. 
- """ - if categories is None: - return super().fileids() - elif isinstance(categories, str): - if self._f2c is None: - self._init() - if categories in self._c2f: - return sorted(self._c2f[categories]) - else: - raise ValueError("Category %s not found" % categories) - else: - if self._f2c is None: - self._init() - return sorted(set.union(*(self._c2f[c] for c in categories))) - - def _resolve(self, fileids, categories): - if fileids is not None and categories is not None: - raise ValueError("Specify fileids or categories, not both") - if categories is not None: - return self.fileids(categories) - else: - return fileids - - def raw(self, fileids=None, categories=None): - return super().raw(self._resolve(fileids, categories)) - - def words(self, fileids=None, categories=None): - return super().words(self._resolve(fileids, categories)) - - def sents(self, fileids=None, categories=None): - return super().sents(self._resolve(fileids, categories)) - - def paras(self, fileids=None, categories=None): - return super().paras(self._resolve(fileids, categories)) - - -###################################################################### -# { Treebank readers -###################################################################### - -# [xx] is it worth it to factor this out? -class SyntaxCorpusReader(CorpusReader): - """ - An abstract base class for reading corpora consisting of - syntactically parsed text. Subclasses should define: - - - ``__init__``, which specifies the location of the corpus - and a method for detecting the sentence blocks in corpus files. - - ``_read_block``, which reads a block from the input stream. - - ``_word``, which takes a block and returns a list of list of words. - - ``_tag``, which takes a block and returns a list of list of tagged - words. - - ``_parse``, which takes a block and returns a list of parsed - sentences. 
- """ - - def _parse(self, s): - raise NotImplementedError() - - def _word(self, s): - raise NotImplementedError() - - def _tag(self, s): - raise NotImplementedError() - - def _read_block(self, stream): - raise NotImplementedError() - - def parsed_sents(self, fileids=None): - reader = self._read_parsed_sent_block - return concat( - [ - StreamBackedCorpusView(fileid, reader, encoding=enc) - for fileid, enc in self.abspaths(fileids, True) - ] - ) - - def tagged_sents(self, fileids=None, tagset=None): - def reader(stream): - return self._read_tagged_sent_block(stream, tagset) - - return concat( - [ - StreamBackedCorpusView(fileid, reader, encoding=enc) - for fileid, enc in self.abspaths(fileids, True) - ] - ) - - def sents(self, fileids=None): - reader = self._read_sent_block - return concat( - [ - StreamBackedCorpusView(fileid, reader, encoding=enc) - for fileid, enc in self.abspaths(fileids, True) - ] - ) - - def tagged_words(self, fileids=None, tagset=None): - def reader(stream): - return self._read_tagged_word_block(stream, tagset) - - return concat( - [ - StreamBackedCorpusView(fileid, reader, encoding=enc) - for fileid, enc in self.abspaths(fileids, True) - ] - ) - - def words(self, fileids=None): - return concat( - [ - StreamBackedCorpusView(fileid, self._read_word_block, encoding=enc) - for fileid, enc in self.abspaths(fileids, True) - ] - ) - - # ------------------------------------------------------------ - # { Block Readers - - def _read_word_block(self, stream): - return list(chain.from_iterable(self._read_sent_block(stream))) - - def _read_tagged_word_block(self, stream, tagset=None): - return list(chain.from_iterable(self._read_tagged_sent_block(stream, tagset))) - - def _read_sent_block(self, stream): - return list(filter(None, [self._word(t) for t in self._read_block(stream)])) - - def _read_tagged_sent_block(self, stream, tagset=None): - return list( - filter(None, [self._tag(t, tagset) for t in self._read_block(stream)]) - ) - - def _read_parsed_sent_block(self, stream): - return list(filter(None, [self._parse(t) for t in self._read_block(stream)])) - - # } End of Block Readers - # ------------------------------------------------------------ diff --git a/pipeline/nltk/corpus/reader/bcp47.py b/pipeline/nltk/corpus/reader/bcp47.py deleted file mode 100644 index 429f52a65034f6faee531430a4b1d08aabe20103..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/bcp47.py +++ /dev/null @@ -1,218 +0,0 @@ -# Natural Language Toolkit: BCP-47 language tags -# -# Copyright (C) 2022-2023 NLTK Project -# Author: Eric Kafe -# URL: -# For license information, see LICENSE.TXT - -import re -from warnings import warn -from xml.etree import ElementTree as et - -from nltk.corpus.reader import CorpusReader - - -class BCP47CorpusReader(CorpusReader): - """ - Parse BCP-47 composite language tags - - Supports all the main subtags, and the 'u-sd' extension: - - >>> from nltk.corpus import bcp47 - >>> bcp47.name('oc-gascon-u-sd-fr64') - 'Occitan (post 1500): Gascon: Pyrénées-Atlantiques' - - Can load a conversion table to Wikidata Q-codes: - >>> bcp47.load_wiki_q() - >>> bcp47.wiki_q['en-GI-spanglis'] - 'Q79388' - - """ - - def __init__(self, root, fileids): - """Read the BCP-47 database""" - super().__init__(root, fileids) - self.langcode = {} - with self.open("iana/language-subtag-registry.txt") as fp: - self.db = self.data_dict(fp.read().split("%%\n")) - with self.open("cldr/common-subdivisions-en.xml") as fp: - self.subdiv = self.subdiv_dict( - 
et.parse(fp).iterfind("localeDisplayNames/subdivisions/subdivision") - ) - self.morphology() - - def load_wiki_q(self): - """Load conversion table to Wikidata Q-codes (only if needed)""" - with self.open("cldr/tools-cldr-rdf-external-entityToCode.tsv") as fp: - self.wiki_q = self.wiki_dict(fp.read().strip().split("\n")[1:]) - - def wiki_dict(self, lines): - """Convert Wikidata list of Q-codes to a BCP-47 dictionary""" - return { - pair[1]: pair[0].split("/")[-1] - for pair in [line.strip().split("\t") for line in lines] - } - - def subdiv_dict(self, subdivs): - """Convert the CLDR subdivisions list to a dictionary""" - return {sub.attrib["type"]: sub.text for sub in subdivs} - - def morphology(self): - self.casing = { - "language": str.lower, - "extlang": str.lower, - "script": str.title, - "region": str.upper, - "variant": str.lower, - } - dig = "[0-9]" - low = "[a-z]" - up = "[A-Z]" - alnum = "[a-zA-Z0-9]" - self.format = { - "language": re.compile(f"{low*3}?"), - "extlang": re.compile(f"{low*3}"), - "script": re.compile(f"{up}{low*3}"), - "region": re.compile(f"({up*2})|({dig*3})"), - "variant": re.compile(f"{alnum*4}{(alnum+'?')*4}"), - "singleton": re.compile(f"{low}"), - } - - def data_dict(self, records): - """Convert the BCP-47 language subtag registry to a dictionary""" - self.version = records[0].replace("File-Date:", "").strip() - dic = {} - dic["deprecated"] = {} - for label in [ - "language", - "extlang", - "script", - "region", - "variant", - "redundant", - "grandfathered", - ]: - dic["deprecated"][label] = {} - for record in records[1:]: - fields = [field.split(": ") for field in record.strip().split("\n")] - typ = fields[0][1] - tag = fields[1][1] - if typ not in dic: - dic[typ] = {} - subfields = {} - for field in fields[2:]: - if len(field) == 2: - [key, val] = field - if key not in subfields: - subfields[key] = [val] - else: # multiple value - subfields[key].append(val) - else: # multiline field - subfields[key][-1] += " " + field[0].strip() - if ( - "Deprecated" not in record - and typ == "language" - and key == "Description" - ): - self.langcode[subfields[key][-1]] = tag - for key in subfields: - if len(subfields[key]) == 1: # single value - subfields[key] = subfields[key][0] - if "Deprecated" in record: - dic["deprecated"][typ][tag] = subfields - else: - dic[typ][tag] = subfields - return dic - - def val2str(self, val): - """Return only first value""" - if type(val) == list: - # val = "/".join(val) # Concatenate all values - val = val[0] - return val - - def lang2str(self, lg_record): - """Concatenate subtag values""" - name = f"{lg_record['language']}" - for label in ["extlang", "script", "region", "variant", "extension"]: - if label in lg_record: - name += f": {lg_record[label]}" - return name - - def parse_tag(self, tag): - """Convert a BCP-47 tag to a dictionary of labelled subtags""" - subtags = tag.split("-") - lang = {} - labels = ["language", "extlang", "script", "region", "variant", "variant"] - while subtags and labels: - subtag = subtags.pop(0) - found = False - while labels: - label = labels.pop(0) - subtag = self.casing[label](subtag) - if self.format[label].fullmatch(subtag): - if subtag in self.db[label]: - found = True - valstr = self.val2str(self.db[label][subtag]["Description"]) - if label == "variant" and label in lang: - lang[label] += ": " + valstr - else: - lang[label] = valstr - break - elif subtag in self.db["deprecated"][label]: - found = True - note = f"The {subtag!r} {label} code is deprecated" - if "Preferred-Value" in 
self.db["deprecated"][label][subtag]: - prefer = self.db["deprecated"][label][subtag][ - "Preferred-Value" - ] - note += f"', prefer '{self.val2str(prefer)}'" - lang[label] = self.val2str( - self.db["deprecated"][label][subtag]["Description"] - ) - warn(note) - break - if not found: - if subtag == "u" and subtags[0] == "sd": # CLDR regional subdivisions - sd = subtags[1] - if sd in self.subdiv: - ext = self.subdiv[sd] - else: - ext = f"" - else: # other extension subtags are not supported yet - ext = f"{subtag}{''.join(['-'+ext for ext in subtags])}".lower() - if not self.format["singleton"].fullmatch(subtag): - ext = f"" - warn(ext) - lang["extension"] = ext - subtags = [] - return lang - - def name(self, tag): - """ - Convert a BCP-47 tag to a colon-separated string of subtag names - - >>> from nltk.corpus import bcp47 - >>> bcp47.name('ca-Latn-ES-valencia') - 'Catalan: Latin: Spain: Valencian' - - """ - for label in ["redundant", "grandfathered"]: - val = None - if tag in self.db[label]: - val = f"{self.db[label][tag]['Description']}" - note = f"The {tag!r} code is {label}" - elif tag in self.db["deprecated"][label]: - val = f"{self.db['deprecated'][label][tag]['Description']}" - note = f"The {tag!r} code is {label} and deprecated" - if "Preferred-Value" in self.db["deprecated"][label][tag]: - prefer = self.db["deprecated"][label][tag]["Preferred-Value"] - note += f", prefer {self.val2str(prefer)!r}" - if val: - warn(note) - return val - try: - return self.lang2str(self.parse_tag(tag)) - except: - warn(f"Tag {tag!r} was not recognized") - return None diff --git a/pipeline/nltk/corpus/reader/bnc.py b/pipeline/nltk/corpus/reader/bnc.py deleted file mode 100644 index e7128bf843b5c24a59b10d8a0cf1f689592bae52..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/bnc.py +++ /dev/null @@ -1,265 +0,0 @@ -# Natural Language Toolkit: Plaintext Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# URL: -# For license information, see LICENSE.TXT - -"""Corpus reader for the XML version of the British National Corpus.""" - -from nltk.corpus.reader.util import concat -from nltk.corpus.reader.xmldocs import ElementTree, XMLCorpusReader, XMLCorpusView - - -class BNCCorpusReader(XMLCorpusReader): - r"""Corpus reader for the XML version of the British National Corpus. - - For access to the complete XML data structure, use the ``xml()`` - method. For access to simple word lists and tagged word lists, use - ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``. - - You can obtain the full version of the BNC corpus at - https://www.ota.ox.ac.uk/desc/2554 - - If you extracted the archive to a directory called `BNC`, then you can - instantiate the reader as:: - - BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml') - - """ - - def __init__(self, root, fileids, lazy=True): - XMLCorpusReader.__init__(self, root, fileids) - self._lazy = lazy - - def words(self, fileids=None, strip_space=True, stem=False): - """ - :return: the given file(s) as a list of words - and punctuation symbols. - :rtype: list(str) - - :param strip_space: If true, then strip trailing spaces from - word tokens. Otherwise, leave the spaces on the tokens. - :param stem: If true, then use word stems instead of word strings. 
- """ - return self._views(fileids, False, None, strip_space, stem) - - def tagged_words(self, fileids=None, c5=False, strip_space=True, stem=False): - """ - :return: the given file(s) as a list of tagged - words and punctuation symbols, encoded as tuples - ``(word,tag)``. - :rtype: list(tuple(str,str)) - - :param c5: If true, then the tags used will be the more detailed - c5 tags. Otherwise, the simplified tags will be used. - :param strip_space: If true, then strip trailing spaces from - word tokens. Otherwise, leave the spaces on the tokens. - :param stem: If true, then use word stems instead of word strings. - """ - tag = "c5" if c5 else "pos" - return self._views(fileids, False, tag, strip_space, stem) - - def sents(self, fileids=None, strip_space=True, stem=False): - """ - :return: the given file(s) as a list of - sentences or utterances, each encoded as a list of word - strings. - :rtype: list(list(str)) - - :param strip_space: If true, then strip trailing spaces from - word tokens. Otherwise, leave the spaces on the tokens. - :param stem: If true, then use word stems instead of word strings. - """ - return self._views(fileids, True, None, strip_space, stem) - - def tagged_sents(self, fileids=None, c5=False, strip_space=True, stem=False): - """ - :return: the given file(s) as a list of - sentences, each encoded as a list of ``(word,tag)`` tuples. - :rtype: list(list(tuple(str,str))) - - :param c5: If true, then the tags used will be the more detailed - c5 tags. Otherwise, the simplified tags will be used. - :param strip_space: If true, then strip trailing spaces from - word tokens. Otherwise, leave the spaces on the tokens. - :param stem: If true, then use word stems instead of word strings. - """ - tag = "c5" if c5 else "pos" - return self._views( - fileids, sent=True, tag=tag, strip_space=strip_space, stem=stem - ) - - def _views(self, fileids=None, sent=False, tag=False, strip_space=True, stem=False): - """A helper function that instantiates BNCWordViews or the list of words/sentences.""" - f = BNCWordView if self._lazy else self._words - return concat( - [ - f(fileid, sent, tag, strip_space, stem) - for fileid in self.abspaths(fileids) - ] - ) - - def _words(self, fileid, bracket_sent, tag, strip_space, stem): - """ - Helper used to implement the view methods -- returns a list of - words or a list of sentences, optionally tagged. - - :param fileid: The name of the underlying file. - :param bracket_sent: If true, include sentence bracketing. - :param tag: The name of the tagset to use, or None for no tags. - :param strip_space: If true, strip spaces from word tokens. - :param stem: If true, then substitute stems for words. - """ - result = [] - - xmldoc = ElementTree.parse(fileid).getroot() - for xmlsent in xmldoc.findall(".//s"): - sent = [] - for xmlword in _all_xmlwords_in(xmlsent): - word = xmlword.text - if not word: - word = "" # fixes issue 337? 
- if strip_space or stem: - word = word.strip() - if stem: - word = xmlword.get("hw", word) - if tag == "c5": - word = (word, xmlword.get("c5")) - elif tag == "pos": - word = (word, xmlword.get("pos", xmlword.get("c5"))) - sent.append(word) - if bracket_sent: - result.append(BNCSentence(xmlsent.attrib["n"], sent)) - else: - result.extend(sent) - - assert None not in result - return result - - -def _all_xmlwords_in(elt, result=None): - if result is None: - result = [] - for child in elt: - if child.tag in ("c", "w"): - result.append(child) - else: - _all_xmlwords_in(child, result) - return result - - -class BNCSentence(list): - """ - A list of words, augmented by an attribute ``num`` used to record - the sentence identifier (the ``n`` attribute from the XML). - """ - - def __init__(self, num, items): - self.num = num - list.__init__(self, items) - - -class BNCWordView(XMLCorpusView): - """ - A stream backed corpus view specialized for use with the BNC corpus. - """ - - tags_to_ignore = { - "pb", - "gap", - "vocal", - "event", - "unclear", - "shift", - "pause", - "align", - } - """These tags are ignored. For their description refer to the - technical documentation, for example, - http://www.natcorp.ox.ac.uk/docs/URG/ref-vocal.html - - """ - - def __init__(self, fileid, sent, tag, strip_space, stem): - """ - :param fileid: The name of the underlying file. - :param sent: If true, include sentence bracketing. - :param tag: The name of the tagset to use, or None for no tags. - :param strip_space: If true, strip spaces from word tokens. - :param stem: If true, then substitute stems for words. - """ - if sent: - tagspec = ".*/s" - else: - tagspec = ".*/s/(.*/)?(c|w)" - self._sent = sent - self._tag = tag - self._strip_space = strip_space - self._stem = stem - - self.title = None #: Title of the document. - self.author = None #: Author of the document. - self.editor = None #: Editor - self.resps = None #: Statement of responsibility - - XMLCorpusView.__init__(self, fileid, tagspec) - - # Read in a tasty header. - self._open() - self.read_block(self._stream, ".*/teiHeader$", self.handle_header) - self.close() - - # Reset tag context. - self._tag_context = {0: ()} - - def handle_header(self, elt, context): - # Set up some metadata! - titles = elt.findall("titleStmt/title") - if titles: - self.title = "\n".join(title.text.strip() for title in titles) - - authors = elt.findall("titleStmt/author") - if authors: - self.author = "\n".join(author.text.strip() for author in authors) - - editors = elt.findall("titleStmt/editor") - if editors: - self.editor = "\n".join(editor.text.strip() for editor in editors) - - resps = elt.findall("titleStmt/respStmt") - if resps: - self.resps = "\n\n".join( - "\n".join(resp_elt.text.strip() for resp_elt in resp) for resp in resps - ) - - def handle_elt(self, elt, context): - if self._sent: - return self.handle_sent(elt) - else: - return self.handle_word(elt) - - def handle_word(self, elt): - word = elt.text - if not word: - word = "" # fixes issue 337? 
- if self._strip_space or self._stem: - word = word.strip() - if self._stem: - word = elt.get("hw", word) - if self._tag == "c5": - word = (word, elt.get("c5")) - elif self._tag == "pos": - word = (word, elt.get("pos", elt.get("c5"))) - return word - - def handle_sent(self, elt): - sent = [] - for child in elt: - if child.tag in ("mw", "hi", "corr", "trunc"): - sent += [self.handle_word(w) for w in child] - elif child.tag in ("w", "c"): - sent.append(self.handle_word(child)) - elif child.tag not in self.tags_to_ignore: - raise ValueError("Unexpected element %s" % child.tag) - return BNCSentence(elt.attrib["n"], sent) diff --git a/pipeline/nltk/corpus/reader/bracket_parse.py b/pipeline/nltk/corpus/reader/bracket_parse.py deleted file mode 100644 index c5d3ff67b94dcc6b476e7125c62bbe41e03603f1..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/bracket_parse.py +++ /dev/null @@ -1,237 +0,0 @@ -# Natural Language Toolkit: Penn Treebank Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# Edward Loper -# URL: -# For license information, see LICENSE.TXT -""" -Corpus reader for corpora that consist of parenthesis-delineated parse trees. -""" - -import sys - -from nltk.corpus.reader.api import * -from nltk.corpus.reader.util import * -from nltk.tag import map_tag -from nltk.tree import Tree - -# we use [^\s()]+ instead of \S+? to avoid matching () -SORTTAGWRD = re.compile(r"\((\d+) ([^\s()]+) ([^\s()]+)\)") -TAGWORD = re.compile(r"\(([^\s()]+) ([^\s()]+)\)") -WORD = re.compile(r"\([^\s()]+ ([^\s()]+)\)") -EMPTY_BRACKETS = re.compile(r"\s*\(\s*\(") - - -class BracketParseCorpusReader(SyntaxCorpusReader): - """ - Reader for corpora that consist of parenthesis-delineated parse trees, - like those found in the "combined" section of the Penn Treebank, - e.g. "(S (NP (DT the) (JJ little) (NN dog)) (VP (VBD barked)))". - - """ - - def __init__( - self, - root, - fileids, - comment_char=None, - detect_blocks="unindented_paren", - encoding="utf8", - tagset=None, - ): - """ - :param root: The root directory for this corpus. - :param fileids: A list or regexp specifying the fileids in this corpus. - :param comment_char: The character which can appear at the start of - a line to indicate that the rest of the line is a comment. - :param detect_blocks: The method that is used to find blocks - in the corpus; can be 'unindented_paren' (every unindented - parenthesis starts a new parse) or 'sexpr' (brackets are - matched). - :param tagset: The name of the tagset used by this corpus, to be used - for normalizing or converting the POS tags returned by the - ``tagged_...()`` methods. - """ - SyntaxCorpusReader.__init__(self, root, fileids, encoding) - self._comment_char = comment_char - self._detect_blocks = detect_blocks - self._tagset = tagset - - def _read_block(self, stream): - if self._detect_blocks == "sexpr": - return read_sexpr_block(stream, comment_char=self._comment_char) - elif self._detect_blocks == "blankline": - return read_blankline_block(stream) - elif self._detect_blocks == "unindented_paren": - # Tokens start with unindented left parens. - toks = read_regexp_block(stream, start_re=r"^\(") - # Strip any comments out of the tokens. - if self._comment_char: - toks = [ - re.sub("(?m)^%s.*" % re.escape(self._comment_char), "", tok) - for tok in toks - ] - return toks - else: - assert 0, "bad block type" - - def _normalize(self, t): - # Replace leaves of the form (!), (,), with (! 
!), (, ,) - t = re.sub(r"\((.)\)", r"(\1 \1)", t) - # Replace leaves of the form (tag word root) with (tag word) - t = re.sub(r"\(([^\s()]+) ([^\s()]+) [^\s()]+\)", r"(\1 \2)", t) - return t - - def _parse(self, t): - try: - tree = Tree.fromstring(self._normalize(t)) - # If there's an empty node at the top, strip it off - if tree.label() == "" and len(tree) == 1: - return tree[0] - else: - return tree - - except ValueError as e: - sys.stderr.write("Bad tree detected; trying to recover...\n") - # Try to recover, if we can: - if e.args == ("mismatched parens",): - for n in range(1, 5): - try: - v = Tree(self._normalize(t + ")" * n)) - sys.stderr.write( - " Recovered by adding %d close " "paren(s)\n" % n - ) - return v - except ValueError: - pass - # Try something else: - sys.stderr.write(" Recovered by returning a flat parse.\n") - # sys.stderr.write(' '.join(t.split())+'\n') - return Tree("S", self._tag(t)) - - def _tag(self, t, tagset=None): - tagged_sent = [(w, p) for (p, w) in TAGWORD.findall(self._normalize(t))] - if tagset and tagset != self._tagset: - tagged_sent = [ - (w, map_tag(self._tagset, tagset, p)) for (w, p) in tagged_sent - ] - return tagged_sent - - def _word(self, t): - return WORD.findall(self._normalize(t)) - - -class CategorizedBracketParseCorpusReader( - CategorizedCorpusReader, BracketParseCorpusReader -): - """ - A reader for parsed corpora whose documents are - divided into categories based on their file identifiers. - @author: Nathan Schneider - """ - - def __init__(self, *args, **kwargs): - """ - Initialize the corpus reader. Categorization arguments - (C{cat_pattern}, C{cat_map}, and C{cat_file}) are passed to - the L{CategorizedCorpusReader constructor - }. The remaining arguments - are passed to the L{BracketParseCorpusReader constructor - }. - """ - CategorizedCorpusReader.__init__(self, kwargs) - BracketParseCorpusReader.__init__(self, *args, **kwargs) - - def tagged_words(self, fileids=None, categories=None, tagset=None): - return super().tagged_words(self._resolve(fileids, categories), tagset) - - def tagged_sents(self, fileids=None, categories=None, tagset=None): - return super().tagged_sents(self._resolve(fileids, categories), tagset) - - def tagged_paras(self, fileids=None, categories=None, tagset=None): - return super().tagged_paras(self._resolve(fileids, categories), tagset) - - def parsed_words(self, fileids=None, categories=None): - return super().parsed_words(self._resolve(fileids, categories)) - - def parsed_sents(self, fileids=None, categories=None): - return super().parsed_sents(self._resolve(fileids, categories)) - - def parsed_paras(self, fileids=None, categories=None): - return super().parsed_paras(self._resolve(fileids, categories)) - - -class AlpinoCorpusReader(BracketParseCorpusReader): - """ - Reader for the Alpino Dutch Treebank. - This corpus has a lexical breakdown structure embedded, as read by `_parse` - Unfortunately this puts punctuation and some other words out of the sentence - order in the xml element tree. This is no good for `tag_` and `word_` - `_tag` and `_word` will be overridden to use a non-default new parameter 'ordered' - to the overridden _normalize function. The _parse function can then remain - untouched. - """ - - def __init__(self, root, encoding="ISO-8859-1", tagset=None): - BracketParseCorpusReader.__init__( - self, - root, - r"alpino\.xml", - detect_blocks="blankline", - encoding=encoding, - tagset=tagset, - ) - - def _normalize(self, t, ordered=False): - """Normalize the xml sentence element in t. 
- The sentence elements , although embedded in a few overall - xml elements, are separated by blank lines. That's how the reader can - deliver them one at a time. - Each sentence has a few category subnodes that are of no use to us. - The remaining word nodes may or may not appear in the proper order. - Each word node has attributes, among which: - - begin : the position of the word in the sentence - - pos : Part of Speech: the Tag - - word : the actual word - The return value is a string with all xml elementes replaced by - clauses: either a cat clause with nested clauses, or a word clause. - The order of the bracket clauses closely follows the xml. - If ordered == True, the word clauses include an order sequence number. - If ordered == False, the word clauses only have pos and word parts. - """ - if t[:10] != "', r"(\1", t) - if ordered: - t = re.sub( - r' ', - r"(\1 \2 \3)", - t, - ) - else: - t = re.sub(r' ', r"(\1 \2)", t) - t = re.sub(r" ", r")", t) - t = re.sub(r".*", r"", t) - t = re.sub(r"", r"", t) - return t - - def _tag(self, t, tagset=None): - tagged_sent = [ - (int(o), w, p) - for (o, p, w) in SORTTAGWRD.findall(self._normalize(t, ordered=True)) - ] - tagged_sent.sort() - if tagset and tagset != self._tagset: - tagged_sent = [ - (w, map_tag(self._tagset, tagset, p)) for (o, w, p) in tagged_sent - ] - else: - tagged_sent = [(w, p) for (o, w, p) in tagged_sent] - return tagged_sent - - def _word(self, t): - """Return a correctly ordered list if words""" - tagged_sent = self._tag(t) - return [w for (w, p) in tagged_sent] diff --git a/pipeline/nltk/corpus/reader/categorized_sents.py b/pipeline/nltk/corpus/reader/categorized_sents.py deleted file mode 100644 index 92bfe47210e9db56aa1cde4fe27a41f4133909c1..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/categorized_sents.py +++ /dev/null @@ -1,168 +0,0 @@ -# Natural Language Toolkit: Categorized Sentences Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Pierpaolo Pantone <24alsecondo@gmail.com> -# URL: -# For license information, see LICENSE.TXT - -""" -CorpusReader structured for corpora that contain one instance on each row. -This CorpusReader is specifically used for the Subjectivity Dataset and the -Sentence Polarity Dataset. - -- Subjectivity Dataset information - - -Authors: Bo Pang and Lillian Lee. -Url: https://www.cs.cornell.edu/people/pabo/movie-review-data - -Distributed with permission. - -Related papers: - -- Bo Pang and Lillian Lee. "A Sentimental Education: Sentiment Analysis Using - Subjectivity Summarization Based on Minimum Cuts". Proceedings of the ACL, - 2004. - -- Sentence Polarity Dataset information - - -Authors: Bo Pang and Lillian Lee. -Url: https://www.cs.cornell.edu/people/pabo/movie-review-data - -Related papers: - -- Bo Pang and Lillian Lee. "Seeing stars: Exploiting class relationships for - sentiment categorization with respect to rating scales". Proceedings of the - ACL, 2005. -""" - -from nltk.corpus.reader.api import * -from nltk.tokenize import * - - -class CategorizedSentencesCorpusReader(CategorizedCorpusReader, CorpusReader): - """ - A reader for corpora in which each row represents a single instance, mainly - a sentence. Istances are divided into categories based on their file identifiers - (see CategorizedCorpusReader). - Since many corpora allow rows that contain more than one sentence, it is - possible to specify a sentence tokenizer to retrieve all sentences instead - than all rows. 
- - Examples using the Subjectivity Dataset: - - >>> from nltk.corpus import subjectivity - >>> subjectivity.sents()[23] # doctest: +NORMALIZE_WHITESPACE - ['television', 'made', 'him', 'famous', ',', 'but', 'his', 'biggest', 'hits', - 'happened', 'off', 'screen', '.'] - >>> subjectivity.categories() - ['obj', 'subj'] - >>> subjectivity.words(categories='subj') - ['smart', 'and', 'alert', ',', 'thirteen', ...] - - Examples using the Sentence Polarity Dataset: - - >>> from nltk.corpus import sentence_polarity - >>> sentence_polarity.sents() # doctest: +NORMALIZE_WHITESPACE - [['simplistic', ',', 'silly', 'and', 'tedious', '.'], ["it's", 'so', 'laddish', - 'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find', - 'it', 'funny', '.'], ...] - >>> sentence_polarity.categories() - ['neg', 'pos'] - """ - - CorpusView = StreamBackedCorpusView - - def __init__( - self, - root, - fileids, - word_tokenizer=WhitespaceTokenizer(), - sent_tokenizer=None, - encoding="utf8", - **kwargs - ): - """ - :param root: The root directory for the corpus. - :param fileids: a list or regexp specifying the fileids in the corpus. - :param word_tokenizer: a tokenizer for breaking sentences or paragraphs - into words. Default: `WhitespaceTokenizer` - :param sent_tokenizer: a tokenizer for breaking paragraphs into sentences. - :param encoding: the encoding that should be used to read the corpus. - :param kwargs: additional parameters passed to CategorizedCorpusReader. - """ - - CorpusReader.__init__(self, root, fileids, encoding) - CategorizedCorpusReader.__init__(self, kwargs) - self._word_tokenizer = word_tokenizer - self._sent_tokenizer = sent_tokenizer - - def sents(self, fileids=None, categories=None): - """ - Return all sentences in the corpus or in the specified file(s). - - :param fileids: a list or regexp specifying the ids of the files whose - sentences have to be returned. - :param categories: a list specifying the categories whose sentences have - to be returned. - :return: the given file(s) as a list of sentences. - Each sentence is tokenized using the specified word_tokenizer. - :rtype: list(list(str)) - """ - fileids = self._resolve(fileids, categories) - if fileids is None: - fileids = self._fileids - elif isinstance(fileids, str): - fileids = [fileids] - return concat( - [ - self.CorpusView(path, self._read_sent_block, encoding=enc) - for (path, enc, fileid) in self.abspaths(fileids, True, True) - ] - ) - - def words(self, fileids=None, categories=None): - """ - Return all words and punctuation symbols in the corpus or in the specified - file(s). - - :param fileids: a list or regexp specifying the ids of the files whose - words have to be returned. - :param categories: a list specifying the categories whose words have to - be returned. - :return: the given file(s) as a list of words and punctuation symbols. - :rtype: list(str) - """ - fileids = self._resolve(fileids, categories) - if fileids is None: - fileids = self._fileids - elif isinstance(fileids, str): - fileids = [fileids] - return concat( - [ - self.CorpusView(path, self._read_word_block, encoding=enc) - for (path, enc, fileid) in self.abspaths(fileids, True, True) - ] - ) - - def _read_sent_block(self, stream): - sents = [] - for i in range(20): # Read 20 lines at a time. 
- line = stream.readline() - if not line: - continue - if self._sent_tokenizer: - sents.extend( - [ - self._word_tokenizer.tokenize(sent) - for sent in self._sent_tokenizer.tokenize(line) - ] - ) - else: - sents.append(self._word_tokenizer.tokenize(line)) - return sents - - def _read_word_block(self, stream): - words = [] - for sent in self._read_sent_block(stream): - words.extend(sent) - return words diff --git a/pipeline/nltk/corpus/reader/chasen.py b/pipeline/nltk/corpus/reader/chasen.py deleted file mode 100644 index ef6ab8146619bbdc0f448f9771269ab7d3ee5451..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/chasen.py +++ /dev/null @@ -1,158 +0,0 @@ -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Masato Hagiwara -# URL: -# For license information, see LICENSE.TXT - -import sys - -from nltk.corpus.reader import util -from nltk.corpus.reader.api import * -from nltk.corpus.reader.util import * - - -class ChasenCorpusReader(CorpusReader): - def __init__(self, root, fileids, encoding="utf8", sent_splitter=None): - self._sent_splitter = sent_splitter - CorpusReader.__init__(self, root, fileids, encoding) - - def words(self, fileids=None): - return concat( - [ - ChasenCorpusView(fileid, enc, False, False, False, self._sent_splitter) - for (fileid, enc) in self.abspaths(fileids, True) - ] - ) - - def tagged_words(self, fileids=None): - return concat( - [ - ChasenCorpusView(fileid, enc, True, False, False, self._sent_splitter) - for (fileid, enc) in self.abspaths(fileids, True) - ] - ) - - def sents(self, fileids=None): - return concat( - [ - ChasenCorpusView(fileid, enc, False, True, False, self._sent_splitter) - for (fileid, enc) in self.abspaths(fileids, True) - ] - ) - - def tagged_sents(self, fileids=None): - return concat( - [ - ChasenCorpusView(fileid, enc, True, True, False, self._sent_splitter) - for (fileid, enc) in self.abspaths(fileids, True) - ] - ) - - def paras(self, fileids=None): - return concat( - [ - ChasenCorpusView(fileid, enc, False, True, True, self._sent_splitter) - for (fileid, enc) in self.abspaths(fileids, True) - ] - ) - - def tagged_paras(self, fileids=None): - return concat( - [ - ChasenCorpusView(fileid, enc, True, True, True, self._sent_splitter) - for (fileid, enc) in self.abspaths(fileids, True) - ] - ) - - -class ChasenCorpusView(StreamBackedCorpusView): - """ - A specialized corpus view for ChasenReader. Similar to ``TaggedCorpusView``, - but this'll use fixed sets of word and sentence tokenizer. 
- """ - - def __init__( - self, - corpus_file, - encoding, - tagged, - group_by_sent, - group_by_para, - sent_splitter=None, - ): - self._tagged = tagged - self._group_by_sent = group_by_sent - self._group_by_para = group_by_para - self._sent_splitter = sent_splitter - StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding) - - def read_block(self, stream): - """Reads one paragraph at a time.""" - block = [] - for para_str in read_regexp_block(stream, r".", r"^EOS\n"): - - para = [] - - sent = [] - for line in para_str.splitlines(): - - _eos = line.strip() == "EOS" - _cells = line.split("\t") - w = (_cells[0], "\t".join(_cells[1:])) - if not _eos: - sent.append(w) - - if _eos or (self._sent_splitter and self._sent_splitter(w)): - if not self._tagged: - sent = [w for (w, t) in sent] - if self._group_by_sent: - para.append(sent) - else: - para.extend(sent) - sent = [] - - if len(sent) > 0: - if not self._tagged: - sent = [w for (w, t) in sent] - - if self._group_by_sent: - para.append(sent) - else: - para.extend(sent) - - if self._group_by_para: - block.append(para) - else: - block.extend(para) - - return block - - -def demo(): - - import nltk - from nltk.corpus.util import LazyCorpusLoader - - jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8") - print("/".join(jeita.words()[22100:22140])) - - print( - "\nEOS\n".join( - "\n".join("{}/{}".format(w[0], w[1].split("\t")[2]) for w in sent) - for sent in jeita.tagged_sents()[2170:2173] - ) - ) - - -def test(): - - from nltk.corpus.util import LazyCorpusLoader - - jeita = LazyCorpusLoader("jeita", ChasenCorpusReader, r".*chasen", encoding="utf-8") - - assert isinstance(jeita.tagged_words()[0][1], str) - - -if __name__ == "__main__": - demo() - test() diff --git a/pipeline/nltk/corpus/reader/childes.py b/pipeline/nltk/corpus/reader/childes.py deleted file mode 100644 index 115ccfb927f7bb4d217670f0cd52a55d64563e9c..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/childes.py +++ /dev/null @@ -1,630 +0,0 @@ -# CHILDES XML Corpus Reader - -# Copyright (C) 2001-2023 NLTK Project -# Author: Tomonori Nagano -# Alexis Dimitriadis -# URL: -# For license information, see LICENSE.TXT - -""" -Corpus reader for the XML version of the CHILDES corpus. -""" - -__docformat__ = "epytext en" - -import re -from collections import defaultdict - -from nltk.corpus.reader.util import concat -from nltk.corpus.reader.xmldocs import ElementTree, XMLCorpusReader -from nltk.util import LazyConcatenation, LazyMap, flatten - -# to resolve the namespace issue -NS = "http://www.talkbank.org/ns/talkbank" - - -class CHILDESCorpusReader(XMLCorpusReader): - """ - Corpus reader for the XML version of the CHILDES corpus. - The CHILDES corpus is available at ``https://childes.talkbank.org/``. The XML - version of CHILDES is located at ``https://childes.talkbank.org/data-xml/``. - Copy the needed parts of the CHILDES XML corpus into the NLTK data directory - (``nltk_data/corpora/CHILDES/``). - - For access to the file text use the usual nltk functions, - ``words()``, ``sents()``, ``tagged_words()`` and ``tagged_sents()``. 
- """ - - def __init__(self, root, fileids, lazy=True): - XMLCorpusReader.__init__(self, root, fileids) - self._lazy = lazy - - def words( - self, - fileids=None, - speaker="ALL", - stem=False, - relation=False, - strip_space=True, - replace=False, - ): - """ - :return: the given file(s) as a list of words - :rtype: list(str) - - :param speaker: If specified, select specific speaker(s) defined - in the corpus. Default is 'ALL' (all participants). Common choices - are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude - researchers) - :param stem: If true, then use word stems instead of word strings. - :param relation: If true, then return tuples of (stem, index, - dependent_index) - :param strip_space: If true, then strip trailing spaces from word - tokens. Otherwise, leave the spaces on the tokens. - :param replace: If true, then use the replaced (intended) word instead - of the original word (e.g., 'wat' will be replaced with 'watch') - """ - sent = None - pos = False - if not self._lazy: - return [ - self._get_words( - fileid, speaker, sent, stem, relation, pos, strip_space, replace - ) - for fileid in self.abspaths(fileids) - ] - - get_words = lambda fileid: self._get_words( - fileid, speaker, sent, stem, relation, pos, strip_space, replace - ) - return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids))) - - def tagged_words( - self, - fileids=None, - speaker="ALL", - stem=False, - relation=False, - strip_space=True, - replace=False, - ): - """ - :return: the given file(s) as a list of tagged - words and punctuation symbols, encoded as tuples - ``(word,tag)``. - :rtype: list(tuple(str,str)) - - :param speaker: If specified, select specific speaker(s) defined - in the corpus. Default is 'ALL' (all participants). Common choices - are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude - researchers) - :param stem: If true, then use word stems instead of word strings. - :param relation: If true, then return tuples of (stem, index, - dependent_index) - :param strip_space: If true, then strip trailing spaces from word - tokens. Otherwise, leave the spaces on the tokens. - :param replace: If true, then use the replaced (intended) word instead - of the original word (e.g., 'wat' will be replaced with 'watch') - """ - sent = None - pos = True - if not self._lazy: - return [ - self._get_words( - fileid, speaker, sent, stem, relation, pos, strip_space, replace - ) - for fileid in self.abspaths(fileids) - ] - - get_words = lambda fileid: self._get_words( - fileid, speaker, sent, stem, relation, pos, strip_space, replace - ) - return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids))) - - def sents( - self, - fileids=None, - speaker="ALL", - stem=False, - relation=None, - strip_space=True, - replace=False, - ): - """ - :return: the given file(s) as a list of sentences or utterances, each - encoded as a list of word strings. - :rtype: list(list(str)) - - :param speaker: If specified, select specific speaker(s) defined - in the corpus. Default is 'ALL' (all participants). Common choices - are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude - researchers) - :param stem: If true, then use word stems instead of word strings. - :param relation: If true, then return tuples of ``(str,pos,relation_list)``. - If there is manually-annotated relation info, it will return - tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)`` - :param strip_space: If true, then strip trailing spaces from word - tokens. Otherwise, leave the spaces on the tokens. 
- :param replace: If true, then use the replaced (intended) word instead - of the original word (e.g., 'wat' will be replaced with 'watch') - """ - sent = True - pos = False - if not self._lazy: - return [ - self._get_words( - fileid, speaker, sent, stem, relation, pos, strip_space, replace - ) - for fileid in self.abspaths(fileids) - ] - - get_words = lambda fileid: self._get_words( - fileid, speaker, sent, stem, relation, pos, strip_space, replace - ) - return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids))) - - def tagged_sents( - self, - fileids=None, - speaker="ALL", - stem=False, - relation=None, - strip_space=True, - replace=False, - ): - """ - :return: the given file(s) as a list of - sentences, each encoded as a list of ``(word,tag)`` tuples. - :rtype: list(list(tuple(str,str))) - - :param speaker: If specified, select specific speaker(s) defined - in the corpus. Default is 'ALL' (all participants). Common choices - are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude - researchers) - :param stem: If true, then use word stems instead of word strings. - :param relation: If true, then return tuples of ``(str,pos,relation_list)``. - If there is manually-annotated relation info, it will return - tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)`` - :param strip_space: If true, then strip trailing spaces from word - tokens. Otherwise, leave the spaces on the tokens. - :param replace: If true, then use the replaced (intended) word instead - of the original word (e.g., 'wat' will be replaced with 'watch') - """ - sent = True - pos = True - if not self._lazy: - return [ - self._get_words( - fileid, speaker, sent, stem, relation, pos, strip_space, replace - ) - for fileid in self.abspaths(fileids) - ] - - get_words = lambda fileid: self._get_words( - fileid, speaker, sent, stem, relation, pos, strip_space, replace - ) - return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids))) - - def corpus(self, fileids=None): - """ - :return: the given file(s) as a dict of ``(corpus_property_key, value)`` - :rtype: list(dict) - """ - if not self._lazy: - return [self._get_corpus(fileid) for fileid in self.abspaths(fileids)] - return LazyMap(self._get_corpus, self.abspaths(fileids)) - - def _get_corpus(self, fileid): - results = dict() - xmldoc = ElementTree.parse(fileid).getroot() - for key, value in xmldoc.items(): - results[key] = value - return results - - def participants(self, fileids=None): - """ - :return: the given file(s) as a dict of - ``(participant_property_key, value)`` - :rtype: list(dict) - """ - if not self._lazy: - return [self._get_participants(fileid) for fileid in self.abspaths(fileids)] - return LazyMap(self._get_participants, self.abspaths(fileids)) - - def _get_participants(self, fileid): - # multidimensional dicts - def dictOfDicts(): - return defaultdict(dictOfDicts) - - xmldoc = ElementTree.parse(fileid).getroot() - # getting participants' data - pat = dictOfDicts() - for participant in xmldoc.findall( - f".//{{{NS}}}Participants/{{{NS}}}participant" - ): - for (key, value) in participant.items(): - pat[participant.get("id")][key] = value - return pat - - def age(self, fileids=None, speaker="CHI", month=False): - """ - :return: the given file(s) as string or int - :rtype: list or int - - :param month: If true, return months instead of year-month-date - """ - if not self._lazy: - return [ - self._get_age(fileid, speaker, month) - for fileid in self.abspaths(fileids) - ] - get_age = lambda fileid: self._get_age(fileid, speaker, 
month) - return LazyMap(get_age, self.abspaths(fileids)) - - def _get_age(self, fileid, speaker, month): - xmldoc = ElementTree.parse(fileid).getroot() - for pat in xmldoc.findall(f".//{{{NS}}}Participants/{{{NS}}}participant"): - try: - if pat.get("id") == speaker: - age = pat.get("age") - if month: - age = self.convert_age(age) - return age - # some files don't have age data - except (TypeError, AttributeError) as e: - return None - - def convert_age(self, age_year): - "Caclculate age in months from a string in CHILDES format" - m = re.match(r"P(\d+)Y(\d+)M?(\d?\d?)D?", age_year) - age_month = int(m.group(1)) * 12 + int(m.group(2)) - try: - if int(m.group(3)) > 15: - age_month += 1 - # some corpora don't have age information? - except ValueError as e: - pass - return age_month - - def MLU(self, fileids=None, speaker="CHI"): - """ - :return: the given file(s) as a floating number - :rtype: list(float) - """ - if not self._lazy: - return [ - self._getMLU(fileid, speaker=speaker) - for fileid in self.abspaths(fileids) - ] - get_MLU = lambda fileid: self._getMLU(fileid, speaker=speaker) - return LazyMap(get_MLU, self.abspaths(fileids)) - - def _getMLU(self, fileid, speaker): - sents = self._get_words( - fileid, - speaker=speaker, - sent=True, - stem=True, - relation=False, - pos=True, - strip_space=True, - replace=True, - ) - results = [] - lastSent = [] - numFillers = 0 - sentDiscount = 0 - for sent in sents: - posList = [pos for (word, pos) in sent] - # if any part of the sentence is intelligible - if any(pos == "unk" for pos in posList): - continue - # if the sentence is null - elif sent == []: - continue - # if the sentence is the same as the last sent - elif sent == lastSent: - continue - else: - results.append([word for (word, pos) in sent]) - # count number of fillers - if len({"co", None}.intersection(posList)) > 0: - numFillers += posList.count("co") - numFillers += posList.count(None) - sentDiscount += 1 - lastSent = sent - try: - thisWordList = flatten(results) - # count number of morphemes - # (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes) - numWords = ( - len(flatten([word.split("-") for word in thisWordList])) - numFillers - ) - numSents = len(results) - sentDiscount - mlu = numWords / numSents - except ZeroDivisionError: - mlu = 0 - # return {'mlu':mlu,'wordNum':numWords,'sentNum':numSents} - return mlu - - def _get_words( - self, fileid, speaker, sent, stem, relation, pos, strip_space, replace - ): - if ( - isinstance(speaker, str) and speaker != "ALL" - ): # ensure we have a list of speakers - speaker = [speaker] - xmldoc = ElementTree.parse(fileid).getroot() - # processing each xml doc - results = [] - for xmlsent in xmldoc.findall(".//{%s}u" % NS): - sents = [] - # select speakers - if speaker == "ALL" or xmlsent.get("who") in speaker: - for xmlword in xmlsent.findall(".//{%s}w" % NS): - infl = None - suffixStem = None - suffixTag = None - # getting replaced words - if replace and xmlsent.find(f".//{{{NS}}}w/{{{NS}}}replacement"): - xmlword = xmlsent.find( - f".//{{{NS}}}w/{{{NS}}}replacement/{{{NS}}}w" - ) - elif replace and xmlsent.find(f".//{{{NS}}}w/{{{NS}}}wk"): - xmlword = xmlsent.find(f".//{{{NS}}}w/{{{NS}}}wk") - # get text - if xmlword.text: - word = xmlword.text - else: - word = "" - # strip tailing space - if strip_space: - word = word.strip() - # stem - if relation or stem: - try: - xmlstem = xmlword.find(".//{%s}stem" % NS) - word = xmlstem.text - except AttributeError as e: - pass - # if there is an inflection - try: - xmlinfl = xmlword.find( - 
f".//{{{NS}}}mor/{{{NS}}}mw/{{{NS}}}mk" - ) - word += "-" + xmlinfl.text - except: - pass - # if there is a suffix - try: - xmlsuffix = xmlword.find( - ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem" - % (NS, NS, NS, NS) - ) - suffixStem = xmlsuffix.text - except AttributeError: - suffixStem = "" - if suffixStem: - word += "~" + suffixStem - # pos - if relation or pos: - try: - xmlpos = xmlword.findall(".//{%s}c" % NS) - xmlpos2 = xmlword.findall(".//{%s}s" % NS) - if xmlpos2 != []: - tag = xmlpos[0].text + ":" + xmlpos2[0].text - else: - tag = xmlpos[0].text - except (AttributeError, IndexError) as e: - tag = "" - try: - xmlsuffixpos = xmlword.findall( - ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c" - % (NS, NS, NS, NS, NS) - ) - xmlsuffixpos2 = xmlword.findall( - ".//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s" - % (NS, NS, NS, NS, NS) - ) - if xmlsuffixpos2: - suffixTag = ( - xmlsuffixpos[0].text + ":" + xmlsuffixpos2[0].text - ) - else: - suffixTag = xmlsuffixpos[0].text - except: - pass - if suffixTag: - tag += "~" + suffixTag - word = (word, tag) - # relational - # the gold standard is stored in - # - if relation == True: - for xmlstem_rel in xmlword.findall( - f".//{{{NS}}}mor/{{{NS}}}gra" - ): - if not xmlstem_rel.get("type") == "grt": - word = ( - word[0], - word[1], - xmlstem_rel.get("index") - + "|" - + xmlstem_rel.get("head") - + "|" - + xmlstem_rel.get("relation"), - ) - else: - word = ( - word[0], - word[1], - word[2], - word[0], - word[1], - xmlstem_rel.get("index") - + "|" - + xmlstem_rel.get("head") - + "|" - + xmlstem_rel.get("relation"), - ) - try: - for xmlpost_rel in xmlword.findall( - f".//{{{NS}}}mor/{{{NS}}}mor-post/{{{NS}}}gra" - ): - if not xmlpost_rel.get("type") == "grt": - suffixStem = ( - suffixStem[0], - suffixStem[1], - xmlpost_rel.get("index") - + "|" - + xmlpost_rel.get("head") - + "|" - + xmlpost_rel.get("relation"), - ) - else: - suffixStem = ( - suffixStem[0], - suffixStem[1], - suffixStem[2], - suffixStem[0], - suffixStem[1], - xmlpost_rel.get("index") - + "|" - + xmlpost_rel.get("head") - + "|" - + xmlpost_rel.get("relation"), - ) - except: - pass - sents.append(word) - if sent or relation: - results.append(sents) - else: - results.extend(sents) - return LazyMap(lambda x: x, results) - - # Ready-to-use browser opener - - """ - The base URL for viewing files on the childes website. This - shouldn't need to be changed, unless CHILDES changes the configuration - of their server or unless the user sets up their own corpus webserver. - """ - childes_url_base = r"https://childes.talkbank.org/browser/index.php?url=" - - def webview_file(self, fileid, urlbase=None): - """Map a corpus file to its web version on the CHILDES website, - and open it in a web browser. - - The complete URL to be used is: - childes.childes_url_base + urlbase + fileid.replace('.xml', '.cha') - - If no urlbase is passed, we try to calculate it. This - requires that the childes corpus was set up to mirror the - folder hierarchy under childes.psy.cmu.edu/data-xml/, e.g.: - nltk_data/corpora/childes/Eng-USA/Cornell/??? or - nltk_data/corpora/childes/Romance/Spanish/Aguirre/??? - - The function first looks (as a special case) if "Eng-USA" is - on the path consisting of +fileid; then if - "childes", possibly followed by "data-xml", appears. If neither - one is found, we use the unmodified fileid and hope for the best. - If this is not right, specify urlbase explicitly, e.g., if the - corpus root points to the Cornell folder, urlbase='Eng-USA/Cornell'. 
- """ - - import webbrowser - - if urlbase: - path = urlbase + "/" + fileid - else: - full = self.root + "/" + fileid - full = re.sub(r"\\", "/", full) - if "/childes/" in full.lower(): - # Discard /data-xml/ if present - path = re.findall(r"(?i)/childes(?:/data-xml)?/(.*)\.xml", full)[0] - elif "eng-usa" in full.lower(): - path = "Eng-USA/" + re.findall(r"/(?i)Eng-USA/(.*)\.xml", full)[0] - else: - path = fileid - - # Strip ".xml" and add ".cha", as necessary: - if path.endswith(".xml"): - path = path[:-4] - - if not path.endswith(".cha"): - path = path + ".cha" - - url = self.childes_url_base + path - - webbrowser.open_new_tab(url) - print("Opening in browser:", url) - # Pausing is a good idea, but it's up to the user... - # raw_input("Hit Return to continue") - - -def demo(corpus_root=None): - """ - The CHILDES corpus should be manually downloaded and saved - to ``[NLTK_Data_Dir]/corpora/childes/`` - """ - if not corpus_root: - from nltk.data import find - - corpus_root = find("corpora/childes/data-xml/Eng-USA/") - - try: - childes = CHILDESCorpusReader(corpus_root, ".*.xml") - # describe all corpus - for file in childes.fileids()[:5]: - corpus = "" - corpus_id = "" - for (key, value) in childes.corpus(file)[0].items(): - if key == "Corpus": - corpus = value - if key == "Id": - corpus_id = value - print("Reading", corpus, corpus_id, " .....") - print("words:", childes.words(file)[:7], "...") - print( - "words with replaced words:", - childes.words(file, replace=True)[:7], - " ...", - ) - print("words with pos tags:", childes.tagged_words(file)[:7], " ...") - print("words (only MOT):", childes.words(file, speaker="MOT")[:7], "...") - print("words (only CHI):", childes.words(file, speaker="CHI")[:7], "...") - print("stemmed words:", childes.words(file, stem=True)[:7], " ...") - print( - "words with relations and pos-tag:", - childes.words(file, relation=True)[:5], - " ...", - ) - print("sentence:", childes.sents(file)[:2], " ...") - for (participant, values) in childes.participants(file)[0].items(): - for (key, value) in values.items(): - print("\tparticipant", participant, key, ":", value) - print("num of sent:", len(childes.sents(file))) - print("num of morphemes:", len(childes.words(file, stem=True))) - print("age:", childes.age(file)) - print("age in month:", childes.age(file, month=True)) - print("MLU:", childes.MLU(file)) - print() - - except LookupError as e: - print( - """The CHILDES corpus, or the parts you need, should be manually - downloaded from https://childes.talkbank.org/data-xml/ and saved at - [NLTK_Data_Dir]/corpora/childes/ - Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.: - demo('/path/to/childes/data-xml/Eng-USA/") - """ - ) - # corpus_root_http = urllib2.urlopen('https://childes.talkbank.org/data-xml/Eng-USA/Bates.zip') - # corpus_root_http_bates = zipfile.ZipFile(cStringIO.StringIO(corpus_root_http.read())) - ##this fails - # childes = CHILDESCorpusReader(corpus_root_http_bates,corpus_root_http_bates.namelist()) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/corpus/reader/chunked.py b/pipeline/nltk/corpus/reader/chunked.py deleted file mode 100644 index 66b42e79ca134227357aba4cb493335196e05961..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/chunked.py +++ /dev/null @@ -1,273 +0,0 @@ -# Natural Language Toolkit: Chunked Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# Edward Loper -# URL: -# For license information, see LICENSE.TXT - -""" -A 
reader for corpora that contain chunked (and optionally tagged) -documents. -""" - -import codecs -import os.path - -import nltk -from nltk.chunk import tagstr2tree -from nltk.corpus.reader.api import * -from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader -from nltk.corpus.reader.util import * -from nltk.tokenize import * -from nltk.tree import Tree - - -class ChunkedCorpusReader(CorpusReader): - """ - Reader for chunked (and optionally tagged) corpora. Paragraphs - are split using a block reader. They are then tokenized into - sentences using a sentence tokenizer. Finally, these sentences - are parsed into chunk trees using a string-to-chunktree conversion - function. Each of these steps can be performed using a default - function or a custom function. By default, paragraphs are split - on blank lines; sentences are listed one per line; and sentences - are parsed into chunk trees using ``nltk.chunk.tagstr2tree``. - """ - - def __init__( - self, - root, - fileids, - extension="", - str2chunktree=tagstr2tree, - sent_tokenizer=RegexpTokenizer("\n", gaps=True), - para_block_reader=read_blankline_block, - encoding="utf8", - tagset=None, - ): - """ - :param root: The root directory for this corpus. - :param fileids: A list or regexp specifying the fileids in this corpus. - """ - CorpusReader.__init__(self, root, fileids, encoding) - self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader, tagset) - """Arguments for corpus views generated by this corpus: a tuple - (str2chunktree, sent_tokenizer, para_block_tokenizer)""" - - def words(self, fileids=None): - """ - :return: the given file(s) as a list of words - and punctuation symbols. - :rtype: list(str) - """ - return concat( - [ - ChunkedCorpusView(f, enc, 0, 0, 0, 0, *self._cv_args) - for (f, enc) in self.abspaths(fileids, True) - ] - ) - - def sents(self, fileids=None): - """ - :return: the given file(s) as a list of - sentences or utterances, each encoded as a list of word - strings. - :rtype: list(list(str)) - """ - return concat( - [ - ChunkedCorpusView(f, enc, 0, 1, 0, 0, *self._cv_args) - for (f, enc) in self.abspaths(fileids, True) - ] - ) - - def paras(self, fileids=None): - """ - :return: the given file(s) as a list of - paragraphs, each encoded as a list of sentences, which are - in turn encoded as lists of word strings. - :rtype: list(list(list(str))) - """ - return concat( - [ - ChunkedCorpusView(f, enc, 0, 1, 1, 0, *self._cv_args) - for (f, enc) in self.abspaths(fileids, True) - ] - ) - - def tagged_words(self, fileids=None, tagset=None): - """ - :return: the given file(s) as a list of tagged - words and punctuation symbols, encoded as tuples - ``(word,tag)``. - :rtype: list(tuple(str,str)) - """ - return concat( - [ - ChunkedCorpusView( - f, enc, 1, 0, 0, 0, *self._cv_args, target_tagset=tagset - ) - for (f, enc) in self.abspaths(fileids, True) - ] - ) - - def tagged_sents(self, fileids=None, tagset=None): - """ - :return: the given file(s) as a list of - sentences, each encoded as a list of ``(word,tag)`` tuples. - - :rtype: list(list(tuple(str,str))) - """ - return concat( - [ - ChunkedCorpusView( - f, enc, 1, 1, 0, 0, *self._cv_args, target_tagset=tagset - ) - for (f, enc) in self.abspaths(fileids, True) - ] - ) - - def tagged_paras(self, fileids=None, tagset=None): - """ - :return: the given file(s) as a list of - paragraphs, each encoded as a list of sentences, which are - in turn encoded as lists of ``(word,tag)`` tuples. 
- :rtype: list(list(list(tuple(str,str)))) - """ - return concat( - [ - ChunkedCorpusView( - f, enc, 1, 1, 1, 0, *self._cv_args, target_tagset=tagset - ) - for (f, enc) in self.abspaths(fileids, True) - ] - ) - - def chunked_words(self, fileids=None, tagset=None): - """ - :return: the given file(s) as a list of tagged - words and chunks. Words are encoded as ``(word, tag)`` - tuples (if the corpus has tags) or word strings (if the - corpus has no tags). Chunks are encoded as depth-one - trees over ``(word,tag)`` tuples or word strings. - :rtype: list(tuple(str,str) and Tree) - """ - return concat( - [ - ChunkedCorpusView( - f, enc, 1, 0, 0, 1, *self._cv_args, target_tagset=tagset - ) - for (f, enc) in self.abspaths(fileids, True) - ] - ) - - def chunked_sents(self, fileids=None, tagset=None): - """ - :return: the given file(s) as a list of - sentences, each encoded as a shallow Tree. The leaves - of these trees are encoded as ``(word, tag)`` tuples (if - the corpus has tags) or word strings (if the corpus has no - tags). - :rtype: list(Tree) - """ - return concat( - [ - ChunkedCorpusView( - f, enc, 1, 1, 0, 1, *self._cv_args, target_tagset=tagset - ) - for (f, enc) in self.abspaths(fileids, True) - ] - ) - - def chunked_paras(self, fileids=None, tagset=None): - """ - :return: the given file(s) as a list of - paragraphs, each encoded as a list of sentences, which are - in turn encoded as a shallow Tree. The leaves of these - trees are encoded as ``(word, tag)`` tuples (if the corpus - has tags) or word strings (if the corpus has no tags). - :rtype: list(list(Tree)) - """ - return concat( - [ - ChunkedCorpusView( - f, enc, 1, 1, 1, 1, *self._cv_args, target_tagset=tagset - ) - for (f, enc) in self.abspaths(fileids, True) - ] - ) - - def _read_block(self, stream): - return [tagstr2tree(t) for t in read_blankline_block(stream)] - - -class ChunkedCorpusView(StreamBackedCorpusView): - def __init__( - self, - fileid, - encoding, - tagged, - group_by_sent, - group_by_para, - chunked, - str2chunktree, - sent_tokenizer, - para_block_reader, - source_tagset=None, - target_tagset=None, - ): - StreamBackedCorpusView.__init__(self, fileid, encoding=encoding) - self._tagged = tagged - self._group_by_sent = group_by_sent - self._group_by_para = group_by_para - self._chunked = chunked - self._str2chunktree = str2chunktree - self._sent_tokenizer = sent_tokenizer - self._para_block_reader = para_block_reader - self._source_tagset = source_tagset - self._target_tagset = target_tagset - - def read_block(self, stream): - block = [] - for para_str in self._para_block_reader(stream): - para = [] - for sent_str in self._sent_tokenizer.tokenize(para_str): - sent = self._str2chunktree( - sent_str, - source_tagset=self._source_tagset, - target_tagset=self._target_tagset, - ) - - # If requested, throw away the tags. - if not self._tagged: - sent = self._untag(sent) - - # If requested, throw away the chunks. - if not self._chunked: - sent = sent.leaves() - - # Add the sentence to `para`. - if self._group_by_sent: - para.append(sent) - else: - para.extend(sent) - - # Add the paragraph to `block`. 
- if self._group_by_para: - block.append(para) - else: - block.extend(para) - - # Return the block - return block - - def _untag(self, tree): - for i, child in enumerate(tree): - if isinstance(child, Tree): - self._untag(child) - elif isinstance(child, tuple): - tree[i] = child[0] - else: - raise ValueError("expected child to be Tree or tuple") - return tree diff --git a/pipeline/nltk/corpus/reader/cmudict.py b/pipeline/nltk/corpus/reader/cmudict.py deleted file mode 100644 index 7328ca3239c6e746d328d5706dc05a09af918c14..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/cmudict.py +++ /dev/null @@ -1,88 +0,0 @@ -# Natural Language Toolkit: Carnegie Mellon Pronouncing Dictionary Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# URL: -# For license information, see LICENSE.TXT - -""" -The Carnegie Mellon Pronouncing Dictionary [cmudict.0.6] -ftp://ftp.cs.cmu.edu/project/speech/dict/ -Copyright 1998 Carnegie Mellon University - -File Format: Each line consists of an uppercased word, a counter -(for alternative pronunciations), and a transcription. Vowels are -marked for stress (1=primary, 2=secondary, 0=no stress). E.g.: -NATURAL 1 N AE1 CH ER0 AH0 L - -The dictionary contains 127069 entries. Of these, 119400 words are assigned -a unique pronunciation, 6830 words have two pronunciations, and 839 words have -three or more pronunciations. Many of these are fast-speech variants. - -Phonemes: There are 39 phonemes, as shown below: - -Phoneme Example Translation Phoneme Example Translation -------- ------- ----------- ------- ------- ----------- -AA odd AA D AE at AE T -AH hut HH AH T AO ought AO T -AW cow K AW AY hide HH AY D -B be B IY CH cheese CH IY Z -D dee D IY DH thee DH IY -EH Ed EH D ER hurt HH ER T -EY ate EY T F fee F IY -G green G R IY N HH he HH IY -IH it IH T IY eat IY T -JH gee JH IY K key K IY -L lee L IY M me M IY -N knee N IY NG ping P IH NG -OW oat OW T OY toy T OY -P pee P IY R read R IY D -S sea S IY SH she SH IY -T tea T IY TH theta TH EY T AH -UH hood HH UH D UW two T UW -V vee V IY W we W IY -Y yield Y IY L D Z zee Z IY -ZH seizure S IY ZH ER -""" - -from nltk.corpus.reader.api import * -from nltk.corpus.reader.util import * -from nltk.util import Index - - -class CMUDictCorpusReader(CorpusReader): - def entries(self): - """ - :return: the cmudict lexicon as a list of entries - containing (word, transcriptions) tuples. - """ - return concat( - [ - StreamBackedCorpusView(fileid, read_cmudict_block, encoding=enc) - for fileid, enc in self.abspaths(None, True) - ] - ) - - def words(self): - """ - :return: a list of all words defined in the cmudict lexicon. - """ - return [word.lower() for (word, _) in self.entries()] - - def dict(self): - """ - :return: the cmudict lexicon as a dictionary, whose keys are - lowercase words and whose values are lists of pronunciations. - """ - return dict(Index(self.entries())) - - -def read_cmudict_block(stream): - entries = [] - while len(entries) < 100: # Read 100 at a time. - line = stream.readline() - if line == "": - return entries # end of file. 
-            pieces = line.split()
-            entries.append((pieces[0].lower(), pieces[2:]))
-    return entries
diff --git a/pipeline/nltk/corpus/reader/comparative_sents.py b/pipeline/nltk/corpus/reader/comparative_sents.py
deleted file mode 100644
index 032ce82c3b2a6a4011c9b1637b882db2df1bcd55..0000000000000000000000000000000000000000
--- a/pipeline/nltk/corpus/reader/comparative_sents.py
+++ /dev/null
@@ -1,309 +0,0 @@
-# Natural Language Toolkit: Comparative Sentence Corpus Reader
-#
-# Copyright (C) 2001-2023 NLTK Project
-# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
-# URL: <https://www.nltk.org/>
-# For license information, see LICENSE.TXT
-
-"""
-CorpusReader for the Comparative Sentence Dataset.
-
-- Comparative Sentence Dataset information -
-
-Annotated by: Nitin Jindal and Bing Liu, 2006.
-              Department of Computer Science
-              University of Illinois at Chicago
-
-Contact: Nitin Jindal, njindal@cs.uic.edu
-         Bing Liu, liub@cs.uic.edu (https://www.cs.uic.edu/~liub)
-
-Distributed with permission.
-
-Related papers:
-
-- Nitin Jindal and Bing Liu. "Identifying Comparative Sentences in Text Documents".
-   Proceedings of the ACM SIGIR International Conference on Information Retrieval
-   (SIGIR-06), 2006.
-
-- Nitin Jindal and Bing Liu. "Mining Comparative Sentences and Relations".
-   Proceedings of Twenty First National Conference on Artificial Intelligence
-   (AAAI-2006), 2006.
-
-- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
-   Proceedings of the 22nd International Conference on Computational Linguistics
-   (Coling-2008), Manchester, 18-22 August, 2008.
-"""
-import re
-
-from nltk.corpus.reader.api import *
-from nltk.tokenize import *
-
-# Regular expressions for dataset components
-STARS = re.compile(r"^\*+$")
-COMPARISON = re.compile(r"<cs-[1234]>")
-CLOSE_COMPARISON = re.compile(r"</cs-[1234]>")
-GRAD_COMPARISON = re.compile(r"<cs-[123]>")
-NON_GRAD_COMPARISON = re.compile(r"<cs-4>")
-ENTITIES_FEATS = re.compile(r"(\d)_((?:[\.\w\s/-](?!\d_))+)")
-KEYWORD = re.compile(r"\(([^\(]*)\)$")
-
-
-class Comparison:
-    """
-    A Comparison represents a comparative sentence and its constituents.
-    """
-
-    def __init__(
-        self,
-        text=None,
-        comp_type=None,
-        entity_1=None,
-        entity_2=None,
-        feature=None,
-        keyword=None,
-    ):
-        """
-        :param text: a string (optionally tokenized) containing a comparison.
-        :param comp_type: an integer defining the type of comparison expressed.
-            Values can be: 1 (Non-equal gradable), 2 (Equative), 3 (Superlative),
-            4 (Non-gradable).
-        :param entity_1: the first entity considered in the comparison relation.
-        :param entity_2: the second entity considered in the comparison relation.
-        :param feature: the feature considered in the comparison relation.
-        :param keyword: the word or phrase which is used for that comparative relation.
-        """
-        self.text = text
-        self.comp_type = comp_type
-        self.entity_1 = entity_1
-        self.entity_2 = entity_2
-        self.feature = feature
-        self.keyword = keyword
-
-    def __repr__(self):
-        return (
-            'Comparison(text="{}", comp_type={}, entity_1="{}", entity_2="{}", '
-            'feature="{}", keyword="{}")'
-        ).format(
-            self.text,
-            self.comp_type,
-            self.entity_1,
-            self.entity_2,
-            self.feature,
-            self.keyword,
-        )
-
-
-class ComparativeSentencesCorpusReader(CorpusReader):
-    """
-    Reader for the Comparative Sentence Dataset by Jindal and Liu (2006).
- - >>> from nltk.corpus import comparative_sentences - >>> comparison = comparative_sentences.comparisons()[0] - >>> comparison.text # doctest: +NORMALIZE_WHITESPACE - ['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly', - 'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve", - 'had', '.'] - >>> comparison.entity_2 - 'models' - >>> (comparison.feature, comparison.keyword) - ('rewind', 'more') - >>> len(comparative_sentences.comparisons()) - 853 - """ - - CorpusView = StreamBackedCorpusView - - def __init__( - self, - root, - fileids, - word_tokenizer=WhitespaceTokenizer(), - sent_tokenizer=None, - encoding="utf8", - ): - """ - :param root: The root directory for this corpus. - :param fileids: a list or regexp specifying the fileids in this corpus. - :param word_tokenizer: tokenizer for breaking sentences or paragraphs - into words. Default: `WhitespaceTokenizer` - :param sent_tokenizer: tokenizer for breaking paragraphs into sentences. - :param encoding: the encoding that should be used to read the corpus. - """ - - CorpusReader.__init__(self, root, fileids, encoding) - self._word_tokenizer = word_tokenizer - self._sent_tokenizer = sent_tokenizer - self._readme = "README.txt" - - def comparisons(self, fileids=None): - """ - Return all comparisons in the corpus. - - :param fileids: a list or regexp specifying the ids of the files whose - comparisons have to be returned. - :return: the given file(s) as a list of Comparison objects. - :rtype: list(Comparison) - """ - if fileids is None: - fileids = self._fileids - elif isinstance(fileids, str): - fileids = [fileids] - return concat( - [ - self.CorpusView(path, self._read_comparison_block, encoding=enc) - for (path, enc, fileid) in self.abspaths(fileids, True, True) - ] - ) - - def keywords(self, fileids=None): - """ - Return a set of all keywords used in the corpus. - - :param fileids: a list or regexp specifying the ids of the files whose - keywords have to be returned. - :return: the set of keywords and comparative phrases used in the corpus. - :rtype: set(str) - """ - all_keywords = concat( - [ - self.CorpusView(path, self._read_keyword_block, encoding=enc) - for (path, enc, fileid) in self.abspaths(fileids, True, True) - ] - ) - - keywords_set = {keyword.lower() for keyword in all_keywords if keyword} - return keywords_set - - def keywords_readme(self): - """ - Return the list of words and constituents considered as clues of a - comparison (from listOfkeywords.txt). - """ - keywords = [] - with self.open("listOfkeywords.txt") as fp: - raw_text = fp.read() - for line in raw_text.split("\n"): - if not line or line.startswith("//"): - continue - keywords.append(line.strip()) - return keywords - - def sents(self, fileids=None): - """ - Return all sentences in the corpus. - - :param fileids: a list or regexp specifying the ids of the files whose - sentences have to be returned. - :return: all sentences of the corpus as lists of tokens (or as plain - strings, if no word tokenizer is specified). - :rtype: list(list(str)) or list(str) - """ - return concat( - [ - self.CorpusView(path, self._read_sent_block, encoding=enc) - for (path, enc, fileid) in self.abspaths(fileids, True, True) - ] - ) - - def words(self, fileids=None): - """ - Return all words and punctuation symbols in the corpus. - - :param fileids: a list or regexp specifying the ids of the files whose - words have to be returned. - :return: the given file(s) as a list of words and punctuation symbols. 
-        :rtype: list(str)
-        """
-        return concat(
-            [
-                self.CorpusView(path, self._read_word_block, encoding=enc)
-                for (path, enc, fileid) in self.abspaths(fileids, True, True)
-            ]
-        )
-
-    def _read_comparison_block(self, stream):
-        while True:
-            line = stream.readline()
-            if not line:
-                return []  # end of file.
-            comparison_tags = re.findall(COMPARISON, line)
-            if comparison_tags:
-                grad_comparisons = re.findall(GRAD_COMPARISON, line)
-                non_grad_comparisons = re.findall(NON_GRAD_COMPARISON, line)
-                # Advance to the next line (it contains the comparative sentence)
-                comparison_text = stream.readline().strip()
-                if self._word_tokenizer:
-                    comparison_text = self._word_tokenizer.tokenize(comparison_text)
-                # Skip the next line (it contains closing comparison tags)
-                stream.readline()
-                # If gradable comparisons are found, create Comparison instances
-                # and populate their fields
-                comparison_bundle = []
-                if grad_comparisons:
-                    # Each comparison tag has its own relations on a separate line
-                    for comp in grad_comparisons:
-                        comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
-                        comparison = Comparison(
-                            text=comparison_text, comp_type=comp_type
-                        )
-                        line = stream.readline()
-                        entities_feats = ENTITIES_FEATS.findall(line)
-                        if entities_feats:
-                            for (code, entity_feat) in entities_feats:
-                                if code == "1":
-                                    comparison.entity_1 = entity_feat.strip()
-                                elif code == "2":
-                                    comparison.entity_2 = entity_feat.strip()
-                                elif code == "3":
-                                    comparison.feature = entity_feat.strip()
-                        keyword = KEYWORD.findall(line)
-                        if keyword:
-                            comparison.keyword = keyword[0]
-                        comparison_bundle.append(comparison)
-                # If non-gradable comparisons are found, create a simple Comparison
-                # instance for each one
-                if non_grad_comparisons:
-                    for comp in non_grad_comparisons:
-                        # comp_type in this case should always be 4.
-                        comp_type = int(re.match(r"<cs-(\d)>", comp).group(1))
-                        comparison = Comparison(
-                            text=comparison_text, comp_type=comp_type
-                        )
-                        comparison_bundle.append(comparison)
-                # Flatten the list of comparisons before returning them
-                # return concat([comparison_bundle])
-                return comparison_bundle
-
-    def _read_keyword_block(self, stream):
-        keywords = []
-        for comparison in self._read_comparison_block(stream):
-            keywords.append(comparison.keyword)
-        return keywords
-
-    def _read_sent_block(self, stream):
-        while True:
-            line = stream.readline()
-            if re.match(STARS, line):
-                while True:
-                    line = stream.readline()
-                    if re.match(STARS, line):
-                        break
-                continue
-            if (
-                not re.findall(COMPARISON, line)
-                and not ENTITIES_FEATS.findall(line)
-                and not re.findall(CLOSE_COMPARISON, line)
-            ):
-                if self._sent_tokenizer:
-                    return [
-                        self._word_tokenizer.tokenize(sent)
-                        for sent in self._sent_tokenizer.tokenize(line)
-                    ]
-                else:
-                    return [self._word_tokenizer.tokenize(line)]
-
-    def _read_word_block(self, stream):
-        words = []
-        for sent in self._read_sent_block(stream):
-            words.extend(sent)
-        return words
diff --git a/pipeline/nltk/corpus/reader/conll.py b/pipeline/nltk/corpus/reader/conll.py
deleted file mode 100644
index 3c3b30db900ee4eb4648b74d5904af04b60e1692..0000000000000000000000000000000000000000
--- a/pipeline/nltk/corpus/reader/conll.py
+++ /dev/null
@@ -1,579 +0,0 @@
-# Natural Language Toolkit: CONLL Corpus Reader
-#
-# Copyright (C) 2001-2023 NLTK Project
-# Author: Steven Bird
-#         Edward Loper
-# URL: <https://www.nltk.org/>
-# For license information, see LICENSE.TXT
-
-"""
-Read CoNLL-style chunk fileids.
-""" - -import textwrap - -from nltk.corpus.reader.api import * -from nltk.corpus.reader.util import * -from nltk.tag import map_tag -from nltk.tree import Tree -from nltk.util import LazyConcatenation, LazyMap - - -class ConllCorpusReader(CorpusReader): - """ - A corpus reader for CoNLL-style files. These files consist of a - series of sentences, separated by blank lines. Each sentence is - encoded using a table (or "grid") of values, where each line - corresponds to a single word, and each column corresponds to an - annotation type. The set of columns used by CoNLL-style files can - vary from corpus to corpus; the ``ConllCorpusReader`` constructor - therefore takes an argument, ``columntypes``, which is used to - specify the columns that are used by a given corpus. By default - columns are split by consecutive whitespaces, with the - ``separator`` argument you can set a string to split by (e.g. - ``\'\t\'``). - - - @todo: Add support for reading from corpora where different - parallel files contain different columns. - @todo: Possibly add caching of the grid corpus view? This would - allow the same grid view to be used by different data access - methods (eg words() and parsed_sents() could both share the - same grid corpus view object). - @todo: Better support for -DOCSTART-. Currently, we just ignore - it, but it could be used to define methods that retrieve a - document at a time (eg parsed_documents()). - """ - - # ///////////////////////////////////////////////////////////////// - # Column Types - # ///////////////////////////////////////////////////////////////// - - WORDS = "words" #: column type for words - POS = "pos" #: column type for part-of-speech tags - TREE = "tree" #: column type for parse trees - CHUNK = "chunk" #: column type for chunk structures - NE = "ne" #: column type for named entities - SRL = "srl" #: column type for semantic role labels - IGNORE = "ignore" #: column type for column that should be ignored - - #: A list of all column types supported by the conll corpus reader. 
- COLUMN_TYPES = (WORDS, POS, TREE, CHUNK, NE, SRL, IGNORE) - - # ///////////////////////////////////////////////////////////////// - # Constructor - # ///////////////////////////////////////////////////////////////// - - def __init__( - self, - root, - fileids, - columntypes, - chunk_types=None, - root_label="S", - pos_in_tree=False, - srl_includes_roleset=True, - encoding="utf8", - tree_class=Tree, - tagset=None, - separator=None, - ): - for columntype in columntypes: - if columntype not in self.COLUMN_TYPES: - raise ValueError("Bad column type %r" % columntype) - if isinstance(chunk_types, str): - chunk_types = [chunk_types] - self._chunk_types = chunk_types - self._colmap = {c: i for (i, c) in enumerate(columntypes)} - self._pos_in_tree = pos_in_tree - self._root_label = root_label # for chunks - self._srl_includes_roleset = srl_includes_roleset - self._tree_class = tree_class - CorpusReader.__init__(self, root, fileids, encoding) - self._tagset = tagset - self.sep = separator - - # ///////////////////////////////////////////////////////////////// - # Data Access Methods - # ///////////////////////////////////////////////////////////////// - - def words(self, fileids=None): - self._require(self.WORDS) - return LazyConcatenation(LazyMap(self._get_words, self._grids(fileids))) - - def sents(self, fileids=None): - self._require(self.WORDS) - return LazyMap(self._get_words, self._grids(fileids)) - - def tagged_words(self, fileids=None, tagset=None): - self._require(self.WORDS, self.POS) - - def get_tagged_words(grid): - return self._get_tagged_words(grid, tagset) - - return LazyConcatenation(LazyMap(get_tagged_words, self._grids(fileids))) - - def tagged_sents(self, fileids=None, tagset=None): - self._require(self.WORDS, self.POS) - - def get_tagged_words(grid): - return self._get_tagged_words(grid, tagset) - - return LazyMap(get_tagged_words, self._grids(fileids)) - - def chunked_words(self, fileids=None, chunk_types=None, tagset=None): - self._require(self.WORDS, self.POS, self.CHUNK) - if chunk_types is None: - chunk_types = self._chunk_types - - def get_chunked_words(grid): # capture chunk_types as local var - return self._get_chunked_words(grid, chunk_types, tagset) - - return LazyConcatenation(LazyMap(get_chunked_words, self._grids(fileids))) - - def chunked_sents(self, fileids=None, chunk_types=None, tagset=None): - self._require(self.WORDS, self.POS, self.CHUNK) - if chunk_types is None: - chunk_types = self._chunk_types - - def get_chunked_words(grid): # capture chunk_types as local var - return self._get_chunked_words(grid, chunk_types, tagset) - - return LazyMap(get_chunked_words, self._grids(fileids)) - - def parsed_sents(self, fileids=None, pos_in_tree=None, tagset=None): - self._require(self.WORDS, self.POS, self.TREE) - if pos_in_tree is None: - pos_in_tree = self._pos_in_tree - - def get_parsed_sent(grid): # capture pos_in_tree as local var - return self._get_parsed_sent(grid, pos_in_tree, tagset) - - return LazyMap(get_parsed_sent, self._grids(fileids)) - - def srl_spans(self, fileids=None): - self._require(self.SRL) - return LazyMap(self._get_srl_spans, self._grids(fileids)) - - def srl_instances(self, fileids=None, pos_in_tree=None, flatten=True): - self._require(self.WORDS, self.POS, self.TREE, self.SRL) - if pos_in_tree is None: - pos_in_tree = self._pos_in_tree - - def get_srl_instances(grid): # capture pos_in_tree as local var - return self._get_srl_instances(grid, pos_in_tree) - - result = LazyMap(get_srl_instances, self._grids(fileids)) - if flatten: - result = 
LazyConcatenation(result) - return result - - def iob_words(self, fileids=None, tagset=None): - """ - :return: a list of word/tag/IOB tuples - :rtype: list(tuple) - :param fileids: the list of fileids that make up this corpus - :type fileids: None or str or list - """ - self._require(self.WORDS, self.POS, self.CHUNK) - - def get_iob_words(grid): - return self._get_iob_words(grid, tagset) - - return LazyConcatenation(LazyMap(get_iob_words, self._grids(fileids))) - - def iob_sents(self, fileids=None, tagset=None): - """ - :return: a list of lists of word/tag/IOB tuples - :rtype: list(list) - :param fileids: the list of fileids that make up this corpus - :type fileids: None or str or list - """ - self._require(self.WORDS, self.POS, self.CHUNK) - - def get_iob_words(grid): - return self._get_iob_words(grid, tagset) - - return LazyMap(get_iob_words, self._grids(fileids)) - - # ///////////////////////////////////////////////////////////////// - # Grid Reading - # ///////////////////////////////////////////////////////////////// - - def _grids(self, fileids=None): - # n.b.: we could cache the object returned here (keyed on - # fileids), which would let us reuse the same corpus view for - # different things (eg srl and parse trees). - return concat( - [ - StreamBackedCorpusView(fileid, self._read_grid_block, encoding=enc) - for (fileid, enc) in self.abspaths(fileids, True) - ] - ) - - def _read_grid_block(self, stream): - grids = [] - for block in read_blankline_block(stream): - block = block.strip() - if not block: - continue - - grid = [line.split(self.sep) for line in block.split("\n")] - - # If there's a docstart row, then discard. ([xx] eventually it - # would be good to actually use it) - if grid[0][self._colmap.get("words", 0)] == "-DOCSTART-": - del grid[0] - - # Check that the grid is consistent. - for row in grid: - if len(row) != len(grid[0]): - raise ValueError("Inconsistent number of columns:\n%s" % block) - grids.append(grid) - return grids - - # ///////////////////////////////////////////////////////////////// - # Transforms - # ///////////////////////////////////////////////////////////////// - # given a grid, transform it into some representation (e.g., - # a list of words or a parse tree). - - def _get_words(self, grid): - return self._get_column(grid, self._colmap["words"]) - - def _get_tagged_words(self, grid, tagset=None): - pos_tags = self._get_column(grid, self._colmap["pos"]) - if tagset and tagset != self._tagset: - pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags] - return list(zip(self._get_column(grid, self._colmap["words"]), pos_tags)) - - def _get_iob_words(self, grid, tagset=None): - pos_tags = self._get_column(grid, self._colmap["pos"]) - if tagset and tagset != self._tagset: - pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags] - return list( - zip( - self._get_column(grid, self._colmap["words"]), - pos_tags, - self._get_column(grid, self._colmap["chunk"]), - ) - ) - - def _get_chunked_words(self, grid, chunk_types, tagset=None): - # n.b.: this method is very similar to conllstr2tree. 
- words = self._get_column(grid, self._colmap["words"]) - pos_tags = self._get_column(grid, self._colmap["pos"]) - if tagset and tagset != self._tagset: - pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags] - chunk_tags = self._get_column(grid, self._colmap["chunk"]) - - stack = [Tree(self._root_label, [])] - - for (word, pos_tag, chunk_tag) in zip(words, pos_tags, chunk_tags): - if chunk_tag == "O": - state, chunk_type = "O", "" - else: - (state, chunk_type) = chunk_tag.split("-") - # If it's a chunk we don't care about, treat it as O. - if chunk_types is not None and chunk_type not in chunk_types: - state = "O" - # Treat a mismatching I like a B. - if state == "I" and chunk_type != stack[-1].label(): - state = "B" - # For B or I: close any open chunks - if state in "BO" and len(stack) == 2: - stack.pop() - # For B: start a new chunk. - if state == "B": - new_chunk = Tree(chunk_type, []) - stack[-1].append(new_chunk) - stack.append(new_chunk) - # Add the word token. - stack[-1].append((word, pos_tag)) - - return stack[0] - - def _get_parsed_sent(self, grid, pos_in_tree, tagset=None): - words = self._get_column(grid, self._colmap["words"]) - pos_tags = self._get_column(grid, self._colmap["pos"]) - if tagset and tagset != self._tagset: - pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags] - parse_tags = self._get_column(grid, self._colmap["tree"]) - - treestr = "" - for (word, pos_tag, parse_tag) in zip(words, pos_tags, parse_tags): - if word == "(": - word = "-LRB-" - if word == ")": - word = "-RRB-" - if pos_tag == "(": - pos_tag = "-LRB-" - if pos_tag == ")": - pos_tag = "-RRB-" - (left, right) = parse_tag.split("*") - right = right.count(")") * ")" # only keep ')'. - treestr += f"{left} ({pos_tag} {word}) {right}" - try: - tree = self._tree_class.fromstring(treestr) - except (ValueError, IndexError): - tree = self._tree_class.fromstring(f"({self._root_label} {treestr})") - - if not pos_in_tree: - for subtree in tree.subtrees(): - for i, child in enumerate(subtree): - if ( - isinstance(child, Tree) - and len(child) == 1 - and isinstance(child[0], str) - ): - subtree[i] = (child[0], child.label()) - - return tree - - def _get_srl_spans(self, grid): - """ - list of list of (start, end), tag) tuples - """ - if self._srl_includes_roleset: - predicates = self._get_column(grid, self._colmap["srl"] + 1) - start_col = self._colmap["srl"] + 2 - else: - predicates = self._get_column(grid, self._colmap["srl"]) - start_col = self._colmap["srl"] + 1 - - # Count how many predicates there are. This tells us how many - # columns to expect for SRL data. 
- num_preds = len([p for p in predicates if p != "-"]) - - spanlists = [] - for i in range(num_preds): - col = self._get_column(grid, start_col + i) - spanlist = [] - stack = [] - for wordnum, srl_tag in enumerate(col): - (left, right) = srl_tag.split("*") - for tag in left.split("("): - if tag: - stack.append((tag, wordnum)) - for i in range(right.count(")")): - (tag, start) = stack.pop() - spanlist.append(((start, wordnum + 1), tag)) - spanlists.append(spanlist) - - return spanlists - - def _get_srl_instances(self, grid, pos_in_tree): - tree = self._get_parsed_sent(grid, pos_in_tree) - spanlists = self._get_srl_spans(grid) - if self._srl_includes_roleset: - predicates = self._get_column(grid, self._colmap["srl"] + 1) - rolesets = self._get_column(grid, self._colmap["srl"]) - else: - predicates = self._get_column(grid, self._colmap["srl"]) - rolesets = [None] * len(predicates) - - instances = ConllSRLInstanceList(tree) - for wordnum, predicate in enumerate(predicates): - if predicate == "-": - continue - # Decide which spanlist to use. Don't assume that they're - # sorted in the same order as the predicates (even though - # they usually are). - for spanlist in spanlists: - for (start, end), tag in spanlist: - if wordnum in range(start, end) and tag in ("V", "C-V"): - break - else: - continue - break - else: - raise ValueError("No srl column found for %r" % predicate) - instances.append( - ConllSRLInstance(tree, wordnum, predicate, rolesets[wordnum], spanlist) - ) - - return instances - - # ///////////////////////////////////////////////////////////////// - # Helper Methods - # ///////////////////////////////////////////////////////////////// - - def _require(self, *columntypes): - for columntype in columntypes: - if columntype not in self._colmap: - raise ValueError( - "This corpus does not contain a %s " "column." % columntype - ) - - @staticmethod - def _get_column(grid, column_index): - return [grid[i][column_index] for i in range(len(grid))] - - -class ConllSRLInstance: - """ - An SRL instance from a CoNLL corpus, which identifies and - providing labels for the arguments of a single verb. - """ - - # [xx] add inst.core_arguments, inst.argm_arguments? - - def __init__(self, tree, verb_head, verb_stem, roleset, tagged_spans): - self.verb = [] - """A list of the word indices of the words that compose the - verb whose arguments are identified by this instance. - This will contain multiple word indices when multi-word - verbs are used (e.g. 'turn on').""" - - self.verb_head = verb_head - """The word index of the head word of the verb whose arguments - are identified by this instance. E.g., for a sentence that - uses the verb 'turn on,' ``verb_head`` will be the word index - of the word 'turn'.""" - - self.verb_stem = verb_stem - - self.roleset = roleset - - self.arguments = [] - """A list of ``(argspan, argid)`` tuples, specifying the location - and type for each of the arguments identified by this - instance. ``argspan`` is a tuple ``start, end``, indicating - that the argument consists of the ``words[start:end]``.""" - - self.tagged_spans = tagged_spans - """A list of ``(span, id)`` tuples, specifying the location and - type for each of the arguments, as well as the verb pieces, - that make up this instance.""" - - self.tree = tree - """The parse tree for the sentence containing this instance.""" - - self.words = tree.leaves() - """A list of the words in the sentence containing this - instance.""" - - # Fill in the self.verb and self.arguments values. 
- for (start, end), tag in tagged_spans: - if tag in ("V", "C-V"): - self.verb += list(range(start, end)) - else: - self.arguments.append(((start, end), tag)) - - def __repr__(self): - # Originally, its: - ##plural = 's' if len(self.arguments) != 1 else '' - plural = "s" if len(self.arguments) != 1 else "" - return "" % ( - (self.verb_stem, len(self.arguments), plural) - ) - - def pprint(self): - verbstr = " ".join(self.words[i][0] for i in self.verb) - hdr = f"SRL for {verbstr!r} (stem={self.verb_stem!r}):\n" - s = "" - for i, word in enumerate(self.words): - if isinstance(word, tuple): - word = word[0] - for (start, end), argid in self.arguments: - if i == start: - s += "[%s " % argid - if i == end: - s += "] " - if i in self.verb: - word = "<<%s>>" % word - s += word + " " - return hdr + textwrap.fill( - s.replace(" ]", "]"), initial_indent=" ", subsequent_indent=" " - ) - - -class ConllSRLInstanceList(list): - """ - Set of instances for a single sentence - """ - - def __init__(self, tree, instances=()): - self.tree = tree - list.__init__(self, instances) - - def __str__(self): - return self.pprint() - - def pprint(self, include_tree=False): - # Sanity check: trees should be the same - for inst in self: - if inst.tree != self.tree: - raise ValueError("Tree mismatch!") - - # If desired, add trees: - if include_tree: - words = self.tree.leaves() - pos = [None] * len(words) - synt = ["*"] * len(words) - self._tree2conll(self.tree, 0, words, pos, synt) - - s = "" - for i in range(len(words)): - # optional tree columns - if include_tree: - s += "%-20s " % words[i] - s += "%-8s " % pos[i] - s += "%15s*%-8s " % tuple(synt[i].split("*")) - - # verb head column - for inst in self: - if i == inst.verb_head: - s += "%-20s " % inst.verb_stem - break - else: - s += "%-20s " % "-" - # Remaining columns: self - for inst in self: - argstr = "*" - for (start, end), argid in inst.tagged_spans: - if i == start: - argstr = f"({argid}{argstr}" - if i == (end - 1): - argstr += ")" - s += "%-12s " % argstr - s += "\n" - return s - - def _tree2conll(self, tree, wordnum, words, pos, synt): - assert isinstance(tree, Tree) - if len(tree) == 1 and isinstance(tree[0], str): - pos[wordnum] = tree.label() - assert words[wordnum] == tree[0] - return wordnum + 1 - elif len(tree) == 1 and isinstance(tree[0], tuple): - assert len(tree[0]) == 2 - pos[wordnum], pos[wordnum] = tree[0] - return wordnum + 1 - else: - synt[wordnum] = f"({tree.label()}{synt[wordnum]}" - for child in tree: - wordnum = self._tree2conll(child, wordnum, words, pos, synt) - synt[wordnum - 1] += ")" - return wordnum - - -class ConllChunkCorpusReader(ConllCorpusReader): - """ - A ConllCorpusReader whose data file contains three columns: words, - pos, and chunk. 
- """ - - def __init__( - self, root, fileids, chunk_types, encoding="utf8", tagset=None, separator=None - ): - ConllCorpusReader.__init__( - self, - root, - fileids, - ("words", "pos", "chunk"), - chunk_types=chunk_types, - encoding=encoding, - tagset=tagset, - separator=separator, - ) diff --git a/pipeline/nltk/corpus/reader/crubadan.py b/pipeline/nltk/corpus/reader/crubadan.py deleted file mode 100644 index d7bcf8a05cf86123ce952e802a71bb5dd637bd42..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/crubadan.py +++ /dev/null @@ -1,106 +0,0 @@ -# Natural Language Toolkit: An Crubadan N-grams Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Avital Pekker -# -# URL: -# For license information, see LICENSE.TXT - -""" -An NLTK interface for the n-gram statistics gathered from -the corpora for each language using An Crubadan. - -There are multiple potential applications for the data but -this reader was created with the goal of using it in the -context of language identification. - -For details about An Crubadan, this data, and its potential uses, see: -http://borel.slu.edu/crubadan/index.html -""" - -import re -from os import path - -from nltk.corpus.reader import CorpusReader -from nltk.data import ZipFilePathPointer -from nltk.probability import FreqDist - - -class CrubadanCorpusReader(CorpusReader): - """ - A corpus reader used to access language An Crubadan n-gram files. - """ - - _LANG_MAPPER_FILE = "table.txt" - _all_lang_freq = {} - - def __init__(self, root, fileids, encoding="utf8", tagset=None): - super().__init__(root, fileids, encoding="utf8") - self._lang_mapping_data = [] - self._load_lang_mapping_data() - - def lang_freq(self, lang): - """Return n-gram FreqDist for a specific language - given ISO 639-3 language code""" - - if lang not in self._all_lang_freq: - self._all_lang_freq[lang] = self._load_lang_ngrams(lang) - - return self._all_lang_freq[lang] - - def langs(self): - """Return a list of supported languages as ISO 639-3 codes""" - return [row[1] for row in self._lang_mapping_data] - - def iso_to_crubadan(self, lang): - """Return internal Crubadan code based on ISO 639-3 code""" - for i in self._lang_mapping_data: - if i[1].lower() == lang.lower(): - return i[0] - - def crubadan_to_iso(self, lang): - """Return ISO 639-3 code given internal Crubadan code""" - for i in self._lang_mapping_data: - if i[0].lower() == lang.lower(): - return i[1] - - def _load_lang_mapping_data(self): - """Load language mappings between codes and description from table.txt""" - if isinstance(self.root, ZipFilePathPointer): - raise RuntimeError( - "Please install the 'crubadan' corpus first, use nltk.download()" - ) - - mapper_file = path.join(self.root, self._LANG_MAPPER_FILE) - if self._LANG_MAPPER_FILE not in self.fileids(): - raise RuntimeError("Could not find language mapper file: " + mapper_file) - - with open(mapper_file, encoding="utf-8") as raw: - strip_raw = raw.read().strip() - - self._lang_mapping_data = [row.split("\t") for row in strip_raw.split("\n")] - - def _load_lang_ngrams(self, lang): - """Load single n-gram language file given the ISO 639-3 language code - and return its FreqDist""" - - if lang not in self.langs(): - raise RuntimeError("Unsupported language.") - - crubadan_code = self.iso_to_crubadan(lang) - ngram_file = path.join(self.root, crubadan_code + "-3grams.txt") - - if not path.isfile(ngram_file): - raise RuntimeError("No N-gram file found for requested language.") - - counts = FreqDist() - with open(ngram_file, encoding="utf-8") 
as f: - for line in f: - data = line.split(" ") - - ngram = data[1].strip("\n") - freq = int(data[0]) - - counts[ngram] = freq - - return counts diff --git a/pipeline/nltk/corpus/reader/dependency.py b/pipeline/nltk/corpus/reader/dependency.py deleted file mode 100644 index 87f56d4b5410a6dc419cd58538d3f4499478a205..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/dependency.py +++ /dev/null @@ -1,115 +0,0 @@ -# Natural Language Toolkit: Dependency Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Kepa Sarasola -# Iker Manterola -# -# URL: -# For license information, see LICENSE.TXT - -from nltk.corpus.reader.api import * -from nltk.corpus.reader.util import * -from nltk.parse import DependencyGraph -from nltk.tokenize import * - - -class DependencyCorpusReader(SyntaxCorpusReader): - def __init__( - self, - root, - fileids, - encoding="utf8", - word_tokenizer=TabTokenizer(), - sent_tokenizer=RegexpTokenizer("\n", gaps=True), - para_block_reader=read_blankline_block, - ): - SyntaxCorpusReader.__init__(self, root, fileids, encoding) - - ######################################################### - - def words(self, fileids=None): - return concat( - [ - DependencyCorpusView(fileid, False, False, False, encoding=enc) - for fileid, enc in self.abspaths(fileids, include_encoding=True) - ] - ) - - def tagged_words(self, fileids=None): - return concat( - [ - DependencyCorpusView(fileid, True, False, False, encoding=enc) - for fileid, enc in self.abspaths(fileids, include_encoding=True) - ] - ) - - def sents(self, fileids=None): - return concat( - [ - DependencyCorpusView(fileid, False, True, False, encoding=enc) - for fileid, enc in self.abspaths(fileids, include_encoding=True) - ] - ) - - def tagged_sents(self, fileids=None): - return concat( - [ - DependencyCorpusView(fileid, True, True, False, encoding=enc) - for fileid, enc in self.abspaths(fileids, include_encoding=True) - ] - ) - - def parsed_sents(self, fileids=None): - sents = concat( - [ - DependencyCorpusView(fileid, False, True, True, encoding=enc) - for fileid, enc in self.abspaths(fileids, include_encoding=True) - ] - ) - return [DependencyGraph(sent) for sent in sents] - - -class DependencyCorpusView(StreamBackedCorpusView): - _DOCSTART = "-DOCSTART- -DOCSTART- O\n" # dokumentu hasiera definitzen da - - def __init__( - self, - corpus_file, - tagged, - group_by_sent, - dependencies, - chunk_types=None, - encoding="utf8", - ): - self._tagged = tagged - self._dependencies = dependencies - self._group_by_sent = group_by_sent - self._chunk_types = chunk_types - StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding) - - def read_block(self, stream): - # Read the next sentence. - sent = read_blankline_block(stream)[0].strip() - # Strip off the docstart marker, if present. - if sent.startswith(self._DOCSTART): - sent = sent[len(self._DOCSTART) :].lstrip() - - # extract word and tag from any of the formats - if not self._dependencies: - lines = [line.split("\t") for line in sent.split("\n")] - if len(lines[0]) == 3 or len(lines[0]) == 4: - sent = [(line[0], line[1]) for line in lines] - elif len(lines[0]) == 10: - sent = [(line[1], line[4]) for line in lines] - else: - raise ValueError("Unexpected number of fields in dependency tree file") - - # discard tags if they weren't requested - if not self._tagged: - sent = [word for (word, tag) in sent] - - # Return the result. 
- if self._group_by_sent: - return [sent] - else: - return list(sent) diff --git a/pipeline/nltk/corpus/reader/framenet.py b/pipeline/nltk/corpus/reader/framenet.py deleted file mode 100644 index 6eaa1ad8931ab407bac92d0ea3e6f2e60f74d0e1..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/framenet.py +++ /dev/null @@ -1,3442 +0,0 @@ -# Natural Language Toolkit: Framenet Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Authors: Chuck Wooters , -# Nathan Schneider -# URL: -# For license information, see LICENSE.TXT - - -""" -Corpus reader for the FrameNet 1.7 lexicon and corpus. -""" - -import itertools -import os -import re -import sys -import textwrap -import types -from collections import OrderedDict, defaultdict -from itertools import zip_longest -from operator import itemgetter -from pprint import pprint - -from nltk.corpus.reader import XMLCorpusReader, XMLCorpusView -from nltk.util import LazyConcatenation, LazyIteratorList, LazyMap - -__docformat__ = "epytext en" - - -def mimic_wrap(lines, wrap_at=65, **kwargs): - """ - Wrap the first of 'lines' with textwrap and the remaining lines at exactly the same - positions as the first. - """ - l0 = textwrap.fill(lines[0], wrap_at, drop_whitespace=False).split("\n") - yield l0 - - def _(line): - il0 = 0 - while line and il0 < len(l0) - 1: - yield line[: len(l0[il0])] - line = line[len(l0[il0]) :] - il0 += 1 - if line: # Remaining stuff on this line past the end of the mimicked line. - # So just textwrap this line. - yield from textwrap.fill(line, wrap_at, drop_whitespace=False).split("\n") - - for l in lines[1:]: - yield list(_(l)) - - -def _pretty_longstring(defstr, prefix="", wrap_at=65): - - """ - Helper function for pretty-printing a long string. - - :param defstr: The string to be printed. - :type defstr: str - :return: A nicely formatted string representation of the long string. - :rtype: str - """ - - outstr = "" - for line in textwrap.fill(defstr, wrap_at).split("\n"): - outstr += prefix + line + "\n" - return outstr - - -def _pretty_any(obj): - - """ - Helper function for pretty-printing any AttrDict object. - - :param obj: The obj to be printed. - :type obj: AttrDict - :return: A nicely formatted string representation of the AttrDict object. - :rtype: str - """ - - outstr = "" - for k in obj: - if isinstance(obj[k], str) and len(obj[k]) > 65: - outstr += f"[{k}]\n" - outstr += "{}".format(_pretty_longstring(obj[k], prefix=" ")) - outstr += "\n" - else: - outstr += f"[{k}] {obj[k]}\n" - - return outstr - - -def _pretty_semtype(st): - - """ - Helper function for pretty-printing a semantic type. - - :param st: The semantic type to be printed. - :type st: AttrDict - :return: A nicely formatted string representation of the semantic type. 
- :rtype: str - """ - - semkeys = st.keys() - if len(semkeys) == 1: - return "" - - outstr = "" - outstr += "semantic type ({0.ID}): {0.name}\n".format(st) - if "abbrev" in semkeys: - outstr += f"[abbrev] {st.abbrev}\n" - if "definition" in semkeys: - outstr += "[definition]\n" - outstr += _pretty_longstring(st.definition, " ") - outstr += f"[rootType] {st.rootType.name}({st.rootType.ID})\n" - if st.superType is None: - outstr += "[superType] \n" - else: - outstr += f"[superType] {st.superType.name}({st.superType.ID})\n" - outstr += f"[subTypes] {len(st.subTypes)} subtypes\n" - outstr += ( - " " - + ", ".join(f"{x.name}({x.ID})" for x in st.subTypes) - + "\n" * (len(st.subTypes) > 0) - ) - return outstr - - -def _pretty_frame_relation_type(freltyp): - - """ - Helper function for pretty-printing a frame relation type. - - :param freltyp: The frame relation type to be printed. - :type freltyp: AttrDict - :return: A nicely formatted string representation of the frame relation type. - :rtype: str - """ - outstr = " {0.subFrameName}>".format( - freltyp - ) - return outstr - - -def _pretty_frame_relation(frel): - - """ - Helper function for pretty-printing a frame relation. - - :param frel: The frame relation to be printed. - :type frel: AttrDict - :return: A nicely formatted string representation of the frame relation. - :rtype: str - """ - outstr = "<{0.type.superFrameName}={0.superFrameName} -- {0.type.name} -> {0.type.subFrameName}={0.subFrameName}>".format( - frel - ) - return outstr - - -def _pretty_fe_relation(ferel): - - """ - Helper function for pretty-printing an FE relation. - - :param ferel: The FE relation to be printed. - :type ferel: AttrDict - :return: A nicely formatted string representation of the FE relation. - :rtype: str - """ - outstr = "<{0.type.superFrameName}={0.frameRelation.superFrameName}.{0.superFEName} -- {0.type.name} -> {0.type.subFrameName}={0.frameRelation.subFrameName}.{0.subFEName}>".format( - ferel - ) - return outstr - - -def _pretty_lu(lu): - - """ - Helper function for pretty-printing a lexical unit. - - :param lu: The lu to be printed. - :type lu: AttrDict - :return: A nicely formatted string representation of the lexical unit. 
- :rtype: str - """ - - lukeys = lu.keys() - outstr = "" - outstr += "lexical unit ({0.ID}): {0.name}\n\n".format(lu) - if "definition" in lukeys: - outstr += "[definition]\n" - outstr += _pretty_longstring(lu.definition, " ") - if "frame" in lukeys: - outstr += f"\n[frame] {lu.frame.name}({lu.frame.ID})\n" - if "incorporatedFE" in lukeys: - outstr += f"\n[incorporatedFE] {lu.incorporatedFE}\n" - if "POS" in lukeys: - outstr += f"\n[POS] {lu.POS}\n" - if "status" in lukeys: - outstr += f"\n[status] {lu.status}\n" - if "totalAnnotated" in lukeys: - outstr += f"\n[totalAnnotated] {lu.totalAnnotated} annotated examples\n" - if "lexemes" in lukeys: - outstr += "\n[lexemes] {}\n".format( - " ".join(f"{lex.name}/{lex.POS}" for lex in lu.lexemes) - ) - if "semTypes" in lukeys: - outstr += f"\n[semTypes] {len(lu.semTypes)} semantic types\n" - outstr += ( - " " * (len(lu.semTypes) > 0) - + ", ".join(f"{x.name}({x.ID})" for x in lu.semTypes) - + "\n" * (len(lu.semTypes) > 0) - ) - if "URL" in lukeys: - outstr += f"\n[URL] {lu.URL}\n" - if "subCorpus" in lukeys: - subc = [x.name for x in lu.subCorpus] - outstr += f"\n[subCorpus] {len(lu.subCorpus)} subcorpora\n" - for line in textwrap.fill(", ".join(sorted(subc)), 60).split("\n"): - outstr += f" {line}\n" - if "exemplars" in lukeys: - outstr += "\n[exemplars] {} sentences across all subcorpora\n".format( - len(lu.exemplars) - ) - - return outstr - - -def _pretty_exemplars(exemplars, lu): - """ - Helper function for pretty-printing a list of exemplar sentences for a lexical unit. - - :param sent: The list of exemplar sentences to be printed. - :type sent: list(AttrDict) - :return: An index of the text of the exemplar sentences. - :rtype: str - """ - - outstr = "" - outstr += "exemplar sentences for {0.name} in {0.frame.name}:\n\n".format(lu) - for i, sent in enumerate(exemplars): - outstr += f"[{i}] {sent.text}\n" - outstr += "\n" - return outstr - - -def _pretty_fulltext_sentences(sents): - """ - Helper function for pretty-printing a list of annotated sentences for a full-text document. - - :param sent: The list of sentences to be printed. - :type sent: list(AttrDict) - :return: An index of the text of the sentences. - :rtype: str - """ - - outstr = "" - outstr += "full-text document ({0.ID}) {0.name}:\n\n".format(sents) - outstr += "[corpid] {0.corpid}\n[corpname] {0.corpname}\n[description] {0.description}\n[URL] {0.URL}\n\n".format( - sents - ) - outstr += f"[sentence]\n" - for i, sent in enumerate(sents.sentence): - outstr += f"[{i}] {sent.text}\n" - outstr += "\n" - return outstr - - -def _pretty_fulltext_sentence(sent): - """ - Helper function for pretty-printing an annotated sentence from a full-text document. - - :param sent: The sentence to be printed. - :type sent: list(AttrDict) - :return: The text of the sentence with annotation set indices on frame targets. - :rtype: str - """ - - outstr = "" - outstr += "full-text sentence ({0.ID}) in {1}:\n\n".format( - sent, sent.doc.get("name", sent.doc.description) - ) - outstr += f"\n[POS] {len(sent.POS)} tags\n" - outstr += f"\n[POS_tagset] {sent.POS_tagset}\n\n" - outstr += "[text] + [annotationSet]\n\n" - outstr += sent._ascii() # -> _annotation_ascii() - outstr += "\n" - return outstr - - -def _pretty_pos(aset): - """ - Helper function for pretty-printing a sentence with its POS tags. - - :param aset: The POS annotation set of the sentence to be printed. - :type sent: list(AttrDict) - :return: The text of the sentence and its POS tags. 
- :rtype: str - """ - - outstr = "" - outstr += "POS annotation set ({0.ID}) {0.POS_tagset} in sentence {0.sent.ID}:\n\n".format( - aset - ) - - # list the target spans and their associated aset index - overt = sorted(aset.POS) - - sent = aset.sent - s0 = sent.text - s1 = "" - s2 = "" - i = 0 - adjust = 0 - for j, k, lbl in overt: - assert j >= i, ("Overlapping targets?", (j, k, lbl)) - s1 += " " * (j - i) + "-" * (k - j) - if len(lbl) > (k - j): - # add space in the sentence to make room for the annotation index - amt = len(lbl) - (k - j) - s0 = ( - s0[: k + adjust] + "~" * amt + s0[k + adjust :] - ) # '~' to prevent line wrapping - s1 = s1[: k + adjust] + " " * amt + s1[k + adjust :] - adjust += amt - s2 += " " * (j - i) + lbl.ljust(k - j) - i = k - - long_lines = [s0, s1, s2] - - outstr += "\n\n".join( - map("\n".join, zip_longest(*mimic_wrap(long_lines), fillvalue=" ")) - ).replace("~", " ") - outstr += "\n" - return outstr - - -def _pretty_annotation(sent, aset_level=False): - """ - Helper function for pretty-printing an exemplar sentence for a lexical unit. - - :param sent: An annotation set or exemplar sentence to be printed. - :param aset_level: If True, 'sent' is actually an annotation set within a sentence. - :type sent: AttrDict - :return: A nicely formatted string representation of the exemplar sentence - with its target, frame, and FE annotations. - :rtype: str - """ - - sentkeys = sent.keys() - outstr = "annotation set" if aset_level else "exemplar sentence" - outstr += f" ({sent.ID}):\n" - if aset_level: # TODO: any UNANN exemplars? - outstr += f"\n[status] {sent.status}\n" - for k in ("corpID", "docID", "paragNo", "sentNo", "aPos"): - if k in sentkeys: - outstr += f"[{k}] {sent[k]}\n" - outstr += ( - "\n[LU] ({0.ID}) {0.name} in {0.frame.name}\n".format(sent.LU) - if sent.LU - else "\n[LU] Not found!" - ) - outstr += "\n[frame] ({0.ID}) {0.name}\n".format( - sent.frame - ) # redundant with above, but .frame is convenient - if not aset_level: - outstr += "\n[annotationSet] {} annotation sets\n".format( - len(sent.annotationSet) - ) - outstr += f"\n[POS] {len(sent.POS)} tags\n" - outstr += f"\n[POS_tagset] {sent.POS_tagset}\n" - outstr += "\n[GF] {} relation{}\n".format( - len(sent.GF), "s" if len(sent.GF) != 1 else "" - ) - outstr += "\n[PT] {} phrase{}\n".format( - len(sent.PT), "s" if len(sent.PT) != 1 else "" - ) - """ - Special Layers - -------------- - - The 'NER' layer contains, for some of the data, named entity labels. - - The 'WSL' (word status layer) contains, for some of the data, - spans which should not in principle be considered targets (NT). - - The 'Other' layer records relative clause constructions (Rel=relativizer, Ant=antecedent), - pleonastic 'it' (Null), and existential 'there' (Exist). - On occasion they are duplicated by accident (e.g., annotationSet 1467275 in lu6700.xml). - - The 'Sent' layer appears to contain labels that the annotator has flagged the - sentence with for their convenience: values include - 'sense1', 'sense2', 'sense3', etc.; - 'Blend', 'Canonical', 'Idiom', 'Metaphor', 'Special-Sent', - 'keepS', 'deleteS', 'reexamine' - (sometimes they are duplicated for no apparent reason). - - The POS-specific layers may contain the following kinds of spans: - Asp (aspectual particle), Non-Asp (non-aspectual particle), - Cop (copula), Supp (support), Ctrlr (controller), - Gov (governor), X. Gov and X always cooccur. - - >>> from nltk.corpus import framenet as fn - >>> def f(luRE, lyr, ignore=set()): - ... 
for i,ex in enumerate(fn.exemplars(luRE)): - ... if lyr in ex and ex[lyr] and set(zip(*ex[lyr])[2]) - ignore: - ... print(i,ex[lyr]) - - - Verb: Asp, Non-Asp - - Noun: Cop, Supp, Ctrlr, Gov, X - - Adj: Cop, Supp, Ctrlr, Gov, X - - Prep: Cop, Supp, Ctrlr - - Adv: Ctrlr - - Scon: (none) - - Art: (none) - """ - for lyr in ("NER", "WSL", "Other", "Sent"): - if lyr in sent and sent[lyr]: - outstr += "\n[{}] {} entr{}\n".format( - lyr, len(sent[lyr]), "ies" if len(sent[lyr]) != 1 else "y" - ) - outstr += "\n[text] + [Target] + [FE]" - # POS-specific layers: syntactically important words that are neither the target - # nor the FEs. Include these along with the first FE layer but with '^' underlining. - for lyr in ("Verb", "Noun", "Adj", "Adv", "Prep", "Scon", "Art"): - if lyr in sent and sent[lyr]: - outstr += f" + [{lyr}]" - if "FE2" in sentkeys: - outstr += " + [FE2]" - if "FE3" in sentkeys: - outstr += " + [FE3]" - outstr += "\n\n" - outstr += sent._ascii() # -> _annotation_ascii() - outstr += "\n" - - return outstr - - -def _annotation_ascii(sent): - """ - Given a sentence or FE annotation set, construct the width-limited string showing - an ASCII visualization of the sentence's annotations, calling either - _annotation_ascii_frames() or _annotation_ascii_FEs() as appropriate. - This will be attached as a method to appropriate AttrDict instances - and called in the full pretty-printing of the instance. - """ - if sent._type == "fulltext_sentence" or ( - "annotationSet" in sent and len(sent.annotationSet) > 2 - ): - # a full-text sentence OR sentence with multiple targets. - # (multiple targets = >2 annotation sets, because the first annotation set is POS.) - return _annotation_ascii_frames(sent) - else: # an FE annotation set, or an LU sentence with 1 target - return _annotation_ascii_FEs(sent) - - -def _annotation_ascii_frames(sent): - """ - ASCII string rendering of the sentence along with its targets and frame names. - Called for all full-text sentences, as well as the few LU sentences with multiple - targets (e.g., fn.lu(6412).exemplars[82] has two want.v targets). - Line-wrapped to limit the display width. - """ - # list the target spans and their associated aset index - overt = [] - for a, aset in enumerate(sent.annotationSet[1:]): - for j, k in aset.Target: - indexS = f"[{a + 1}]" - if aset.status == "UNANN" or aset.LU.status == "Problem": - indexS += " " - if aset.status == "UNANN": - indexS += "!" # warning indicator that there is a frame annotation but no FE annotation - if aset.LU.status == "Problem": - indexS += "?" # warning indicator that there is a missing LU definition (because the LU has Problem status) - overt.append((j, k, aset.LU.frame.name, indexS)) - overt = sorted(overt) - - duplicates = set() - for o, (j, k, fname, asetIndex) in enumerate(overt): - if o > 0 and j <= overt[o - 1][1]: - # multiple annotation sets on the same target - # (e.g. due to a coordination construction or multiple annotators) - if ( - overt[o - 1][:2] == (j, k) and overt[o - 1][2] == fname - ): # same target, same frame - # splice indices together - combinedIndex = ( - overt[o - 1][3] + asetIndex - ) # e.g., '[1][2]', '[1]! [2]' - combinedIndex = combinedIndex.replace(" !", "! ").replace(" ?", "? 
") - overt[o - 1] = overt[o - 1][:3] + (combinedIndex,) - duplicates.add(o) - else: # different frames, same or overlapping targets - s = sent.text - for j, k, fname, asetIndex in overt: - s += "\n" + asetIndex + " " + sent.text[j:k] + " :: " + fname - s += "\n(Unable to display sentence with targets marked inline due to overlap)" - return s - for o in reversed(sorted(duplicates)): - del overt[o] - - s0 = sent.text - s1 = "" - s11 = "" - s2 = "" - i = 0 - adjust = 0 - fAbbrevs = OrderedDict() - for j, k, fname, asetIndex in overt: - if not j >= i: - assert j >= i, ( - "Overlapping targets?" - + ( - " UNANN" - if any(aset.status == "UNANN" for aset in sent.annotationSet[1:]) - else "" - ), - (j, k, asetIndex), - ) - s1 += " " * (j - i) + "*" * (k - j) - short = fname[: k - j] - if (k - j) < len(fname): - r = 0 - while short in fAbbrevs: - if fAbbrevs[short] == fname: - break - r += 1 - short = fname[: k - j - 1] + str(r) - else: # short not in fAbbrevs - fAbbrevs[short] = fname - s11 += " " * (j - i) + short.ljust(k - j) - if len(asetIndex) > (k - j): - # add space in the sentence to make room for the annotation index - amt = len(asetIndex) - (k - j) - s0 = ( - s0[: k + adjust] + "~" * amt + s0[k + adjust :] - ) # '~' to prevent line wrapping - s1 = s1[: k + adjust] + " " * amt + s1[k + adjust :] - s11 = s11[: k + adjust] + " " * amt + s11[k + adjust :] - adjust += amt - s2 += " " * (j - i) + asetIndex.ljust(k - j) - i = k - - long_lines = [s0, s1, s11, s2] - - outstr = "\n\n".join( - map("\n".join, zip_longest(*mimic_wrap(long_lines), fillvalue=" ")) - ).replace("~", " ") - outstr += "\n" - if fAbbrevs: - outstr += " (" + ", ".join("=".join(pair) for pair in fAbbrevs.items()) + ")" - assert len(fAbbrevs) == len(dict(fAbbrevs)), "Abbreviation clash" - - return outstr - - -def _annotation_ascii_FE_layer(overt, ni, feAbbrevs): - """Helper for _annotation_ascii_FEs().""" - s1 = "" - s2 = "" - i = 0 - for j, k, fename in overt: - s1 += " " * (j - i) + ("^" if fename.islower() else "-") * (k - j) - short = fename[: k - j] - if len(fename) > len(short): - r = 0 - while short in feAbbrevs: - if feAbbrevs[short] == fename: - break - r += 1 - short = fename[: k - j - 1] + str(r) - else: # short not in feAbbrevs - feAbbrevs[short] = fename - s2 += " " * (j - i) + short.ljust(k - j) - i = k - - sNI = "" - if ni: - sNI += " [" + ", ".join(":".join(x) for x in sorted(ni.items())) + "]" - return [s1, s2, sNI] - - -def _annotation_ascii_FEs(sent): - """ - ASCII string rendering of the sentence along with a single target and its FEs. - Secondary and tertiary FE layers are included if present. - 'sent' can be an FE annotation set or an LU sentence with a single target. - Line-wrapped to limit the display width. - """ - feAbbrevs = OrderedDict() - posspec = [] # POS-specific layer spans (e.g., Supp[ort], Cop[ula]) - posspec_separate = False - for lyr in ("Verb", "Noun", "Adj", "Adv", "Prep", "Scon", "Art"): - if lyr in sent and sent[lyr]: - for a, b, lbl in sent[lyr]: - if ( - lbl == "X" - ): # skip this, which covers an entire phrase typically containing the target and all its FEs - # (but do display the Gov) - continue - if any(1 for x, y, felbl in sent.FE[0] if x <= a < y or a <= x < b): - # overlap between one of the POS-specific layers and first FE layer - posspec_separate = ( - True # show POS-specific layers on a separate line - ) - posspec.append( - (a, b, lbl.lower().replace("-", "")) - ) # lowercase Cop=>cop, Non-Asp=>nonasp, etc. 
to distinguish from FE names - if posspec_separate: - POSSPEC = _annotation_ascii_FE_layer(posspec, {}, feAbbrevs) - FE1 = _annotation_ascii_FE_layer( - sorted(sent.FE[0] + (posspec if not posspec_separate else [])), - sent.FE[1], - feAbbrevs, - ) - FE2 = FE3 = None - if "FE2" in sent: - FE2 = _annotation_ascii_FE_layer(sent.FE2[0], sent.FE2[1], feAbbrevs) - if "FE3" in sent: - FE3 = _annotation_ascii_FE_layer(sent.FE3[0], sent.FE3[1], feAbbrevs) - - for i, j in sent.Target: - FE1span, FE1name, FE1exp = FE1 - if len(FE1span) < j: - FE1span += " " * (j - len(FE1span)) - if len(FE1name) < j: - FE1name += " " * (j - len(FE1name)) - FE1[1] = FE1name - FE1[0] = ( - FE1span[:i] + FE1span[i:j].replace(" ", "*").replace("-", "=") + FE1span[j:] - ) - long_lines = [sent.text] - if posspec_separate: - long_lines.extend(POSSPEC[:2]) - long_lines.extend([FE1[0], FE1[1] + FE1[2]]) # lines with no length limit - if FE2: - long_lines.extend([FE2[0], FE2[1] + FE2[2]]) - if FE3: - long_lines.extend([FE3[0], FE3[1] + FE3[2]]) - long_lines.append("") - outstr = "\n".join( - map("\n".join, zip_longest(*mimic_wrap(long_lines), fillvalue=" ")) - ) - if feAbbrevs: - outstr += "(" + ", ".join("=".join(pair) for pair in feAbbrevs.items()) + ")" - assert len(feAbbrevs) == len(dict(feAbbrevs)), "Abbreviation clash" - outstr += "\n" - - return outstr - - -def _pretty_fe(fe): - - """ - Helper function for pretty-printing a frame element. - - :param fe: The frame element to be printed. - :type fe: AttrDict - :return: A nicely formatted string representation of the frame element. - :rtype: str - """ - fekeys = fe.keys() - outstr = "" - outstr += "frame element ({0.ID}): {0.name}\n of {1.name}({1.ID})\n".format( - fe, fe.frame - ) - if "definition" in fekeys: - outstr += "[definition]\n" - outstr += _pretty_longstring(fe.definition, " ") - if "abbrev" in fekeys: - outstr += f"[abbrev] {fe.abbrev}\n" - if "coreType" in fekeys: - outstr += f"[coreType] {fe.coreType}\n" - if "requiresFE" in fekeys: - outstr += "[requiresFE] " - if fe.requiresFE is None: - outstr += "\n" - else: - outstr += f"{fe.requiresFE.name}({fe.requiresFE.ID})\n" - if "excludesFE" in fekeys: - outstr += "[excludesFE] " - if fe.excludesFE is None: - outstr += "\n" - else: - outstr += f"{fe.excludesFE.name}({fe.excludesFE.ID})\n" - if "semType" in fekeys: - outstr += "[semType] " - if fe.semType is None: - outstr += "\n" - else: - outstr += "\n " + f"{fe.semType.name}({fe.semType.ID})" + "\n" - - return outstr - - -def _pretty_frame(frame): - - """ - Helper function for pretty-printing a frame. - - :param frame: The frame to be printed. - :type frame: AttrDict - :return: A nicely formatted string representation of the frame. 
- :rtype: str - """ - - outstr = "" - outstr += "frame ({0.ID}): {0.name}\n\n".format(frame) - outstr += f"[URL] {frame.URL}\n\n" - outstr += "[definition]\n" - outstr += _pretty_longstring(frame.definition, " ") + "\n" - - outstr += f"[semTypes] {len(frame.semTypes)} semantic types\n" - outstr += ( - " " * (len(frame.semTypes) > 0) - + ", ".join(f"{x.name}({x.ID})" for x in frame.semTypes) - + "\n" * (len(frame.semTypes) > 0) - ) - - outstr += "\n[frameRelations] {} frame relations\n".format( - len(frame.frameRelations) - ) - outstr += " " + "\n ".join(repr(frel) for frel in frame.frameRelations) + "\n" - - outstr += f"\n[lexUnit] {len(frame.lexUnit)} lexical units\n" - lustrs = [] - for luName, lu in sorted(frame.lexUnit.items()): - tmpstr = f"{luName} ({lu.ID})" - lustrs.append(tmpstr) - outstr += "{}\n".format(_pretty_longstring(", ".join(lustrs), prefix=" ")) - - outstr += f"\n[FE] {len(frame.FE)} frame elements\n" - fes = {} - for feName, fe in sorted(frame.FE.items()): - try: - fes[fe.coreType].append(f"{feName} ({fe.ID})") - except KeyError: - fes[fe.coreType] = [] - fes[fe.coreType].append(f"{feName} ({fe.ID})") - for ct in sorted( - fes.keys(), - key=lambda ct2: [ - "Core", - "Core-Unexpressed", - "Peripheral", - "Extra-Thematic", - ].index(ct2), - ): - outstr += "{:>16}: {}\n".format(ct, ", ".join(sorted(fes[ct]))) - - outstr += "\n[FEcoreSets] {} frame element core sets\n".format( - len(frame.FEcoreSets) - ) - outstr += ( - " " - + "\n ".join( - ", ".join([x.name for x in coreSet]) for coreSet in frame.FEcoreSets - ) - + "\n" - ) - - return outstr - - -class FramenetError(Exception): - - """An exception class for framenet-related errors.""" - - -class AttrDict(dict): - - """A class that wraps a dict and allows accessing the keys of the - dict as if they were attributes. 
Taken from here: - https://stackoverflow.com/a/14620633/8879 - - >>> foo = {'a':1, 'b':2, 'c':3} - >>> bar = AttrDict(foo) - >>> pprint(dict(bar)) - {'a': 1, 'b': 2, 'c': 3} - >>> bar.b - 2 - >>> bar.d = 4 - >>> pprint(dict(bar)) - {'a': 1, 'b': 2, 'c': 3, 'd': 4} - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # self.__dict__ = self - - def __setattr__(self, name, value): - self[name] = value - - def __getattr__(self, name): - if name == "_short_repr": - return self._short_repr - return self[name] - - def __getitem__(self, name): - v = super().__getitem__(name) - if isinstance(v, Future): - return v._data() - return v - - def _short_repr(self): - if "_type" in self: - if self["_type"].endswith("relation"): - return self.__repr__() - try: - return "<{} ID={} name={}>".format( - self["_type"], self["ID"], self["name"] - ) - except KeyError: - try: # no ID--e.g., for _type=lusubcorpus - return "<{} name={}>".format(self["_type"], self["name"]) - except KeyError: # no name--e.g., for _type=lusentence - return "<{} ID={}>".format(self["_type"], self["ID"]) - else: - return self.__repr__() - - def _str(self): - outstr = "" - - if "_type" not in self: - outstr = _pretty_any(self) - elif self["_type"] == "frame": - outstr = _pretty_frame(self) - elif self["_type"] == "fe": - outstr = _pretty_fe(self) - elif self["_type"] == "lu": - outstr = _pretty_lu(self) - elif self["_type"] == "luexemplars": # list of ALL exemplars for LU - outstr = _pretty_exemplars(self, self[0].LU) - elif ( - self["_type"] == "fulltext_annotation" - ): # list of all sentences for full-text doc - outstr = _pretty_fulltext_sentences(self) - elif self["_type"] == "lusentence": - outstr = _pretty_annotation(self) - elif self["_type"] == "fulltext_sentence": - outstr = _pretty_fulltext_sentence(self) - elif self["_type"] in ("luannotationset", "fulltext_annotationset"): - outstr = _pretty_annotation(self, aset_level=True) - elif self["_type"] == "posannotationset": - outstr = _pretty_pos(self) - elif self["_type"] == "semtype": - outstr = _pretty_semtype(self) - elif self["_type"] == "framerelationtype": - outstr = _pretty_frame_relation_type(self) - elif self["_type"] == "framerelation": - outstr = _pretty_frame_relation(self) - elif self["_type"] == "ferelation": - outstr = _pretty_fe_relation(self) - else: - outstr = _pretty_any(self) - - # ensure result is unicode string prior to applying the - # decorator (because non-ASCII characters - # could in principle occur in the data and would trigger an encoding error when - # passed as arguments to str.format()). - # assert isinstance(outstr, unicode) # not in Python 3.2 - return outstr - - def __str__(self): - return self._str() - - def __repr__(self): - return self.__str__() - - -class SpecialList(list): - """ - A list subclass which adds a '_type' attribute for special printing - (similar to an AttrDict, though this is NOT an AttrDict subclass). - """ - - def __init__(self, typ, *args, **kwargs): - super().__init__(*args, **kwargs) - self._type = typ - - def _str(self): - outstr = "" - - assert self._type - if len(self) == 0: - outstr = "[]" - elif self._type == "luexemplars": # list of ALL exemplars for LU - outstr = _pretty_exemplars(self, self[0].LU) - else: - assert False, self._type - return outstr - - def __str__(self): - return self._str() - - def __repr__(self): - return self.__str__() - - -class Future: - """ - Wraps and acts as a proxy for a value to be loaded lazily (on demand). 
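A minimal sketch of the laziness this class provides, assuming the same Future class from upstream NLTK and a hypothetical loader: the wrapped callable runs once on first access and the result is cached, which is how _buildrelationindex() further below defers frame_by_id() lookups until a related frame is actually touched (AttrDict.__getitem__ above transparently unwraps such values).

from nltk.corpus.reader.framenet import Future   # same class in upstream NLTK

def load_frame():                                # hypothetical expensive loader
    print("loading frame ...")
    return {"ID": 256, "name": "Medical_specialties"}

f = Future(load_frame)    # nothing is loaded yet
print(f["name"])          # first access runs load_frame() once, then prints the name
print(f["ID"])            # later accesses reuse the cached dict; no second "loading" message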
- Adapted from https://gist.github.com/sergey-miryanov/2935416 - """ - - def __init__(self, loader, *args, **kwargs): - """ - :param loader: when called with no arguments, returns the value to be stored - :type loader: callable - """ - super().__init__(*args, **kwargs) - self._loader = loader - self._d = None - - def _data(self): - if callable(self._loader): - self._d = self._loader() - self._loader = None # the data is now cached - return self._d - - def __nonzero__(self): - return bool(self._data()) - - def __len__(self): - return len(self._data()) - - def __setitem__(self, key, value): - return self._data().__setitem__(key, value) - - def __getitem__(self, key): - return self._data().__getitem__(key) - - def __getattr__(self, key): - return self._data().__getattr__(key) - - def __str__(self): - return self._data().__str__() - - def __repr__(self): - return self._data().__repr__() - - -class PrettyDict(AttrDict): - """ - Displays an abbreviated repr of values where possible. - Inherits from AttrDict, so a callable value will - be lazily converted to an actual value. - """ - - def __init__(self, *args, **kwargs): - _BREAK_LINES = kwargs.pop("breakLines", False) - super().__init__(*args, **kwargs) - dict.__setattr__(self, "_BREAK_LINES", _BREAK_LINES) - - def __repr__(self): - parts = [] - for k, v in sorted(self.items()): - kv = repr(k) + ": " - try: - kv += v._short_repr() - except AttributeError: - kv += repr(v) - parts.append(kv) - return "{" + (",\n " if self._BREAK_LINES else ", ").join(parts) + "}" - - -class PrettyList(list): - """ - Displays an abbreviated repr of only the first several elements, not the whole list. - """ - - # from nltk.util - def __init__(self, *args, **kwargs): - self._MAX_REPR_SIZE = kwargs.pop("maxReprSize", 60) - self._BREAK_LINES = kwargs.pop("breakLines", False) - super().__init__(*args, **kwargs) - - def __repr__(self): - """ - Return a string representation for this corpus view that is - similar to a list's representation; but if it would be more - than 60 characters long, it is truncated. - """ - pieces = [] - length = 5 - - for elt in self: - pieces.append( - elt._short_repr() - ) # key difference from inherited version: call to _short_repr() - length += len(pieces[-1]) + 2 - if self._MAX_REPR_SIZE and length > self._MAX_REPR_SIZE and len(pieces) > 2: - return "[%s, ...]" % str(",\n " if self._BREAK_LINES else ", ").join( - pieces[:-1] - ) - return "[%s]" % str(",\n " if self._BREAK_LINES else ", ").join(pieces) - - -class PrettyLazyMap(LazyMap): - """ - Displays an abbreviated repr of only the first several elements, not the whole list. - """ - - # from nltk.util - _MAX_REPR_SIZE = 60 - - def __repr__(self): - """ - Return a string representation for this corpus view that is - similar to a list's representation; but if it would be more - than 60 characters long, it is truncated. - """ - pieces = [] - length = 5 - for elt in self: - pieces.append( - elt._short_repr() - ) # key difference from inherited version: call to _short_repr() - length += len(pieces[-1]) + 2 - if length > self._MAX_REPR_SIZE and len(pieces) > 2: - return "[%s, ...]" % ", ".join(pieces[:-1]) - return "[%s]" % ", ".join(pieces) - - -class PrettyLazyIteratorList(LazyIteratorList): - """ - Displays an abbreviated repr of only the first several elements, not the whole list. 
- """ - - # from nltk.util - _MAX_REPR_SIZE = 60 - - def __repr__(self): - """ - Return a string representation for this corpus view that is - similar to a list's representation; but if it would be more - than 60 characters long, it is truncated. - """ - pieces = [] - length = 5 - for elt in self: - pieces.append( - elt._short_repr() - ) # key difference from inherited version: call to _short_repr() - length += len(pieces[-1]) + 2 - if length > self._MAX_REPR_SIZE and len(pieces) > 2: - return "[%s, ...]" % ", ".join(pieces[:-1]) - return "[%s]" % ", ".join(pieces) - - -class PrettyLazyConcatenation(LazyConcatenation): - """ - Displays an abbreviated repr of only the first several elements, not the whole list. - """ - - # from nltk.util - _MAX_REPR_SIZE = 60 - - def __repr__(self): - """ - Return a string representation for this corpus view that is - similar to a list's representation; but if it would be more - than 60 characters long, it is truncated. - """ - pieces = [] - length = 5 - for elt in self: - pieces.append( - elt._short_repr() - ) # key difference from inherited version: call to _short_repr() - length += len(pieces[-1]) + 2 - if length > self._MAX_REPR_SIZE and len(pieces) > 2: - return "[%s, ...]" % ", ".join(pieces[:-1]) - return "[%s]" % ", ".join(pieces) - - def __add__(self, other): - """Return a list concatenating self with other.""" - return PrettyLazyIteratorList(itertools.chain(self, other)) - - def __radd__(self, other): - """Return a list concatenating other with self.""" - return PrettyLazyIteratorList(itertools.chain(other, self)) - - -class FramenetCorpusReader(XMLCorpusReader): - """A corpus reader for the Framenet Corpus. - - >>> from nltk.corpus import framenet as fn - >>> fn.lu(3238).frame.lexUnit['glint.v'] is fn.lu(3238) - True - >>> fn.frame_by_name('Replacing') is fn.lus('replace.v')[0].frame - True - >>> fn.lus('prejudice.n')[0].frame.frameRelations == fn.frame_relations('Partiality') - True - """ - - _bad_statuses = ["Problem"] - """ - When loading LUs for a frame, those whose status is in this list will be ignored. - Due to caching, if user code modifies this, it should do so before loading any data. - 'Problem' should always be listed for FrameNet 1.5, as these LUs are not included - in the XML index. - """ - - _warnings = False - - def warnings(self, v): - """Enable or disable warnings of data integrity issues as they are encountered. - If v is truthy, warnings will be enabled. - - (This is a function rather than just an attribute/property to ensure that if - enabling warnings is the first action taken, the corpus reader is instantiated first.) - """ - self._warnings = v - - def __init__(self, root, fileids): - XMLCorpusReader.__init__(self, root, fileids) - - # framenet corpus sub dirs - # sub dir containing the xml files for frames - self._frame_dir = "frame" - # sub dir containing the xml files for lexical units - self._lu_dir = "lu" - # sub dir containing the xml files for fulltext annotation files - self._fulltext_dir = "fulltext" - - # location of latest development version of FrameNet - self._fnweb_url = "https://framenet2.icsi.berkeley.edu/fnReports/data" - - # Indexes used for faster look-ups - self._frame_idx = None - self._cached_frames = {} # name -> ID - self._lu_idx = None - self._fulltext_idx = None - self._semtypes = None - self._freltyp_idx = None # frame relation types (Inheritance, Using, etc.) 
- self._frel_idx = None # frame-to-frame relation instances - self._ferel_idx = None # FE-to-FE relation instances - self._frel_f_idx = None # frame-to-frame relations associated with each frame - - self._readme = "README.txt" - - def help(self, attrname=None): - """Display help information summarizing the main methods.""" - - if attrname is not None: - return help(self.__getattribute__(attrname)) - - # No need to mention frame_by_name() or frame_by_id(), - # as it's easier to just call frame(). - # Also not mentioning lu_basic(). - - msg = """ -Citation: Nathan Schneider and Chuck Wooters (2017), -"The NLTK FrameNet API: Designing for Discoverability with a Rich Linguistic Resource". -Proceedings of EMNLP: System Demonstrations. https://arxiv.org/abs/1703.07438 - -Use the following methods to access data in FrameNet. -Provide a method name to `help()` for more information. - -FRAMES -====== - -frame() to look up a frame by its exact name or ID -frames() to get frames matching a name pattern -frames_by_lemma() to get frames containing an LU matching a name pattern -frame_ids_and_names() to get a mapping from frame IDs to names - -FRAME ELEMENTS -============== - -fes() to get frame elements (a.k.a. roles) matching a name pattern, optionally constrained - by a frame name pattern - -LEXICAL UNITS -============= - -lu() to look up an LU by its ID -lus() to get lexical units matching a name pattern, optionally constrained by frame -lu_ids_and_names() to get a mapping from LU IDs to names - -RELATIONS -========= - -frame_relation_types() to get the different kinds of frame-to-frame relations - (Inheritance, Subframe, Using, etc.). -frame_relations() to get the relation instances, optionally constrained by - frame(s) or relation type -fe_relations() to get the frame element pairs belonging to a frame-to-frame relation - -SEMANTIC TYPES -============== - -semtypes() to get the different kinds of semantic types that can be applied to - FEs, LUs, and entire frames -semtype() to look up a particular semtype by name, ID, or abbreviation -semtype_inherits() to check whether two semantic types have a subtype-supertype - relationship in the semtype hierarchy -propagate_semtypes() to apply inference rules that distribute semtypes over relations - between FEs - -ANNOTATIONS -=========== - -annotations() to get annotation sets, in which a token in a sentence is annotated - with a lexical unit in a frame, along with its frame elements and their syntactic properties; - can be constrained by LU name pattern and limited to lexicographic exemplars or full-text. - Sentences of full-text annotation can have multiple annotation sets. -sents() to get annotated sentences illustrating one or more lexical units -exemplars() to get sentences of lexicographic annotation, most of which have - just 1 annotation set; can be constrained by LU name pattern, frame, and overt FE(s) -doc() to look up a document of full-text annotation by its ID -docs() to get documents of full-text annotation that match a name pattern -docs_metadata() to get metadata about all full-text documents without loading them -ft_sents() to iterate over sentences of full-text annotation - -UTILITIES -========= - -buildindexes() loads metadata about all frames, LUs, etc. into memory to avoid - delay when one is accessed for the first time. It does not load annotations. 
-readme() gives the text of the FrameNet README file -warnings(True) to display corpus consistency warnings when loading data - """ - print(msg) - - def _buildframeindex(self): - # The total number of Frames in Framenet is fairly small (~1200) so - # this index should not be very large - if not self._frel_idx: - self._buildrelationindex() # always load frame relations before frames, - # otherwise weird ordering effects might result in incomplete information - self._frame_idx = {} - with XMLCorpusView( - self.abspath("frameIndex.xml"), "frameIndex/frame", self._handle_elt - ) as view: - for f in view: - self._frame_idx[f["ID"]] = f - - def _buildcorpusindex(self): - # The total number of fulltext annotated documents in Framenet - # is fairly small (~90) so this index should not be very large - self._fulltext_idx = {} - with XMLCorpusView( - self.abspath("fulltextIndex.xml"), - "fulltextIndex/corpus", - self._handle_fulltextindex_elt, - ) as view: - for doclist in view: - for doc in doclist: - self._fulltext_idx[doc.ID] = doc - - def _buildluindex(self): - # The number of LUs in Framenet is about 13,000 so this index - # should not be very large - self._lu_idx = {} - with XMLCorpusView( - self.abspath("luIndex.xml"), "luIndex/lu", self._handle_elt - ) as view: - for lu in view: - self._lu_idx[ - lu["ID"] - ] = lu # populate with LU index entries. if any of these - # are looked up they will be replaced by full LU objects. - - def _buildrelationindex(self): - # print('building relation index...', file=sys.stderr) - self._freltyp_idx = {} - self._frel_idx = {} - self._frel_f_idx = defaultdict(set) - self._ferel_idx = {} - - with XMLCorpusView( - self.abspath("frRelation.xml"), - "frameRelations/frameRelationType", - self._handle_framerelationtype_elt, - ) as view: - for freltyp in view: - self._freltyp_idx[freltyp.ID] = freltyp - for frel in freltyp.frameRelations: - supF = frel.superFrame = frel[freltyp.superFrameName] = Future( - (lambda fID: lambda: self.frame_by_id(fID))(frel.supID) - ) - subF = frel.subFrame = frel[freltyp.subFrameName] = Future( - (lambda fID: lambda: self.frame_by_id(fID))(frel.subID) - ) - self._frel_idx[frel.ID] = frel - self._frel_f_idx[frel.supID].add(frel.ID) - self._frel_f_idx[frel.subID].add(frel.ID) - for ferel in frel.feRelations: - ferel.superFrame = supF - ferel.subFrame = subF - ferel.superFE = Future( - (lambda fer: lambda: fer.superFrame.FE[fer.superFEName])( - ferel - ) - ) - ferel.subFE = Future( - (lambda fer: lambda: fer.subFrame.FE[fer.subFEName])(ferel) - ) - self._ferel_idx[ferel.ID] = ferel - # print('...done building relation index', file=sys.stderr) - - def _warn(self, *message, **kwargs): - if self._warnings: - kwargs.setdefault("file", sys.stderr) - print(*message, **kwargs) - - def buildindexes(self): - """ - Build the internal indexes to make look-ups faster. - """ - # Frames - self._buildframeindex() - # LUs - self._buildluindex() - # Fulltext annotation corpora index - self._buildcorpusindex() - # frame and FE relations - self._buildrelationindex() - - def doc(self, fn_docid): - """ - Returns the annotated document whose id number is - ``fn_docid``. This id number can be obtained by calling the - Documents() function. 
- - The dict that is returned from this function will contain the - following keys: - - - '_type' : 'fulltextannotation' - - 'sentence' : a list of sentences in the document - - Each item in the list is a dict containing the following keys: - - 'ID' : the ID number of the sentence - - '_type' : 'sentence' - - 'text' : the text of the sentence - - 'paragNo' : the paragraph number - - 'sentNo' : the sentence number - - 'docID' : the document ID number - - 'corpID' : the corpus ID number - - 'aPos' : the annotation position - - 'annotationSet' : a list of annotation layers for the sentence - - Each item in the list is a dict containing the following keys: - - 'ID' : the ID number of the annotation set - - '_type' : 'annotationset' - - 'status' : either 'MANUAL' or 'UNANN' - - 'luName' : (only if status is 'MANUAL') - - 'luID' : (only if status is 'MANUAL') - - 'frameID' : (only if status is 'MANUAL') - - 'frameName': (only if status is 'MANUAL') - - 'layer' : a list of labels for the layer - - Each item in the layer is a dict containing the following keys: - - '_type': 'layer' - - 'rank' - - 'name' - - 'label' : a list of labels in the layer - - Each item is a dict containing the following keys: - - 'start' - - 'end' - - 'name' - - 'feID' (optional) - - :param fn_docid: The Framenet id number of the document - :type fn_docid: int - :return: Information about the annotated document - :rtype: dict - """ - try: - xmlfname = self._fulltext_idx[fn_docid].filename - except TypeError: # happens when self._fulltext_idx == None - # build the index - self._buildcorpusindex() - xmlfname = self._fulltext_idx[fn_docid].filename - except KeyError as e: # probably means that fn_docid was not in the index - raise FramenetError(f"Unknown document id: {fn_docid}") from e - - # construct the path name for the xml file containing the document info - locpath = os.path.join(f"{self._root}", self._fulltext_dir, xmlfname) - - # Grab the top-level xml element containing the fulltext annotation - with XMLCorpusView(locpath, "fullTextAnnotation") as view: - elt = view[0] - info = self._handle_fulltextannotation_elt(elt) - # add metadata - for k, v in self._fulltext_idx[fn_docid].items(): - info[k] = v - return info - - def frame_by_id(self, fn_fid, ignorekeys=[]): - """ - Get the details for the specified Frame using the frame's id - number. - - Usage examples: - - >>> from nltk.corpus import framenet as fn - >>> f = fn.frame_by_id(256) - >>> f.ID - 256 - >>> f.name - 'Medical_specialties' - >>> f.definition # doctest: +NORMALIZE_WHITESPACE - "This frame includes words that name medical specialties and is closely related to the - Medical_professionals frame. The FE Type characterizing a sub-are in a Specialty may also be - expressed. 'Ralph practices paediatric oncology.'" - - :param fn_fid: The Framenet id number of the frame - :type fn_fid: int - :param ignorekeys: The keys to ignore. These keys will not be - included in the output. (optional) - :type ignorekeys: list(str) - :return: Information about a frame - :rtype: dict - - Also see the ``frame()`` function for details about what is - contained in the dict that is returned. 
- """ - - # get the name of the frame with this id number - try: - fentry = self._frame_idx[fn_fid] - if "_type" in fentry: - return fentry # full frame object is cached - name = fentry["name"] - except TypeError: - self._buildframeindex() - name = self._frame_idx[fn_fid]["name"] - except KeyError as e: - raise FramenetError(f"Unknown frame id: {fn_fid}") from e - - return self.frame_by_name(name, ignorekeys, check_cache=False) - - def frame_by_name(self, fn_fname, ignorekeys=[], check_cache=True): - """ - Get the details for the specified Frame using the frame's name. - - Usage examples: - - >>> from nltk.corpus import framenet as fn - >>> f = fn.frame_by_name('Medical_specialties') - >>> f.ID - 256 - >>> f.name - 'Medical_specialties' - >>> f.definition # doctest: +NORMALIZE_WHITESPACE - "This frame includes words that name medical specialties and is closely related to the - Medical_professionals frame. The FE Type characterizing a sub-are in a Specialty may also be - expressed. 'Ralph practices paediatric oncology.'" - - :param fn_fname: The name of the frame - :type fn_fname: str - :param ignorekeys: The keys to ignore. These keys will not be - included in the output. (optional) - :type ignorekeys: list(str) - :return: Information about a frame - :rtype: dict - - Also see the ``frame()`` function for details about what is - contained in the dict that is returned. - """ - - if check_cache and fn_fname in self._cached_frames: - return self._frame_idx[self._cached_frames[fn_fname]] - elif not self._frame_idx: - self._buildframeindex() - - # construct the path name for the xml file containing the Frame info - locpath = os.path.join(f"{self._root}", self._frame_dir, fn_fname + ".xml") - # print(locpath, file=sys.stderr) - # Grab the xml for the frame - try: - with XMLCorpusView(locpath, "frame") as view: - elt = view[0] - except OSError as e: - raise FramenetError(f"Unknown frame: {fn_fname}") from e - - fentry = self._handle_frame_elt(elt, ignorekeys) - assert fentry - - fentry.URL = self._fnweb_url + "/" + self._frame_dir + "/" + fn_fname + ".xml" - - # INFERENCE RULE: propagate lexical semtypes from the frame to all its LUs - for st in fentry.semTypes: - if st.rootType.name == "Lexical_type": - for lu in fentry.lexUnit.values(): - if not any( - x is st for x in lu.semTypes - ): # identity containment check - lu.semTypes.append(st) - - self._frame_idx[fentry.ID] = fentry - self._cached_frames[fentry.name] = fentry.ID - """ - # now set up callables to resolve the LU pointers lazily. - # (could also do this here--caching avoids infinite recursion.) - for luName,luinfo in fentry.lexUnit.items(): - fentry.lexUnit[luName] = (lambda luID: Future(lambda: self.lu(luID)))(luinfo.ID) - """ - return fentry - - def frame(self, fn_fid_or_fname, ignorekeys=[]): - """ - Get the details for the specified Frame using the frame's name - or id number. - - Usage examples: - - >>> from nltk.corpus import framenet as fn - >>> f = fn.frame(256) - >>> f.name - 'Medical_specialties' - >>> f = fn.frame('Medical_specialties') - >>> f.ID - 256 - >>> # ensure non-ASCII character in definition doesn't trigger an encoding error: - >>> fn.frame('Imposing_obligation') # doctest: +ELLIPSIS - frame (1494): Imposing_obligation... - - - The dict that is returned from this function will contain the - following information about the Frame: - - - 'name' : the name of the Frame (e.g. 'Birth', 'Apply_heat', etc.) 
- - 'definition' : textual definition of the Frame - - 'ID' : the internal ID number of the Frame - - 'semTypes' : a list of semantic types for this frame - - Each item in the list is a dict containing the following keys: - - 'name' : can be used with the semtype() function - - 'ID' : can be used with the semtype() function - - - 'lexUnit' : a dict containing all of the LUs for this frame. - The keys in this dict are the names of the LUs and - the value for each key is itself a dict containing - info about the LU (see the lu() function for more info.) - - - 'FE' : a dict containing the Frame Elements that are part of this frame - The keys in this dict are the names of the FEs (e.g. 'Body_system') - and the values are dicts containing the following keys - - - 'definition' : The definition of the FE - - 'name' : The name of the FE e.g. 'Body_system' - - 'ID' : The id number - - '_type' : 'fe' - - 'abbrev' : Abbreviation e.g. 'bod' - - 'coreType' : one of "Core", "Peripheral", or "Extra-Thematic" - - 'semType' : if not None, a dict with the following two keys: - - 'name' : name of the semantic type. can be used with - the semtype() function - - 'ID' : id number of the semantic type. can be used with - the semtype() function - - 'requiresFE' : if not None, a dict with the following two keys: - - 'name' : the name of another FE in this frame - - 'ID' : the id of the other FE in this frame - - 'excludesFE' : if not None, a dict with the following two keys: - - 'name' : the name of another FE in this frame - - 'ID' : the id of the other FE in this frame - - - 'frameRelation' : a list of objects describing frame relations - - 'FEcoreSets' : a list of Frame Element core sets for this frame - - Each item in the list is a list of FE objects - - :param fn_fid_or_fname: The Framenet name or id number of the frame - :type fn_fid_or_fname: int or str - :param ignorekeys: The keys to ignore. These keys will not be - included in the output. (optional) - :type ignorekeys: list(str) - :return: Information about a frame - :rtype: dict - """ - - # get the frame info by name or id number - if isinstance(fn_fid_or_fname, str): - f = self.frame_by_name(fn_fid_or_fname, ignorekeys) - else: - f = self.frame_by_id(fn_fid_or_fname, ignorekeys) - - return f - - def frames_by_lemma(self, pat): - """ - Returns a list of all frames that contain LUs in which the - ``name`` attribute of the LU matches the given regular expression - ``pat``. Note that LU names are composed of "lemma.POS", where - the "lemma" part can be made up of either a single lexeme - (e.g. 'run') or multiple lexemes (e.g. 'a little'). - - Note: if you are going to be doing a lot of this type of - searching, you'd want to build an index that maps from lemmas to - frames because each time frames_by_lemma() is called, it has to - search through ALL of the frame XML files in the db. - - >>> from nltk.corpus import framenet as fn - >>> from nltk.corpus.reader.framenet import PrettyList - >>> PrettyList(sorted(fn.frames_by_lemma(r'(?i)a little'), key=itemgetter('ID'))) # doctest: +ELLIPSIS - [, ] - - :return: A list of frame objects. - :rtype: list(AttrDict) - """ - return PrettyList( - f - for f in self.frames() - if any(re.search(pat, luName) for luName in f.lexUnit) - ) - - def lu_basic(self, fn_luid): - """ - Returns basic information about the LU whose id is - ``fn_luid``. This is basically just a wrapper around the - ``lu()`` function with "subCorpus" info excluded. 
- - >>> from nltk.corpus import framenet as fn - >>> lu = PrettyDict(fn.lu_basic(256), breakLines=True) - >>> # ellipses account for differences between FN 1.5 and 1.7 - >>> lu # doctest: +ELLIPSIS - {'ID': 256, - 'POS': 'V', - 'URL': 'https://framenet2.icsi.berkeley.edu/fnReports/data/lu/lu256.xml', - '_type': 'lu', - 'cBy': ..., - 'cDate': '02/08/2001 01:27:50 PST Thu', - 'definition': 'COD: be aware of beforehand; predict.', - 'definitionMarkup': 'COD: be aware of beforehand; predict.', - 'frame': , - 'lemmaID': 15082, - 'lexemes': [{'POS': 'V', 'breakBefore': 'false', 'headword': 'false', 'name': 'foresee', 'order': 1}], - 'name': 'foresee.v', - 'semTypes': [], - 'sentenceCount': {'annotated': ..., 'total': ...}, - 'status': 'FN1_Sent'} - - :param fn_luid: The id number of the desired LU - :type fn_luid: int - :return: Basic information about the lexical unit - :rtype: dict - """ - return self.lu(fn_luid, ignorekeys=["subCorpus", "exemplars"]) - - def lu(self, fn_luid, ignorekeys=[], luName=None, frameID=None, frameName=None): - """ - Access a lexical unit by its ID. luName, frameID, and frameName are used - only in the event that the LU does not have a file in the database - (which is the case for LUs with "Problem" status); in this case, - a placeholder LU is created which just contains its name, ID, and frame. - - - Usage examples: - - >>> from nltk.corpus import framenet as fn - >>> fn.lu(256).name - 'foresee.v' - >>> fn.lu(256).definition - 'COD: be aware of beforehand; predict.' - >>> fn.lu(256).frame.name - 'Expectation' - >>> list(map(PrettyDict, fn.lu(256).lexemes)) - [{'POS': 'V', 'breakBefore': 'false', 'headword': 'false', 'name': 'foresee', 'order': 1}] - - >>> fn.lu(227).exemplars[23] # doctest: +NORMALIZE_WHITESPACE - exemplar sentence (352962): - [sentNo] 0 - [aPos] 59699508 - - [LU] (227) guess.v in Coming_to_believe - - [frame] (23) Coming_to_believe - - [annotationSet] 2 annotation sets - - [POS] 18 tags - - [POS_tagset] BNC - - [GF] 3 relations - - [PT] 3 phrases - - [Other] 1 entry - - [text] + [Target] + [FE] - - When he was inside the house , Culley noticed the characteristic - ------------------ - Content - - he would n't have guessed at . - -- ******* -- - Co C1 [Evidence:INI] - (Co=Cognizer, C1=Content) - - - - The dict that is returned from this function will contain most of the - following information about the LU. Note that some LUs do not contain - all of these pieces of information - particularly 'totalAnnotated' and - 'incorporatedFE' may be missing in some LUs: - - - 'name' : the name of the LU (e.g. 'merger.n') - - 'definition' : textual definition of the LU - - 'ID' : the internal ID number of the LU - - '_type' : 'lu' - - 'status' : e.g. 'Created' - - 'frame' : Frame that this LU belongs to - - 'POS' : the part of speech of this LU (e.g. 'N') - - 'totalAnnotated' : total number of examples annotated with this LU - - 'incorporatedFE' : FE that incorporates this LU (e.g. 'Ailment') - - 'sentenceCount' : a dict with the following two keys: - - 'annotated': number of sentences annotated with this LU - - 'total' : total number of sentences with this LU - - - 'lexemes' : a list of dicts describing the lemma of this LU. - Each dict in the list contains these keys: - - - 'POS' : part of speech e.g. 'N' - - 'name' : either single-lexeme e.g. 'merger' or - multi-lexeme e.g. 
'a little' - - 'order': the order of the lexeme in the lemma (starting from 1) - - 'headword': a boolean ('true' or 'false') - - 'breakBefore': Can this lexeme be separated from the previous lexeme? - Consider: "take over.v" as in:: - - Germany took over the Netherlands in 2 days. - Germany took the Netherlands over in 2 days. - - In this case, 'breakBefore' would be "true" for the lexeme - "over". Contrast this with "take after.v" as in:: - - Mary takes after her grandmother. - *Mary takes her grandmother after. - - In this case, 'breakBefore' would be "false" for the lexeme "after" - - - 'lemmaID' : Can be used to connect lemmas in different LUs - - 'semTypes' : a list of semantic type objects for this LU - - 'subCorpus' : a list of subcorpora - - Each item in the list is a dict containing the following keys: - - 'name' : - - 'sentence' : a list of sentences in the subcorpus - - each item in the list is a dict with the following keys: - - 'ID': - - 'sentNo': - - 'text': the text of the sentence - - 'aPos': - - 'annotationSet': a list of annotation sets - - each item in the list is a dict with the following keys: - - 'ID': - - 'status': - - 'layer': a list of layers - - each layer is a dict containing the following keys: - - 'name': layer name (e.g. 'BNC') - - 'rank': - - 'label': a list of labels for the layer - - each label is a dict containing the following keys: - - 'start': start pos of label in sentence 'text' (0-based) - - 'end': end pos of label in sentence 'text' (0-based) - - 'name': name of label (e.g. 'NN1') - - Under the hood, this implementation looks up the lexical unit information - in the *frame* definition file. That file does not contain - corpus annotations, so the LU files will be accessed on demand if those are - needed. In principle, valence patterns could be loaded here too, - though these are not currently supported. - - :param fn_luid: The id number of the lexical unit - :type fn_luid: int - :param ignorekeys: The keys to ignore. These keys will not be - included in the output. (optional) - :type ignorekeys: list(str) - :return: All information about the lexical unit - :rtype: dict - """ - # look for this LU in cache - if not self._lu_idx: - self._buildluindex() - OOV = object() - luinfo = self._lu_idx.get(fn_luid, OOV) - if luinfo is OOV: - # LU not in the index. We create a placeholder by falling back to - # luName, frameID, and frameName. However, this will not be listed - # among the LUs for its frame. - self._warn( - "LU ID not found: {} ({}) in {} ({})".format( - luName, fn_luid, frameName, frameID - ) - ) - luinfo = AttrDict( - { - "_type": "lu", - "ID": fn_luid, - "name": luName, - "frameID": frameID, - "status": "Problem", - } - ) - f = self.frame_by_id(luinfo.frameID) - assert f.name == frameName, (f.name, frameName) - luinfo["frame"] = f - self._lu_idx[fn_luid] = luinfo - elif "_type" not in luinfo: - # we only have an index entry for the LU. loading the frame will replace this. - f = self.frame_by_id(luinfo.frameID) - luinfo = self._lu_idx[fn_luid] - if ignorekeys: - return AttrDict({k: v for k, v in luinfo.items() if k not in ignorekeys}) - - return luinfo - - def _lu_file(self, lu, ignorekeys=[]): - """ - Augment the LU information that was loaded from the frame file - with additional information from the LU file. 
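For reference, a small sketch of the lexical-unit accessors described above (same assumptions as before; LU 256 is 'foresee.v' in the Expectation frame, per the doctest in the removed code):

from nltk.corpus import framenet as fn

lu = fn.lu(256)                         # full LU record; exemplars are loaded lazily
print(lu.name, lu.frame.name)           # 'foresee.v' 'Expectation'
print(lu.lexemes[0]['name'])            # 'foresee'
basic = fn.lu_basic(256)                # same LU without the subCorpus/exemplars payload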
- """ - fn_luid = lu.ID - - fname = f"lu{fn_luid}.xml" - locpath = os.path.join(f"{self._root}", self._lu_dir, fname) - # print(locpath, file=sys.stderr) - if not self._lu_idx: - self._buildluindex() - - try: - with XMLCorpusView(locpath, "lexUnit") as view: - elt = view[0] - except OSError as e: - raise FramenetError(f"Unknown LU id: {fn_luid}") from e - - lu2 = self._handle_lexunit_elt(elt, ignorekeys) - lu.URL = self._fnweb_url + "/" + self._lu_dir + "/" + fname - lu.subCorpus = lu2.subCorpus - lu.exemplars = SpecialList( - "luexemplars", [sent for subc in lu.subCorpus for sent in subc.sentence] - ) - for sent in lu.exemplars: - sent["LU"] = lu - sent["frame"] = lu.frame - for aset in sent.annotationSet: - aset["LU"] = lu - aset["frame"] = lu.frame - - return lu - - def _loadsemtypes(self): - """Create the semantic types index.""" - self._semtypes = AttrDict() - with XMLCorpusView( - self.abspath("semTypes.xml"), - "semTypes/semType", - self._handle_semtype_elt, - ) as view: - for st in view: - n = st["name"] - a = st["abbrev"] - i = st["ID"] - # Both name and abbrev should be able to retrieve the - # ID. The ID will retrieve the semantic type dict itself. - self._semtypes[n] = i - self._semtypes[a] = i - self._semtypes[i] = st - # now that all individual semtype XML is loaded, we can link them together - roots = [] - for st in self.semtypes(): - if st.superType: - st.superType = self.semtype(st.superType.supID) - st.superType.subTypes.append(st) - else: - if st not in roots: - roots.append(st) - st.rootType = st - queue = list(roots) - assert queue - while queue: - st = queue.pop(0) - for child in st.subTypes: - child.rootType = st.rootType - queue.append(child) - # self.propagate_semtypes() # apply inferencing over FE relations - - def propagate_semtypes(self): - """ - Apply inference rules to distribute semtypes over relations between FEs. - For FrameNet 1.5, this results in 1011 semtypes being propagated. - (Not done by default because it requires loading all frame files, - which takes several seconds. If this needed to be fast, it could be rewritten - to traverse the neighboring relations on demand for each FE semtype.) - - >>> from nltk.corpus import framenet as fn - >>> x = sum(1 for f in fn.frames() for fe in f.FE.values() if fe.semType) - >>> fn.propagate_semtypes() - >>> y = sum(1 for f in fn.frames() for fe in f.FE.values() if fe.semType) - >>> y-x > 1000 - True - """ - if not self._semtypes: - self._loadsemtypes() - if not self._ferel_idx: - self._buildrelationindex() - changed = True - i = 0 - nPropagations = 0 - while changed: - # make a pass and see if anything needs to be propagated - i += 1 - changed = False - for ferel in self.fe_relations(): - superST = ferel.superFE.semType - subST = ferel.subFE.semType - try: - if superST and superST is not subST: - # propagate downward - assert subST is None or self.semtype_inherits(subST, superST), ( - superST.name, - ferel, - subST.name, - ) - if subST is None: - ferel.subFE.semType = subST = superST - changed = True - nPropagations += 1 - if ( - ferel.type.name in ["Perspective_on", "Subframe", "Precedes"] - and subST - and subST is not superST - ): - # propagate upward - assert superST is None, (superST.name, ferel, subST.name) - ferel.superFE.semType = superST = subST - changed = True - nPropagations += 1 - except AssertionError as ex: - # bug in the data! 
ignore - # print(ex, file=sys.stderr) - continue - # print(i, nPropagations, file=sys.stderr) - - def semtype(self, key): - """ - >>> from nltk.corpus import framenet as fn - >>> fn.semtype(233).name - 'Temperature' - >>> fn.semtype(233).abbrev - 'Temp' - >>> fn.semtype('Temperature').ID - 233 - - :param key: The name, abbreviation, or id number of the semantic type - :type key: string or int - :return: Information about a semantic type - :rtype: dict - """ - if isinstance(key, int): - stid = key - else: - try: - stid = self._semtypes[key] - except TypeError: - self._loadsemtypes() - stid = self._semtypes[key] - - try: - st = self._semtypes[stid] - except TypeError: - self._loadsemtypes() - st = self._semtypes[stid] - - return st - - def semtype_inherits(self, st, superST): - if not isinstance(st, dict): - st = self.semtype(st) - if not isinstance(superST, dict): - superST = self.semtype(superST) - par = st.superType - while par: - if par is superST: - return True - par = par.superType - return False - - def frames(self, name=None): - """ - Obtain details for a specific frame. - - >>> from nltk.corpus import framenet as fn - >>> len(fn.frames()) in (1019, 1221) # FN 1.5 and 1.7, resp. - True - >>> x = PrettyList(fn.frames(r'(?i)crim'), maxReprSize=0, breakLines=True) - >>> x.sort(key=itemgetter('ID')) - >>> x - [, - , - , - ] - - A brief intro to Frames (excerpted from "FrameNet II: Extended - Theory and Practice" by Ruppenhofer et. al., 2010): - - A Frame is a script-like conceptual structure that describes a - particular type of situation, object, or event along with the - participants and props that are needed for that Frame. For - example, the "Apply_heat" frame describes a common situation - involving a Cook, some Food, and a Heating_Instrument, and is - evoked by words such as bake, blanch, boil, broil, brown, - simmer, steam, etc. - - We call the roles of a Frame "frame elements" (FEs) and the - frame-evoking words are called "lexical units" (LUs). - - FrameNet includes relations between Frames. Several types of - relations are defined, of which the most important are: - - - Inheritance: An IS-A relation. The child frame is a subtype - of the parent frame, and each FE in the parent is bound to - a corresponding FE in the child. An example is the - "Revenge" frame which inherits from the - "Rewards_and_punishments" frame. - - - Using: The child frame presupposes the parent frame as - background, e.g the "Speed" frame "uses" (or presupposes) - the "Motion" frame; however, not all parent FEs need to be - bound to child FEs. - - - Subframe: The child frame is a subevent of a complex event - represented by the parent, e.g. the "Criminal_process" frame - has subframes of "Arrest", "Arraignment", "Trial", and - "Sentencing". - - - Perspective_on: The child frame provides a particular - perspective on an un-perspectivized parent frame. A pair of - examples consists of the "Hiring" and "Get_a_job" frames, - which perspectivize the "Employment_start" frame from the - Employer's and the Employee's point of view, respectively. - - :param name: A regular expression pattern used to match against - Frame names. If 'name' is None, then a list of all - Framenet Frames will be returned. - :type name: str - :return: A list of matching Frames (or all Frames). 
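A short sketch of the semantic-type helpers above, assuming FrameNet data is installed; note that propagate_semtypes() is optional and loads every frame file, so it is slow.

from nltk.corpus import framenet as fn

temp = fn.semtype('Temperature')                 # lookup by name, abbreviation, or numeric ID
print(temp.ID, temp.abbrev)                      # 233 'Temp'
print(fn.semtype_inherits(temp, temp.rootType))  # walk the superType chain
fn.propagate_semtypes()                          # optional inference pass over FE relations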
- :rtype: list(AttrDict) - """ - try: - fIDs = list(self._frame_idx.keys()) - except AttributeError: - self._buildframeindex() - fIDs = list(self._frame_idx.keys()) - - if name is not None: - return PrettyList( - self.frame(fID) for fID, finfo in self.frame_ids_and_names(name).items() - ) - else: - return PrettyLazyMap(self.frame, fIDs) - - def frame_ids_and_names(self, name=None): - """ - Uses the frame index, which is much faster than looking up each frame definition - if only the names and IDs are needed. - """ - if not self._frame_idx: - self._buildframeindex() - return { - fID: finfo.name - for fID, finfo in self._frame_idx.items() - if name is None or re.search(name, finfo.name) is not None - } - - def fes(self, name=None, frame=None): - """ - Lists frame element objects. If 'name' is provided, this is treated as - a case-insensitive regular expression to filter by frame name. - (Case-insensitivity is because casing of frame element names is not always - consistent across frames.) Specify 'frame' to filter by a frame name pattern, - ID, or object. - - >>> from nltk.corpus import framenet as fn - >>> fn.fes('Noise_maker') - [] - >>> sorted([(fe.frame.name,fe.name) for fe in fn.fes('sound')]) # doctest: +NORMALIZE_WHITESPACE - [('Cause_to_make_noise', 'Sound_maker'), ('Make_noise', 'Sound'), - ('Make_noise', 'Sound_source'), ('Sound_movement', 'Location_of_sound_source'), - ('Sound_movement', 'Sound'), ('Sound_movement', 'Sound_source'), - ('Sounds', 'Component_sound'), ('Sounds', 'Location_of_sound_source'), - ('Sounds', 'Sound_source'), ('Vocalizations', 'Location_of_sound_source'), - ('Vocalizations', 'Sound_source')] - >>> sorted([(fe.frame.name,fe.name) for fe in fn.fes('sound',r'(?i)make_noise')]) # doctest: +NORMALIZE_WHITESPACE - [('Cause_to_make_noise', 'Sound_maker'), - ('Make_noise', 'Sound'), - ('Make_noise', 'Sound_source')] - >>> sorted(set(fe.name for fe in fn.fes('^sound'))) - ['Sound', 'Sound_maker', 'Sound_source'] - >>> len(fn.fes('^sound$')) - 2 - - :param name: A regular expression pattern used to match against - frame element names. If 'name' is None, then a list of all - frame elements will be returned. - :type name: str - :return: A list of matching frame elements - :rtype: list(AttrDict) - """ - # what frames are we searching in? - if frame is not None: - if isinstance(frame, int): - frames = [self.frame(frame)] - elif isinstance(frame, str): - frames = self.frames(frame) - else: - frames = [frame] - else: - frames = self.frames() - - return PrettyList( - fe - for f in frames - for fename, fe in f.FE.items() - if name is None or re.search(name, fename, re.I) - ) - - def lus(self, name=None, frame=None): - """ - Obtain details for lexical units. - Optionally restrict by lexical unit name pattern, and/or to a certain frame - or frames whose name matches a pattern. - - >>> from nltk.corpus import framenet as fn - >>> len(fn.lus()) in (11829, 13572) # FN 1.5 and 1.7, resp. - True - >>> PrettyList(sorted(fn.lus(r'(?i)a little'), key=itemgetter('ID')), maxReprSize=0, breakLines=True) - [, - , - ] - >>> PrettyList(sorted(fn.lus(r'interest', r'(?i)stimulus'), key=itemgetter('ID'))) - [, ] - - A brief intro to Lexical Units (excerpted from "FrameNet II: - Extended Theory and Practice" by Ruppenhofer et. al., 2010): - - A lexical unit (LU) is a pairing of a word with a meaning. 
For - example, the "Apply_heat" Frame describes a common situation - involving a Cook, some Food, and a Heating Instrument, and is - _evoked_ by words such as bake, blanch, boil, broil, brown, - simmer, steam, etc. These frame-evoking words are the LUs in the - Apply_heat frame. Each sense of a polysemous word is a different - LU. - - We have used the word "word" in talking about LUs. The reality - is actually rather complex. When we say that the word "bake" is - polysemous, we mean that the lemma "bake.v" (which has the - word-forms "bake", "bakes", "baked", and "baking") is linked to - three different frames: - - - Apply_heat: "Michelle baked the potatoes for 45 minutes." - - - Cooking_creation: "Michelle baked her mother a cake for her birthday." - - - Absorb_heat: "The potatoes have to bake for more than 30 minutes." - - These constitute three different LUs, with different - definitions. - - Multiword expressions such as "given name" and hyphenated words - like "shut-eye" can also be LUs. Idiomatic phrases such as - "middle of nowhere" and "give the slip (to)" are also defined as - LUs in the appropriate frames ("Isolated_places" and "Evading", - respectively), and their internal structure is not analyzed. - - Framenet provides multiple annotated examples of each sense of a - word (i.e. each LU). Moreover, the set of examples - (approximately 20 per LU) illustrates all of the combinatorial - possibilities of the lexical unit. - - Each LU is linked to a Frame, and hence to the other words which - evoke that Frame. This makes the FrameNet database similar to a - thesaurus, grouping together semantically similar words. - - In the simplest case, frame-evoking words are verbs such as - "fried" in: - - "Matilde fried the catfish in a heavy iron skillet." - - Sometimes event nouns may evoke a Frame. For example, - "reduction" evokes "Cause_change_of_scalar_position" in: - - "...the reduction of debt levels to $665 million from $2.6 billion." - - Adjectives may also evoke a Frame. For example, "asleep" may - evoke the "Sleep" frame as in: - - "They were asleep for hours." - - Many common nouns, such as artifacts like "hat" or "tower", - typically serve as dependents rather than clearly evoking their - own frames. - - :param name: A regular expression pattern used to search the LU - names. Note that LU names take the form of a dotted - string (e.g. "run.v" or "a little.adv") in which a - lemma precedes the "." and a POS follows the - dot. The lemma may be composed of a single lexeme - (e.g. "run") or of multiple lexemes (e.g. "a - little"). If 'name' is not given, then all LUs will - be returned. - - The valid POSes are: - - v - verb - n - noun - a - adjective - adv - adverb - prep - preposition - num - numbers - intj - interjection - art - article - c - conjunction - scon - subordinating conjunction - - :type name: str - :type frame: str or int or frame - :return: A list of selected (or all) lexical units - :rtype: list of LU objects (dicts). See the lu() function for info - about the specifics of LU objects. 
- - """ - if not self._lu_idx: - self._buildluindex() - - if name is not None: # match LUs, then restrict by frame - result = PrettyList( - self.lu(luID) for luID, luName in self.lu_ids_and_names(name).items() - ) - if frame is not None: - if isinstance(frame, int): - frameIDs = {frame} - elif isinstance(frame, str): - frameIDs = {f.ID for f in self.frames(frame)} - else: - frameIDs = {frame.ID} - result = PrettyList(lu for lu in result if lu.frame.ID in frameIDs) - elif frame is not None: # all LUs in matching frames - if isinstance(frame, int): - frames = [self.frame(frame)] - elif isinstance(frame, str): - frames = self.frames(frame) - else: - frames = [frame] - result = PrettyLazyIteratorList( - iter(LazyConcatenation(list(f.lexUnit.values()) for f in frames)) - ) - else: # all LUs - luIDs = [ - luID - for luID, lu in self._lu_idx.items() - if lu.status not in self._bad_statuses - ] - result = PrettyLazyMap(self.lu, luIDs) - return result - - def lu_ids_and_names(self, name=None): - """ - Uses the LU index, which is much faster than looking up each LU definition - if only the names and IDs are needed. - """ - if not self._lu_idx: - self._buildluindex() - return { - luID: luinfo.name - for luID, luinfo in self._lu_idx.items() - if luinfo.status not in self._bad_statuses - and (name is None or re.search(name, luinfo.name) is not None) - } - - def docs_metadata(self, name=None): - """ - Return an index of the annotated documents in Framenet. - - Details for a specific annotated document can be obtained using this - class's doc() function and pass it the value of the 'ID' field. - - >>> from nltk.corpus import framenet as fn - >>> len(fn.docs()) in (78, 107) # FN 1.5 and 1.7, resp. - True - >>> set([x.corpname for x in fn.docs_metadata()])>=set(['ANC', 'KBEval', \ - 'LUCorpus-v0.3', 'Miscellaneous', 'NTI', 'PropBank']) - True - - :param name: A regular expression pattern used to search the - file name of each annotated document. The document's - file name contains the name of the corpus that the - document is from, followed by two underscores "__" - followed by the document name. So, for example, the - file name "LUCorpus-v0.3__20000410_nyt-NEW.xml" is - from the corpus named "LUCorpus-v0.3" and the - document name is "20000410_nyt-NEW.xml". - :type name: str - :return: A list of selected (or all) annotated documents - :rtype: list of dicts, where each dict object contains the following - keys: - - - 'name' - - 'ID' - - 'corpid' - - 'corpname' - - 'description' - - 'filename' - """ - try: - ftlist = PrettyList(self._fulltext_idx.values()) - except AttributeError: - self._buildcorpusindex() - ftlist = PrettyList(self._fulltext_idx.values()) - - if name is None: - return ftlist - else: - return PrettyList( - x for x in ftlist if re.search(name, x["filename"]) is not None - ) - - def docs(self, name=None): - """ - Return a list of the annotated full-text documents in FrameNet, - optionally filtered by a regex to be matched against the document name. - """ - return PrettyLazyMap((lambda x: self.doc(x.ID)), self.docs_metadata(name)) - - def sents(self, exemplars=True, full_text=True): - """ - Annotated sentences matching the specified criteria. - """ - if exemplars: - if full_text: - return self.exemplars() + self.ft_sents() - else: - return self.exemplars() - elif full_text: - return self.ft_sents() - - def annotations(self, luNamePattern=None, exemplars=True, full_text=True): - """ - Frame annotation sets matching the specified criteria. 
- """ - - if exemplars: - epart = PrettyLazyIteratorList( - sent.frameAnnotation for sent in self.exemplars(luNamePattern) - ) - else: - epart = [] - - if full_text: - if luNamePattern is not None: - matchedLUIDs = set(self.lu_ids_and_names(luNamePattern).keys()) - ftpart = PrettyLazyIteratorList( - aset - for sent in self.ft_sents() - for aset in sent.annotationSet[1:] - if luNamePattern is None or aset.get("luID", "CXN_ASET") in matchedLUIDs - ) - else: - ftpart = [] - - if exemplars: - if full_text: - return epart + ftpart - else: - return epart - elif full_text: - return ftpart - - def exemplars(self, luNamePattern=None, frame=None, fe=None, fe2=None): - """ - Lexicographic exemplar sentences, optionally filtered by LU name and/or 1-2 FEs that - are realized overtly. 'frame' may be a name pattern, frame ID, or frame instance. - 'fe' may be a name pattern or FE instance; if specified, 'fe2' may also - be specified to retrieve sentences with both overt FEs (in either order). - """ - if fe is None and fe2 is not None: - raise FramenetError("exemplars(..., fe=None, fe2=) is not allowed") - elif fe is not None and fe2 is not None: - if not isinstance(fe2, str): - if isinstance(fe, str): - # fe2 is specific to a particular frame. swap fe and fe2 so fe is always used to determine the frame. - fe, fe2 = fe2, fe - elif fe.frame is not fe2.frame: # ensure frames match - raise FramenetError( - "exemplars() call with inconsistent `fe` and `fe2` specification (frames must match)" - ) - if frame is None and fe is not None and not isinstance(fe, str): - frame = fe.frame - - # narrow down to frames matching criteria - - lusByFrame = defaultdict( - list - ) # frame name -> matching LUs, if luNamePattern is specified - if frame is not None or luNamePattern is not None: - if frame is None or isinstance(frame, str): - if luNamePattern is not None: - frames = set() - for lu in self.lus(luNamePattern, frame=frame): - frames.add(lu.frame.ID) - lusByFrame[lu.frame.name].append(lu) - frames = LazyMap(self.frame, list(frames)) - else: - frames = self.frames(frame) - else: - if isinstance(frame, int): - frames = [self.frame(frame)] - else: # frame object - frames = [frame] - - if luNamePattern is not None: - lusByFrame = {frame.name: self.lus(luNamePattern, frame=frame)} - - if fe is not None: # narrow to frames that define this FE - if isinstance(fe, str): - frames = PrettyLazyIteratorList( - f - for f in frames - if fe in f.FE - or any(re.search(fe, ffe, re.I) for ffe in f.FE.keys()) - ) - else: - if fe.frame not in frames: - raise FramenetError( - "exemplars() call with inconsistent `frame` and `fe` specification" - ) - frames = [fe.frame] - - if fe2 is not None: # narrow to frames that ALSO define this FE - if isinstance(fe2, str): - frames = PrettyLazyIteratorList( - f - for f in frames - if fe2 in f.FE - or any(re.search(fe2, ffe, re.I) for ffe in f.FE.keys()) - ) - # else we already narrowed it to a single frame - else: # frame, luNamePattern are None. 
fe, fe2 are None or strings - if fe is not None: - frames = {ffe.frame.ID for ffe in self.fes(fe)} - if fe2 is not None: - frames2 = {ffe.frame.ID for ffe in self.fes(fe2)} - frames = frames & frames2 - frames = LazyMap(self.frame, list(frames)) - else: - frames = self.frames() - - # we've narrowed down 'frames' - # now get exemplars for relevant LUs in those frames - - def _matching_exs(): - for f in frames: - fes = fes2 = None # FEs of interest - if fe is not None: - fes = ( - {ffe for ffe in f.FE.keys() if re.search(fe, ffe, re.I)} - if isinstance(fe, str) - else {fe.name} - ) - if fe2 is not None: - fes2 = ( - {ffe for ffe in f.FE.keys() if re.search(fe2, ffe, re.I)} - if isinstance(fe2, str) - else {fe2.name} - ) - - for lu in ( - lusByFrame[f.name] - if luNamePattern is not None - else f.lexUnit.values() - ): - for ex in lu.exemplars: - if (fes is None or self._exemplar_of_fes(ex, fes)) and ( - fes2 is None or self._exemplar_of_fes(ex, fes2) - ): - yield ex - - return PrettyLazyIteratorList(_matching_exs()) - - def _exemplar_of_fes(self, ex, fes=None): - """ - Given an exemplar sentence and a set of FE names, return the subset of FE names - that are realized overtly in the sentence on the FE, FE2, or FE3 layer. - - If 'fes' is None, returns all overt FE names. - """ - overtNames = set(list(zip(*ex.FE[0]))[2]) if ex.FE[0] else set() - if "FE2" in ex: - overtNames |= set(list(zip(*ex.FE2[0]))[2]) if ex.FE2[0] else set() - if "FE3" in ex: - overtNames |= set(list(zip(*ex.FE3[0]))[2]) if ex.FE3[0] else set() - return overtNames & fes if fes is not None else overtNames - - def ft_sents(self, docNamePattern=None): - """ - Full-text annotation sentences, optionally filtered by document name. - """ - return PrettyLazyIteratorList( - sent for d in self.docs(docNamePattern) for sent in d.sentence - ) - - def frame_relation_types(self): - """ - Obtain a list of frame relation types. - - >>> from nltk.corpus import framenet as fn - >>> frts = sorted(fn.frame_relation_types(), key=itemgetter('ID')) - >>> isinstance(frts, list) - True - >>> len(frts) in (9, 10) # FN 1.5 and 1.7, resp. - True - >>> PrettyDict(frts[0], breakLines=True) - {'ID': 1, - '_type': 'framerelationtype', - 'frameRelations': [ Child=Change_of_consistency>, Child=Rotting>, ...], - 'name': 'Inheritance', - 'subFrameName': 'Child', - 'superFrameName': 'Parent'} - - :return: A list of all of the frame relation types in framenet - :rtype: list(dict) - """ - if not self._freltyp_idx: - self._buildrelationindex() - return self._freltyp_idx.values() - - def frame_relations(self, frame=None, frame2=None, type=None): - """ - :param frame: (optional) frame object, name, or ID; only relations involving - this frame will be returned - :param frame2: (optional; 'frame' must be a different frame) only show relations - between the two specified frames, in either direction - :param type: (optional) frame relation type (name or object); show only relations - of this type - :type frame: int or str or AttrDict - :return: A list of all of the frame relations in framenet - :rtype: list(dict) - - >>> from nltk.corpus import framenet as fn - >>> frels = fn.frame_relations() - >>> isinstance(frels, list) - True - >>> len(frels) in (1676, 2070) # FN 1.5 and 1.7, resp. 
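An illustrative sketch of the annotation accessors above; it assumes FrameNet data is installed, that bake.v is an LU of Apply_heat (as the LU documentation above notes), and that 'NTI' names one of the full-text corpora.

from nltk.corpus import framenet as fn

exs = fn.exemplars(r'(?i)bake', frame='Apply_heat')   # exemplar sentences for matching LUs
for ex in exs:
    print(ex.text)
    break
ft = fn.ft_sents(r'NTI')                              # full-text sentences, filtered by document name
anns = fn.annotations(r'(?i)bake', full_text=False)   # frame annotation sets for those LUs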
- True - >>> PrettyList(fn.frame_relations('Cooking_creation'), maxReprSize=0, breakLines=True) - [ Child=Cooking_creation>, - Child=Cooking_creation>, - ReferringEntry=Cooking_creation>] - >>> PrettyList(fn.frame_relations(274), breakLines=True) - [ Child=Dodging>, - Child=Evading>, ...] - >>> PrettyList(fn.frame_relations(fn.frame('Cooking_creation')), breakLines=True) - [ Child=Cooking_creation>, - Child=Cooking_creation>, ...] - >>> PrettyList(fn.frame_relations('Cooking_creation', type='Inheritance')) - [ Child=Cooking_creation>] - >>> PrettyList(fn.frame_relations('Cooking_creation', 'Apply_heat'), breakLines=True) # doctest: +NORMALIZE_WHITESPACE - [ Child=Cooking_creation>, - ReferringEntry=Cooking_creation>] - """ - relation_type = type - - if not self._frel_idx: - self._buildrelationindex() - - rels = None - - if relation_type is not None: - if not isinstance(relation_type, dict): - type = [rt for rt in self.frame_relation_types() if rt.name == type][0] - assert isinstance(type, dict) - - # lookup by 'frame' - if frame is not None: - if isinstance(frame, dict) and "frameRelations" in frame: - rels = PrettyList(frame.frameRelations) - else: - if not isinstance(frame, int): - if isinstance(frame, dict): - frame = frame.ID - else: - frame = self.frame_by_name(frame).ID - rels = [self._frel_idx[frelID] for frelID in self._frel_f_idx[frame]] - - # filter by 'type' - if type is not None: - rels = [rel for rel in rels if rel.type is type] - elif type is not None: - # lookup by 'type' - rels = type.frameRelations - else: - rels = self._frel_idx.values() - - # filter by 'frame2' - if frame2 is not None: - if frame is None: - raise FramenetError( - "frame_relations(frame=None, frame2=) is not allowed" - ) - if not isinstance(frame2, int): - if isinstance(frame2, dict): - frame2 = frame2.ID - else: - frame2 = self.frame_by_name(frame2).ID - if frame == frame2: - raise FramenetError( - "The two frame arguments to frame_relations() must be different frames" - ) - rels = [ - rel - for rel in rels - if rel.superFrame.ID == frame2 or rel.subFrame.ID == frame2 - ] - - return PrettyList( - sorted( - rels, - key=lambda frel: (frel.type.ID, frel.superFrameName, frel.subFrameName), - ) - ) - - def fe_relations(self): - """ - Obtain a list of frame element relations. - - >>> from nltk.corpus import framenet as fn - >>> ferels = fn.fe_relations() - >>> isinstance(ferels, list) - True - >>> len(ferels) in (10020, 12393) # FN 1.5 and 1.7, resp. - True - >>> PrettyDict(ferels[0], breakLines=True) # doctest: +NORMALIZE_WHITESPACE - {'ID': 14642, - '_type': 'ferelation', - 'frameRelation': Child=Lively_place>, - 'subFE': , - 'subFEName': 'Degree', - 'subFrame': , - 'subID': 11370, - 'supID': 2271, - 'superFE': , - 'superFEName': 'Degree', - 'superFrame': , - 'type': } - - :return: A list of all of the frame element relations in framenet - :rtype: list(dict) - """ - if not self._ferel_idx: - self._buildrelationindex() - return PrettyList( - sorted( - self._ferel_idx.values(), - key=lambda ferel: ( - ferel.type.ID, - ferel.frameRelation.superFrameName, - ferel.superFEName, - ferel.frameRelation.subFrameName, - ferel.subFEName, - ), - ) - ) - - def semtypes(self): - """ - Obtain a list of semantic types. - - >>> from nltk.corpus import framenet as fn - >>> stypes = fn.semtypes() - >>> len(stypes) in (73, 109) # FN 1.5 and 1.7, resp. 
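A brief sketch of the relation queries above, using the frames that appear in the removed doctests (same FrameNet-data assumption):

from nltk.corpus import framenet as fn

inh = fn.frame_relations('Cooking_creation', type='Inheritance')
print([(r.superFrameName, r.subFrameName) for r in inh])
pair = fn.frame_relations('Cooking_creation', 'Apply_heat')   # relations between two specific frames
ferels = fn.fe_relations()                                    # every FE-to-FE relation
print(len(ferels))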
- True - >>> sorted(stypes[0].keys()) - ['ID', '_type', 'abbrev', 'definition', 'definitionMarkup', 'name', 'rootType', 'subTypes', 'superType'] - - :return: A list of all of the semantic types in framenet - :rtype: list(dict) - """ - if not self._semtypes: - self._loadsemtypes() - return PrettyList( - self._semtypes[i] for i in self._semtypes if isinstance(i, int) - ) - - def _load_xml_attributes(self, d, elt): - """ - Extracts a subset of the attributes from the given element and - returns them in a dictionary. - - :param d: A dictionary in which to store the attributes. - :type d: dict - :param elt: An ElementTree Element - :type elt: Element - :return: Returns the input dict ``d`` possibly including attributes from ``elt`` - :rtype: dict - """ - - d = type(d)(d) - - try: - attr_dict = elt.attrib - except AttributeError: - return d - - if attr_dict is None: - return d - - # Ignore these attributes when loading attributes from an xml node - ignore_attrs = [ #'cBy', 'cDate', 'mDate', # <-- annotation metadata that could be of interest - "xsi", - "schemaLocation", - "xmlns", - "bgColor", - "fgColor", - ] - - for attr in attr_dict: - - if any(attr.endswith(x) for x in ignore_attrs): - continue - - val = attr_dict[attr] - if val.isdigit(): - d[attr] = int(val) - else: - d[attr] = val - - return d - - def _strip_tags(self, data): - """ - Gets rid of all tags and newline characters from the given input - - :return: A cleaned-up version of the input string - :rtype: str - """ - - try: - r""" - # Look for boundary issues in markup. (Sometimes FEs are pluralized in definitions.) - m = re.search(r'\w[<][^/]|[<][/][^>]+[>](s\w|[a-rt-z0-9])', data) - if m: - print('Markup boundary:', data[max(0,m.start(0)-10):m.end(0)+10].replace('\n',' '), file=sys.stderr) - """ - - data = data.replace("", "") - data = data.replace("", "") - data = re.sub('', "", data) - data = data.replace("", "") - data = data.replace("", "") - data = data.replace("", "") - data = data.replace("", "") - data = data.replace("", "") - data = data.replace("", "") - data = data.replace("", "") - data = data.replace("", "'") - data = data.replace("", "'") - data = data.replace("", "") - data = data.replace("", "") - data = data.replace("", "") - data = data.replace("", "") - - # Get rid of and tags - data = data.replace("", "") - data = data.replace("", "") - - data = data.replace("\n", " ") - except AttributeError: - pass - - return data - - def _handle_elt(self, elt, tagspec=None): - """Extracts and returns the attributes of the given element""" - return self._load_xml_attributes(AttrDict(), elt) - - def _handle_fulltextindex_elt(self, elt, tagspec=None): - """ - Extracts corpus/document info from the fulltextIndex.xml file. - - Note that this function "flattens" the information contained - in each of the "corpus" elements, so that each "document" - element will contain attributes for the corpus and - corpusid. Also, each of the "document" items will contain a - new attribute called "filename" that is the base file name of - the xml file for the document in the "fulltext" subdir of the - Framenet corpus. 
- """ - ftinfo = self._load_xml_attributes(AttrDict(), elt) - corpname = ftinfo.name - corpid = ftinfo.ID - retlist = [] - for sub in elt: - if sub.tag.endswith("document"): - doc = self._load_xml_attributes(AttrDict(), sub) - if "name" in doc: - docname = doc.name - else: - docname = doc.description - doc.filename = f"{corpname}__{docname}.xml" - doc.URL = ( - self._fnweb_url + "/" + self._fulltext_dir + "/" + doc.filename - ) - doc.corpname = corpname - doc.corpid = corpid - retlist.append(doc) - - return retlist - - def _handle_frame_elt(self, elt, ignorekeys=[]): - """Load the info for a Frame from a frame xml file""" - frinfo = self._load_xml_attributes(AttrDict(), elt) - - frinfo["_type"] = "frame" - frinfo["definition"] = "" - frinfo["definitionMarkup"] = "" - frinfo["FE"] = PrettyDict() - frinfo["FEcoreSets"] = [] - frinfo["lexUnit"] = PrettyDict() - frinfo["semTypes"] = [] - for k in ignorekeys: - if k in frinfo: - del frinfo[k] - - for sub in elt: - if sub.tag.endswith("definition") and "definition" not in ignorekeys: - frinfo["definitionMarkup"] = sub.text - frinfo["definition"] = self._strip_tags(sub.text) - elif sub.tag.endswith("FE") and "FE" not in ignorekeys: - feinfo = self._handle_fe_elt(sub) - frinfo["FE"][feinfo.name] = feinfo - feinfo["frame"] = frinfo # backpointer - elif sub.tag.endswith("FEcoreSet") and "FEcoreSet" not in ignorekeys: - coreset = self._handle_fecoreset_elt(sub) - # assumes all FEs have been loaded before coresets - frinfo["FEcoreSets"].append( - PrettyList(frinfo["FE"][fe.name] for fe in coreset) - ) - elif sub.tag.endswith("lexUnit") and "lexUnit" not in ignorekeys: - luentry = self._handle_framelexunit_elt(sub) - if luentry["status"] in self._bad_statuses: - # problematic LU entry; ignore it - continue - luentry["frame"] = frinfo - luentry["URL"] = ( - self._fnweb_url - + "/" - + self._lu_dir - + "/" - + "lu{}.xml".format(luentry["ID"]) - ) - luentry["subCorpus"] = Future( - (lambda lu: lambda: self._lu_file(lu).subCorpus)(luentry) - ) - luentry["exemplars"] = Future( - (lambda lu: lambda: self._lu_file(lu).exemplars)(luentry) - ) - frinfo["lexUnit"][luentry.name] = luentry - if not self._lu_idx: - self._buildluindex() - self._lu_idx[luentry.ID] = luentry - elif sub.tag.endswith("semType") and "semTypes" not in ignorekeys: - semtypeinfo = self._load_xml_attributes(AttrDict(), sub) - frinfo["semTypes"].append(self.semtype(semtypeinfo.ID)) - - frinfo["frameRelations"] = self.frame_relations(frame=frinfo) - - # resolve 'requires' and 'excludes' links between FEs of this frame - for fe in frinfo.FE.values(): - if fe.requiresFE: - name, ID = fe.requiresFE.name, fe.requiresFE.ID - fe.requiresFE = frinfo.FE[name] - assert fe.requiresFE.ID == ID - if fe.excludesFE: - name, ID = fe.excludesFE.name, fe.excludesFE.ID - fe.excludesFE = frinfo.FE[name] - assert fe.excludesFE.ID == ID - - return frinfo - - def _handle_fecoreset_elt(self, elt): - """Load fe coreset info from xml.""" - info = self._load_xml_attributes(AttrDict(), elt) - tmp = [] - for sub in elt: - tmp.append(self._load_xml_attributes(AttrDict(), sub)) - - return tmp - - def _handle_framerelationtype_elt(self, elt, *args): - """Load frame-relation element and its child fe-relation elements from frRelation.xml.""" - info = self._load_xml_attributes(AttrDict(), elt) - info["_type"] = "framerelationtype" - info["frameRelations"] = PrettyList() - - for sub in elt: - if sub.tag.endswith("frameRelation"): - frel = self._handle_framerelation_elt(sub) - frel["type"] = info # backpointer - for ferel in 
frel.feRelations: - ferel["type"] = info - info["frameRelations"].append(frel) - - return info - - def _handle_framerelation_elt(self, elt): - """Load frame-relation element and its child fe-relation elements from frRelation.xml.""" - info = self._load_xml_attributes(AttrDict(), elt) - assert info["superFrameName"] != info["subFrameName"], (elt, info) - info["_type"] = "framerelation" - info["feRelations"] = PrettyList() - - for sub in elt: - if sub.tag.endswith("FERelation"): - ferel = self._handle_elt(sub) - ferel["_type"] = "ferelation" - ferel["frameRelation"] = info # backpointer - info["feRelations"].append(ferel) - - return info - - def _handle_fulltextannotation_elt(self, elt): - """Load full annotation info for a document from its xml - file. The main element (fullTextAnnotation) contains a 'header' - element (which we ignore here) and a bunch of 'sentence' - elements.""" - info = AttrDict() - info["_type"] = "fulltext_annotation" - info["sentence"] = [] - - for sub in elt: - if sub.tag.endswith("header"): - continue # not used - elif sub.tag.endswith("sentence"): - s = self._handle_fulltext_sentence_elt(sub) - s.doc = info - info["sentence"].append(s) - - return info - - def _handle_fulltext_sentence_elt(self, elt): - """Load information from the given 'sentence' element. Each - 'sentence' element contains a "text" and "annotationSet" sub - elements.""" - info = self._load_xml_attributes(AttrDict(), elt) - info["_type"] = "fulltext_sentence" - info["annotationSet"] = [] - info["targets"] = [] - target_spans = set() - info["_ascii"] = types.MethodType( - _annotation_ascii, info - ) # attach a method for this instance - info["text"] = "" - - for sub in elt: - if sub.tag.endswith("text"): - info["text"] = self._strip_tags(sub.text) - elif sub.tag.endswith("annotationSet"): - a = self._handle_fulltextannotationset_elt( - sub, is_pos=(len(info["annotationSet"]) == 0) - ) - if "cxnID" in a: # ignoring construction annotations for now - continue - a.sent = info - a.text = info.text - info["annotationSet"].append(a) - if "Target" in a: - for tspan in a.Target: - if tspan in target_spans: - self._warn( - 'Duplicate target span "{}"'.format( - info.text[slice(*tspan)] - ), - tspan, - "in sentence", - info["ID"], - info.text, - ) - # this can happen in cases like "chemical and biological weapons" - # being annotated as "chemical weapons" and "biological weapons" - else: - target_spans.add(tspan) - info["targets"].append((a.Target, a.luName, a.frameName)) - - assert info["annotationSet"][0].status == "UNANN" - info["POS"] = info["annotationSet"][0].POS - info["POS_tagset"] = info["annotationSet"][0].POS_tagset - return info - - def _handle_fulltextannotationset_elt(self, elt, is_pos=False): - """Load information from the given 'annotationSet' element. Each - 'annotationSet' contains several "layer" elements.""" - - info = self._handle_luannotationset_elt(elt, is_pos=is_pos) - if not is_pos: - info["_type"] = "fulltext_annotationset" - if "cxnID" not in info: # ignoring construction annotations for now - info["LU"] = self.lu( - info.luID, - luName=info.luName, - frameID=info.frameID, - frameName=info.frameName, - ) - info["frame"] = info.LU.frame - return info - - def _handle_fulltextlayer_elt(self, elt): - """Load information from the given 'layer' element. 
Each - 'layer' contains several "label" elements.""" - info = self._load_xml_attributes(AttrDict(), elt) - info["_type"] = "layer" - info["label"] = [] - - for sub in elt: - if sub.tag.endswith("label"): - l = self._load_xml_attributes(AttrDict(), sub) - info["label"].append(l) - - return info - - def _handle_framelexunit_elt(self, elt): - """Load the lexical unit info from an xml element in a frame's xml file.""" - luinfo = AttrDict() - luinfo["_type"] = "lu" - luinfo = self._load_xml_attributes(luinfo, elt) - luinfo["definition"] = "" - luinfo["definitionMarkup"] = "" - luinfo["sentenceCount"] = PrettyDict() - luinfo["lexemes"] = PrettyList() # multiword LUs have multiple lexemes - luinfo["semTypes"] = PrettyList() # an LU can have multiple semtypes - - for sub in elt: - if sub.tag.endswith("definition"): - luinfo["definitionMarkup"] = sub.text - luinfo["definition"] = self._strip_tags(sub.text) - elif sub.tag.endswith("sentenceCount"): - luinfo["sentenceCount"] = self._load_xml_attributes(PrettyDict(), sub) - elif sub.tag.endswith("lexeme"): - lexemeinfo = self._load_xml_attributes(PrettyDict(), sub) - if not isinstance(lexemeinfo.name, str): - # some lexeme names are ints by default: e.g., - # thousand.num has lexeme with name="1000" - lexemeinfo.name = str(lexemeinfo.name) - luinfo["lexemes"].append(lexemeinfo) - elif sub.tag.endswith("semType"): - semtypeinfo = self._load_xml_attributes(PrettyDict(), sub) - luinfo["semTypes"].append(self.semtype(semtypeinfo.ID)) - - # sort lexemes by 'order' attribute - # otherwise, e.g., 'write down.v' may have lexemes in wrong order - luinfo["lexemes"].sort(key=lambda x: x.order) - - return luinfo - - def _handle_lexunit_elt(self, elt, ignorekeys): - """ - Load full info for a lexical unit from its xml file. - This should only be called when accessing corpus annotations - (which are not included in frame files). 
- """ - luinfo = self._load_xml_attributes(AttrDict(), elt) - luinfo["_type"] = "lu" - luinfo["definition"] = "" - luinfo["definitionMarkup"] = "" - luinfo["subCorpus"] = PrettyList() - luinfo["lexemes"] = PrettyList() # multiword LUs have multiple lexemes - luinfo["semTypes"] = PrettyList() # an LU can have multiple semtypes - for k in ignorekeys: - if k in luinfo: - del luinfo[k] - - for sub in elt: - if sub.tag.endswith("header"): - continue # not used - elif sub.tag.endswith("valences"): - continue # not used - elif sub.tag.endswith("definition") and "definition" not in ignorekeys: - luinfo["definitionMarkup"] = sub.text - luinfo["definition"] = self._strip_tags(sub.text) - elif sub.tag.endswith("subCorpus") and "subCorpus" not in ignorekeys: - sc = self._handle_lusubcorpus_elt(sub) - if sc is not None: - luinfo["subCorpus"].append(sc) - elif sub.tag.endswith("lexeme") and "lexeme" not in ignorekeys: - luinfo["lexemes"].append(self._load_xml_attributes(PrettyDict(), sub)) - elif sub.tag.endswith("semType") and "semType" not in ignorekeys: - semtypeinfo = self._load_xml_attributes(AttrDict(), sub) - luinfo["semTypes"].append(self.semtype(semtypeinfo.ID)) - - return luinfo - - def _handle_lusubcorpus_elt(self, elt): - """Load a subcorpus of a lexical unit from the given xml.""" - sc = AttrDict() - try: - sc["name"] = elt.get("name") - except AttributeError: - return None - sc["_type"] = "lusubcorpus" - sc["sentence"] = [] - - for sub in elt: - if sub.tag.endswith("sentence"): - s = self._handle_lusentence_elt(sub) - if s is not None: - sc["sentence"].append(s) - - return sc - - def _handle_lusentence_elt(self, elt): - """Load a sentence from a subcorpus of an LU from xml.""" - info = self._load_xml_attributes(AttrDict(), elt) - info["_type"] = "lusentence" - info["annotationSet"] = [] - info["_ascii"] = types.MethodType( - _annotation_ascii, info - ) # attach a method for this instance - for sub in elt: - if sub.tag.endswith("text"): - info["text"] = self._strip_tags(sub.text) - elif sub.tag.endswith("annotationSet"): - annset = self._handle_luannotationset_elt( - sub, is_pos=(len(info["annotationSet"]) == 0) - ) - if annset is not None: - assert annset.status == "UNANN" or "FE" in annset, annset - if annset.status != "UNANN": - info["frameAnnotation"] = annset - # copy layer info up to current level - for k in ( - "Target", - "FE", - "FE2", - "FE3", - "GF", - "PT", - "POS", - "POS_tagset", - "Other", - "Sent", - "Verb", - "Noun", - "Adj", - "Adv", - "Prep", - "Scon", - "Art", - ): - if k in annset: - info[k] = annset[k] - info["annotationSet"].append(annset) - annset["sent"] = info - annset["text"] = info.text - return info - - def _handle_luannotationset_elt(self, elt, is_pos=False): - """Load an annotation set from a sentence in an subcorpus of an LU""" - info = self._load_xml_attributes(AttrDict(), elt) - info["_type"] = "posannotationset" if is_pos else "luannotationset" - info["layer"] = [] - info["_ascii"] = types.MethodType( - _annotation_ascii, info - ) # attach a method for this instance - - if "cxnID" in info: # ignoring construction annotations for now. 
- return info - - for sub in elt: - if sub.tag.endswith("layer"): - l = self._handle_lulayer_elt(sub) - if l is not None: - overt = [] - ni = {} # null instantiations - - info["layer"].append(l) - for lbl in l.label: - if "start" in lbl: - thespan = (lbl.start, lbl.end + 1, lbl.name) - if l.name not in ( - "Sent", - "Other", - ): # 'Sent' and 'Other' layers sometimes contain accidental duplicate spans - assert thespan not in overt, (info.ID, l.name, thespan) - overt.append(thespan) - else: # null instantiation - if lbl.name in ni: - self._warn( - "FE with multiple NI entries:", - lbl.name, - ni[lbl.name], - lbl.itype, - ) - else: - ni[lbl.name] = lbl.itype - overt = sorted(overt) - - if l.name == "Target": - if not overt: - self._warn( - "Skipping empty Target layer in annotation set ID={}".format( - info.ID - ) - ) - continue - assert all(lblname == "Target" for i, j, lblname in overt) - if "Target" in info: - self._warn( - "Annotation set {} has multiple Target layers".format( - info.ID - ) - ) - else: - info["Target"] = [(i, j) for (i, j, _) in overt] - elif l.name == "FE": - if l.rank == 1: - assert "FE" not in info - info["FE"] = (overt, ni) - # assert False,info - else: - # sometimes there are 3 FE layers! e.g. Change_position_on_a_scale.fall.v - assert 2 <= l.rank <= 3, l.rank - k = "FE" + str(l.rank) - assert k not in info - info[k] = (overt, ni) - elif l.name in ("GF", "PT"): - assert l.rank == 1 - info[l.name] = overt - elif l.name in ("BNC", "PENN"): - assert l.rank == 1 - info["POS"] = overt - info["POS_tagset"] = l.name - else: - if is_pos: - if l.name not in ("NER", "WSL"): - self._warn( - "Unexpected layer in sentence annotationset:", - l.name, - ) - else: - if l.name not in ( - "Sent", - "Verb", - "Noun", - "Adj", - "Adv", - "Prep", - "Scon", - "Art", - "Other", - ): - self._warn( - "Unexpected layer in frame annotationset:", l.name - ) - info[l.name] = overt - if not is_pos and "cxnID" not in info: - if "Target" not in info: - self._warn(f"Missing target in annotation set ID={info.ID}") - assert "FE" in info - if "FE3" in info: - assert "FE2" in info - - return info - - def _handle_lulayer_elt(self, elt): - """Load a layer from an annotation set""" - layer = self._load_xml_attributes(AttrDict(), elt) - layer["_type"] = "lulayer" - layer["label"] = [] - - for sub in elt: - if sub.tag.endswith("label"): - l = self._load_xml_attributes(AttrDict(), sub) - if l is not None: - layer["label"].append(l) - return layer - - def _handle_fe_elt(self, elt): - feinfo = self._load_xml_attributes(AttrDict(), elt) - feinfo["_type"] = "fe" - feinfo["definition"] = "" - feinfo["definitionMarkup"] = "" - feinfo["semType"] = None - feinfo["requiresFE"] = None - feinfo["excludesFE"] = None - for sub in elt: - if sub.tag.endswith("definition"): - feinfo["definitionMarkup"] = sub.text - feinfo["definition"] = self._strip_tags(sub.text) - elif sub.tag.endswith("semType"): - stinfo = self._load_xml_attributes(AttrDict(), sub) - feinfo["semType"] = self.semtype(stinfo.ID) - elif sub.tag.endswith("requiresFE"): - feinfo["requiresFE"] = self._load_xml_attributes(AttrDict(), sub) - elif sub.tag.endswith("excludesFE"): - feinfo["excludesFE"] = self._load_xml_attributes(AttrDict(), sub) - - return feinfo - - def _handle_semtype_elt(self, elt, tagspec=None): - semt = self._load_xml_attributes(AttrDict(), elt) - semt["_type"] = "semtype" - semt["superType"] = None - semt["subTypes"] = PrettyList() - for sub in elt: - if sub.text is not None: - semt["definitionMarkup"] = sub.text - semt["definition"] = 
self._strip_tags(sub.text) - else: - supertypeinfo = self._load_xml_attributes(AttrDict(), sub) - semt["superType"] = supertypeinfo - # the supertype may not have been loaded yet - - return semt - - -# -# Demo -# -def demo(): - from nltk.corpus import framenet as fn - - # - # It is not necessary to explicitly build the indexes by calling - # buildindexes(). We do this here just for demo purposes. If the - # indexes are not built explicitly, they will be built as needed. - # - print("Building the indexes...") - fn.buildindexes() - - # - # Get some statistics about the corpus - # - print("Number of Frames:", len(fn.frames())) - print("Number of Lexical Units:", len(fn.lus())) - print("Number of annotated documents:", len(fn.docs())) - print() - - # - # Frames - # - print( - 'getting frames whose name matches the (case insensitive) regex: "(?i)medical"' - ) - medframes = fn.frames(r"(?i)medical") - print(f'Found {len(medframes)} Frames whose name matches "(?i)medical":') - print([(f.name, f.ID) for f in medframes]) - - # - # store the first frame in the list of frames - # - tmp_id = medframes[0].ID - m_frame = fn.frame(tmp_id) # reads all info for the frame - - # - # get the frame relations - # - print( - '\nNumber of frame relations for the "{}" ({}) frame:'.format( - m_frame.name, m_frame.ID - ), - len(m_frame.frameRelations), - ) - for fr in m_frame.frameRelations: - print(" ", fr) - - # - # get the names of the Frame Elements - # - print( - f'\nNumber of Frame Elements in the "{m_frame.name}" frame:', - len(m_frame.FE), - ) - print(" ", [x for x in m_frame.FE]) - - # - # get the names of the "Core" Frame Elements - # - print(f'\nThe "core" Frame Elements in the "{m_frame.name}" frame:') - print(" ", [x.name for x in m_frame.FE.values() if x.coreType == "Core"]) - - # - # get all of the Lexical Units that are incorporated in the - # 'Ailment' FE of the 'Medical_conditions' frame (id=239) - # - print('\nAll Lexical Units that are incorporated in the "Ailment" FE:') - m_frame = fn.frame(239) - ailment_lus = [ - x - for x in m_frame.lexUnit.values() - if "incorporatedFE" in x and x.incorporatedFE == "Ailment" - ] - print(" ", [x.name for x in ailment_lus]) - - # - # get all of the Lexical Units for the frame - # - print( - f'\nNumber of Lexical Units in the "{m_frame.name}" frame:', - len(m_frame.lexUnit), - ) - print(" ", [x.name for x in m_frame.lexUnit.values()][:5], "...") - - # - # get basic info on the second LU in the frame - # - tmp_id = m_frame.lexUnit["ailment.n"].ID # grab the id of the specified LU - luinfo = fn.lu_basic(tmp_id) # get basic info on the LU - print(f"\nInformation on the LU: {luinfo.name}") - pprint(luinfo) - - # - # Get a list of all of the corpora used for fulltext annotation - # - print("\nNames of all of the corpora used for fulltext annotation:") - allcorpora = {x.corpname for x in fn.docs_metadata()} - pprint(list(allcorpora)) - - # - # Get the names of the annotated documents in the first corpus - # - firstcorp = list(allcorpora)[0] - firstcorp_docs = fn.docs(firstcorp) - print(f'\nNames of the annotated documents in the "{firstcorp}" corpus:') - pprint([x.filename for x in firstcorp_docs]) - - # - # Search for frames containing LUs whose name attribute matches a - # regexp pattern. - # - # Note: if you were going to be doing a lot of this type of - # searching, you'd want to build an index that maps from - # lemmas to frames because each time frames_by_lemma() is - # called, it has to search through ALL of the frame XML files - # in the db. 
- print( - '\nSearching for all Frames that have a lemma that matches the regexp: "^run.v$":' - ) - pprint(fn.frames_by_lemma(r"^run.v$")) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/corpus/reader/ieer.py b/pipeline/nltk/corpus/reader/ieer.py deleted file mode 100644 index 24f83cfaebcf9a583a33806136f8788b112aaf95..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/ieer.py +++ /dev/null @@ -1,116 +0,0 @@ -# Natural Language Toolkit: IEER Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# Edward Loper -# URL: -# For license information, see LICENSE.TXT - -""" -Corpus reader for the Information Extraction and Entity Recognition Corpus. - -NIST 1999 Information Extraction: Entity Recognition Evaluation -https://www.itl.nist.gov/iad/894.01/tests/ie-er/er_99/er_99.htm - -This corpus contains the NEWSWIRE development test data for the -NIST 1999 IE-ER Evaluation. The files were taken from the -subdirectory: ``/ie_er_99/english/devtest/newswire/*.ref.nwt`` -and filenames were shortened. - -The corpus contains the following files: APW_19980314, APW_19980424, -APW_19980429, NYT_19980315, NYT_19980403, and NYT_19980407. -""" - -import nltk -from nltk.corpus.reader.api import * - -#: A dictionary whose keys are the names of documents in this corpus; -#: and whose values are descriptions of those documents' contents. -titles = { - "APW_19980314": "Associated Press Weekly, 14 March 1998", - "APW_19980424": "Associated Press Weekly, 24 April 1998", - "APW_19980429": "Associated Press Weekly, 29 April 1998", - "NYT_19980315": "New York Times, 15 March 1998", - "NYT_19980403": "New York Times, 3 April 1998", - "NYT_19980407": "New York Times, 7 April 1998", -} - -#: A list of all documents in this corpus. -documents = sorted(titles) - - -class IEERDocument: - def __init__(self, text, docno=None, doctype=None, date_time=None, headline=""): - self.text = text - self.docno = docno - self.doctype = doctype - self.date_time = date_time - self.headline = headline - - def __repr__(self): - if self.headline: - headline = " ".join(self.headline.leaves()) - else: - headline = ( - " ".join([w for w in self.text.leaves() if w[:1] != "<"][:12]) + "..." - ) - if self.docno is not None: - return f"" - else: - return "" % headline - - -class IEERCorpusReader(CorpusReader): - """ """ - - def docs(self, fileids=None): - return concat( - [ - StreamBackedCorpusView(fileid, self._read_block, encoding=enc) - for (fileid, enc) in self.abspaths(fileids, True) - ] - ) - - def parsed_docs(self, fileids=None): - return concat( - [ - StreamBackedCorpusView(fileid, self._read_parsed_block, encoding=enc) - for (fileid, enc) in self.abspaths(fileids, True) - ] - ) - - def _read_parsed_block(self, stream): - # TODO: figure out while empty documents are being returned - return [ - self._parse(doc) - for doc in self._read_block(stream) - if self._parse(doc).docno is not None - ] - - def _parse(self, doc): - val = nltk.chunk.ieerstr2tree(doc, root_label="DOCUMENT") - if isinstance(val, dict): - return IEERDocument(**val) - else: - return IEERDocument(val) - - def _read_block(self, stream): - out = [] - # Skip any preamble. 
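A minimal sketch of the IEER reader being removed here, assuming the stock NLTK 'ieer' corpus package has been downloaded (e.g. via nltk.download('ieer')); document IDs come from the module's own `documents` list.

from nltk.corpus import ieer

print(ieer.fileids())                           # APW_19980314 ... NYT_19980407
for doc in ieer.parsed_docs('NYT_19980315'):    # IEERDocument objects with chunked .text trees
    print(doc.docno, doc.doctype)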
- while True: - line = stream.readline() - if not line: - break - if line.strip() == "": - break - out.append(line) - # Read the document - while True: - line = stream.readline() - if not line: - break - out.append(line) - if line.strip() == "": - break - # Return the document - return ["\n".join(out)] diff --git a/pipeline/nltk/corpus/reader/indian.py b/pipeline/nltk/corpus/reader/indian.py deleted file mode 100644 index 23c6434c34b38dcb4e0227851afb2aefde2fd090..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/indian.py +++ /dev/null @@ -1,93 +0,0 @@ -# Natural Language Toolkit: Indian Language POS-Tagged Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# Edward Loper -# URL: -# For license information, see LICENSE.TXT - -""" -Indian Language POS-Tagged Corpus -Collected by A Kumaran, Microsoft Research, India -Distributed with permission - -Contents: - - Bangla: IIT Kharagpur - - Hindi: Microsoft Research India - - Marathi: IIT Bombay - - Telugu: IIIT Hyderabad -""" - -from nltk.corpus.reader.api import * -from nltk.corpus.reader.util import * -from nltk.tag import map_tag, str2tuple - - -class IndianCorpusReader(CorpusReader): - """ - List of words, one per line. Blank lines are ignored. - """ - - def words(self, fileids=None): - return concat( - [ - IndianCorpusView(fileid, enc, False, False) - for (fileid, enc) in self.abspaths(fileids, True) - ] - ) - - def tagged_words(self, fileids=None, tagset=None): - if tagset and tagset != self._tagset: - tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t) - else: - tag_mapping_function = None - return concat( - [ - IndianCorpusView(fileid, enc, True, False, tag_mapping_function) - for (fileid, enc) in self.abspaths(fileids, True) - ] - ) - - def sents(self, fileids=None): - return concat( - [ - IndianCorpusView(fileid, enc, False, True) - for (fileid, enc) in self.abspaths(fileids, True) - ] - ) - - def tagged_sents(self, fileids=None, tagset=None): - if tagset and tagset != self._tagset: - tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t) - else: - tag_mapping_function = None - return concat( - [ - IndianCorpusView(fileid, enc, True, True, tag_mapping_function) - for (fileid, enc) in self.abspaths(fileids, True) - ] - ) - - -class IndianCorpusView(StreamBackedCorpusView): - def __init__( - self, corpus_file, encoding, tagged, group_by_sent, tag_mapping_function=None - ): - self._tagged = tagged - self._group_by_sent = group_by_sent - self._tag_mapping_function = tag_mapping_function - StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding) - - def read_block(self, stream): - line = stream.readline() - if line.startswith("<"): - return [] - sent = [str2tuple(word, sep="_") for word in line.split()] - if self._tag_mapping_function: - sent = [(w, self._tag_mapping_function(t)) for (w, t) in sent] - if not self._tagged: - sent = [w for (w, t) in sent] - if self._group_by_sent: - return [sent] - else: - return sent diff --git a/pipeline/nltk/corpus/reader/ipipan.py b/pipeline/nltk/corpus/reader/ipipan.py deleted file mode 100644 index d2d16c90f4edf380658af969a0488c28d5f1b24a..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/ipipan.py +++ /dev/null @@ -1,356 +0,0 @@ -# Natural Language Toolkit: IPI PAN Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Konrad Goluchowski -# URL: -# For license information, see LICENSE.TXT - -import functools - -from nltk.corpus.reader.api import CorpusReader -from 
nltk.corpus.reader.util import StreamBackedCorpusView, concat - - -def _parse_args(fun): - @functools.wraps(fun) - def decorator(self, fileids=None, **kwargs): - kwargs.pop("tags", None) - if not fileids: - fileids = self.fileids() - return fun(self, fileids, **kwargs) - - return decorator - - -class IPIPANCorpusReader(CorpusReader): - """ - Corpus reader designed to work with corpus created by IPI PAN. - See http://korpus.pl/en/ for more details about IPI PAN corpus. - - The corpus includes information about text domain, channel and categories. - You can access possible values using ``domains()``, ``channels()`` and - ``categories()``. You can use also this metadata to filter files, e.g.: - ``fileids(channel='prasa')``, ``fileids(categories='publicystyczny')``. - - The reader supports methods: words, sents, paras and their tagged versions. - You can get part of speech instead of full tag by giving "simplify_tags=True" - parameter, e.g.: ``tagged_sents(simplify_tags=True)``. - - Also you can get all tags disambiguated tags specifying parameter - "one_tag=False", e.g.: ``tagged_paras(one_tag=False)``. - - You can get all tags that were assigned by a morphological analyzer specifying - parameter "disamb_only=False", e.g. ``tagged_words(disamb_only=False)``. - - The IPIPAN Corpus contains tags indicating if there is a space between two - tokens. To add special "no space" markers, you should specify parameter - "append_no_space=True", e.g. ``tagged_words(append_no_space=True)``. - As a result in place where there should be no space between two tokens new - pair ('', 'no-space') will be inserted (for tagged data) and just '' for - methods without tags. - - The corpus reader can also try to append spaces between words. To enable this - option, specify parameter "append_space=True", e.g. ``words(append_space=True)``. - As a result either ' ' or (' ', 'space') will be inserted between tokens. - - By default, xml entities like " and & are replaced by corresponding - characters. You can turn off this feature, specifying parameter - "replace_xmlentities=False", e.g. ``words(replace_xmlentities=False)``. 
- """ - - def __init__(self, root, fileids): - CorpusReader.__init__(self, root, fileids, None, None) - - def channels(self, fileids=None): - if not fileids: - fileids = self.fileids() - return self._parse_header(fileids, "channel") - - def domains(self, fileids=None): - if not fileids: - fileids = self.fileids() - return self._parse_header(fileids, "domain") - - def categories(self, fileids=None): - if not fileids: - fileids = self.fileids() - return [ - self._map_category(cat) for cat in self._parse_header(fileids, "keyTerm") - ] - - def fileids(self, channels=None, domains=None, categories=None): - if channels is not None and domains is not None and categories is not None: - raise ValueError( - "You can specify only one of channels, domains " - "and categories parameter at once" - ) - if channels is None and domains is None and categories is None: - return CorpusReader.fileids(self) - if isinstance(channels, str): - channels = [channels] - if isinstance(domains, str): - domains = [domains] - if isinstance(categories, str): - categories = [categories] - if channels: - return self._list_morph_files_by("channel", channels) - elif domains: - return self._list_morph_files_by("domain", domains) - else: - return self._list_morph_files_by( - "keyTerm", categories, map=self._map_category - ) - - @_parse_args - def sents(self, fileids=None, **kwargs): - return concat( - [ - self._view( - fileid, mode=IPIPANCorpusView.SENTS_MODE, tags=False, **kwargs - ) - for fileid in self._list_morph_files(fileids) - ] - ) - - @_parse_args - def paras(self, fileids=None, **kwargs): - return concat( - [ - self._view( - fileid, mode=IPIPANCorpusView.PARAS_MODE, tags=False, **kwargs - ) - for fileid in self._list_morph_files(fileids) - ] - ) - - @_parse_args - def words(self, fileids=None, **kwargs): - return concat( - [ - self._view(fileid, tags=False, **kwargs) - for fileid in self._list_morph_files(fileids) - ] - ) - - @_parse_args - def tagged_sents(self, fileids=None, **kwargs): - return concat( - [ - self._view(fileid, mode=IPIPANCorpusView.SENTS_MODE, **kwargs) - for fileid in self._list_morph_files(fileids) - ] - ) - - @_parse_args - def tagged_paras(self, fileids=None, **kwargs): - return concat( - [ - self._view(fileid, mode=IPIPANCorpusView.PARAS_MODE, **kwargs) - for fileid in self._list_morph_files(fileids) - ] - ) - - @_parse_args - def tagged_words(self, fileids=None, **kwargs): - return concat( - [self._view(fileid, **kwargs) for fileid in self._list_morph_files(fileids)] - ) - - def _list_morph_files(self, fileids): - return [f for f in self.abspaths(fileids)] - - def _list_header_files(self, fileids): - return [ - f.replace("morph.xml", "header.xml") - for f in self._list_morph_files(fileids) - ] - - def _parse_header(self, fileids, tag): - values = set() - for f in self._list_header_files(fileids): - values_list = self._get_tag(f, tag) - for v in values_list: - values.add(v) - return list(values) - - def _list_morph_files_by(self, tag, values, map=None): - fileids = self.fileids() - ret_fileids = set() - for f in fileids: - fp = self.abspath(f).replace("morph.xml", "header.xml") - values_list = self._get_tag(fp, tag) - for value in values_list: - if map is not None: - value = map(value) - if value in values: - ret_fileids.add(f) - return list(ret_fileids) - - def _get_tag(self, f, tag): - tags = [] - with open(f) as infile: - header = infile.read() - tag_end = 0 - while True: - tag_pos = header.find("<" + tag, tag_end) - if tag_pos < 0: - return tags - tag_end = header.find("", tag_pos) - 
tags.append(header[tag_pos + len(tag) + 2 : tag_end]) - - def _map_category(self, cat): - pos = cat.find(">") - if pos == -1: - return cat - else: - return cat[pos + 1 :] - - def _view(self, filename, **kwargs): - tags = kwargs.pop("tags", True) - mode = kwargs.pop("mode", 0) - simplify_tags = kwargs.pop("simplify_tags", False) - one_tag = kwargs.pop("one_tag", True) - disamb_only = kwargs.pop("disamb_only", True) - append_no_space = kwargs.pop("append_no_space", False) - append_space = kwargs.pop("append_space", False) - replace_xmlentities = kwargs.pop("replace_xmlentities", True) - - if len(kwargs) > 0: - raise ValueError("Unexpected arguments: %s" % kwargs.keys()) - if not one_tag and not disamb_only: - raise ValueError( - "You cannot specify both one_tag=False and " "disamb_only=False" - ) - if not tags and (simplify_tags or not one_tag or not disamb_only): - raise ValueError( - "You cannot specify simplify_tags, one_tag or " - "disamb_only with functions other than tagged_*" - ) - - return IPIPANCorpusView( - filename, - tags=tags, - mode=mode, - simplify_tags=simplify_tags, - one_tag=one_tag, - disamb_only=disamb_only, - append_no_space=append_no_space, - append_space=append_space, - replace_xmlentities=replace_xmlentities, - ) - - -class IPIPANCorpusView(StreamBackedCorpusView): - - WORDS_MODE = 0 - SENTS_MODE = 1 - PARAS_MODE = 2 - - def __init__(self, filename, startpos=0, **kwargs): - StreamBackedCorpusView.__init__(self, filename, None, startpos, None) - self.in_sentence = False - self.position = 0 - - self.show_tags = kwargs.pop("tags", True) - self.disamb_only = kwargs.pop("disamb_only", True) - self.mode = kwargs.pop("mode", IPIPANCorpusView.WORDS_MODE) - self.simplify_tags = kwargs.pop("simplify_tags", False) - self.one_tag = kwargs.pop("one_tag", True) - self.append_no_space = kwargs.pop("append_no_space", False) - self.append_space = kwargs.pop("append_space", False) - self.replace_xmlentities = kwargs.pop("replace_xmlentities", True) - - def read_block(self, stream): - sentence = [] - sentences = [] - space = False - no_space = False - - tags = set() - - lines = self._read_data(stream) - - while True: - - # we may have only part of last line - if len(lines) <= 1: - self._seek(stream) - lines = self._read_data(stream) - - if lines == [""]: - assert not sentences - return [] - - line = lines.pop() - self.position += len(line) + 1 - - if line.startswith('"): - if self.append_space: - no_space = True - if self.append_no_space: - if self.show_tags: - sentence.append(("", "no-space")) - else: - sentence.append("") - elif line.startswith(" -# URL: -# For license information, see LICENSE.TXT - -# For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html - -import re - -from nltk.corpus.reader.api import CorpusReader, SyntaxCorpusReader -from nltk.corpus.reader.util import ( - FileSystemPathPointer, - find_corpus_fileids, - read_blankline_block, -) -from nltk.parse import DependencyGraph - -# default function to convert morphlist to str for tree representation -_morphs2str_default = lambda morphs: "/".join(m[0] for m in morphs if m[0] != "EOS") - - -class KNBCorpusReader(SyntaxCorpusReader): - """ - This class implements: - - ``__init__``, which specifies the location of the corpus - and a method for detecting the sentence blocks in corpus files. - - ``_read_block``, which reads a block from the input stream. - - ``_word``, which takes a block and returns a list of list of words. - - ``_tag``, which takes a block and returns a list of list of tagged - words. 
- - ``_parse``, which takes a block and returns a list of parsed - sentences. - - The structure of tagged words: - tagged_word = (word(str), tags(tuple)) - tags = (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3, others ...) - - Usage example - - >>> from nltk.corpus.util import LazyCorpusLoader - >>> knbc = LazyCorpusLoader( - ... 'knbc/corpus1', - ... KNBCorpusReader, - ... r'.*/KN.*', - ... encoding='euc-jp', - ... ) - - >>> len(knbc.sents()[0]) - 9 - - """ - - def __init__(self, root, fileids, encoding="utf8", morphs2str=_morphs2str_default): - """ - Initialize KNBCorpusReader - morphs2str is a function to convert morphlist to str for tree representation - for _parse() - """ - SyntaxCorpusReader.__init__(self, root, fileids, encoding) - self.morphs2str = morphs2str - - def _read_block(self, stream): - # blocks are split by blankline (or EOF) - default - return read_blankline_block(stream) - - def _word(self, t): - res = [] - for line in t.splitlines(): - # ignore the Bunsets headers - if not re.match(r"EOS|\*|\#|\+", line): - cells = line.strip().split(" ") - res.append(cells[0]) - - return res - - # ignores tagset argument - def _tag(self, t, tagset=None): - res = [] - for line in t.splitlines(): - # ignore the Bunsets headers - if not re.match(r"EOS|\*|\#|\+", line): - cells = line.strip().split(" ") - # convert cells to morph tuples - res.append((cells[0], " ".join(cells[1:]))) - - return res - - def _parse(self, t): - dg = DependencyGraph() - i = 0 - for line in t.splitlines(): - if line[0] in "*+": - # start of bunsetsu or tag - - cells = line.strip().split(" ", 3) - m = re.match(r"([\-0-9]*)([ADIP])", cells[1]) - - assert m is not None - - node = dg.nodes[i] - node.update({"address": i, "rel": m.group(2), "word": []}) - - dep_parent = int(m.group(1)) - - if dep_parent == -1: - dg.root = node - else: - dg.nodes[dep_parent]["deps"].append(i) - - i += 1 - elif line[0] != "#": - # normal morph - cells = line.strip().split(" ") - # convert cells to morph tuples - morph = cells[0], " ".join(cells[1:]) - dg.nodes[i - 1]["word"].append(morph) - - if self.morphs2str: - for node in dg.nodes.values(): - node["word"] = self.morphs2str(node["word"]) - - return dg.tree() - - -###################################################################### -# Demo -###################################################################### - - -def demo(): - - import nltk - from nltk.corpus.util import LazyCorpusLoader - - root = nltk.data.find("corpora/knbc/corpus1") - fileids = [ - f - for f in find_corpus_fileids(FileSystemPathPointer(root), ".*") - if re.search(r"\d\-\d\-[\d]+\-[\d]+", f) - ] - - def _knbc_fileids_sort(x): - cells = x.split("-") - return (cells[0], int(cells[1]), int(cells[2]), int(cells[3])) - - knbc = LazyCorpusLoader( - "knbc/corpus1", - KNBCorpusReader, - sorted(fileids, key=_knbc_fileids_sort), - encoding="euc-jp", - ) - - print(knbc.fileids()[:10]) - print("".join(knbc.words()[:100])) - - print("\n\n".join(str(tree) for tree in knbc.parsed_sents()[:2])) - - knbc.morphs2str = lambda morphs: "/".join( - "{}({})".format(m[0], m[1].split(" ")[2]) for m in morphs if m[0] != "EOS" - ).encode("utf-8") - - print("\n\n".join("%s" % tree for tree in knbc.parsed_sents()[:2])) - - print( - "\n".join( - " ".join("{}/{}".format(w[0], w[1].split(" ")[2]) for w in sent) - for sent in knbc.tagged_sents()[0:2] - ) - ) - - -def test(): - - from nltk.corpus.util import LazyCorpusLoader - - knbc = LazyCorpusLoader( - "knbc/corpus1", KNBCorpusReader, r".*/KN.*", encoding="euc-jp" - ) - 
assert isinstance(knbc.words()[0], str) - assert isinstance(knbc.sents()[0][0], str) - assert isinstance(knbc.tagged_words()[0], tuple) - assert isinstance(knbc.tagged_sents()[0][0], tuple) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/corpus/reader/lin.py b/pipeline/nltk/corpus/reader/lin.py deleted file mode 100644 index 15c20a6803c0c83557cd2f4689cddecfdd2d83da..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/lin.py +++ /dev/null @@ -1,183 +0,0 @@ -# Natural Language Toolkit: Lin's Thesaurus -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Dan Blanchard -# URL: -# For license information, see LICENSE.txt - -import re -from collections import defaultdict -from functools import reduce - -from nltk.corpus.reader import CorpusReader - - -class LinThesaurusCorpusReader(CorpusReader): - """Wrapper for the LISP-formatted thesauruses distributed by Dekang Lin.""" - - # Compiled regular expression for extracting the key from the first line of each - # thesaurus entry - _key_re = re.compile(r'\("?([^"]+)"? \(desc [0-9.]+\).+') - - @staticmethod - def __defaultdict_factory(): - """Factory for creating defaultdict of defaultdict(dict)s""" - return defaultdict(dict) - - def __init__(self, root, badscore=0.0): - """ - Initialize the thesaurus. - - :param root: root directory containing thesaurus LISP files - :type root: C{string} - :param badscore: the score to give to words which do not appear in each other's sets of synonyms - :type badscore: C{float} - """ - - super().__init__(root, r"sim[A-Z]\.lsp") - self._thesaurus = defaultdict(LinThesaurusCorpusReader.__defaultdict_factory) - self._badscore = badscore - for path, encoding, fileid in self.abspaths( - include_encoding=True, include_fileid=True - ): - with open(path) as lin_file: - first = True - for line in lin_file: - line = line.strip() - # Start of entry - if first: - key = LinThesaurusCorpusReader._key_re.sub(r"\1", line) - first = False - # End of entry - elif line == "))": - first = True - # Lines with pairs of ngrams and scores - else: - split_line = line.split("\t") - if len(split_line) == 2: - ngram, score = split_line - self._thesaurus[fileid][key][ngram.strip('"')] = float( - score - ) - - def similarity(self, ngram1, ngram2, fileid=None): - """ - Returns the similarity score for two ngrams. - - :param ngram1: first ngram to compare - :type ngram1: C{string} - :param ngram2: second ngram to compare - :type ngram2: C{string} - :param fileid: thesaurus fileid to search in. If None, search all fileids. - :type fileid: C{string} - :return: If fileid is specified, just the score for the two ngrams; otherwise, - list of tuples of fileids and scores. - """ - # Entries don't contain themselves, so make sure similarity between item and itself is 1.0 - if ngram1 == ngram2: - if fileid: - return 1.0 - else: - return [(fid, 1.0) for fid in self._fileids] - else: - if fileid: - return ( - self._thesaurus[fileid][ngram1][ngram2] - if ngram2 in self._thesaurus[fileid][ngram1] - else self._badscore - ) - else: - return [ - ( - fid, - ( - self._thesaurus[fid][ngram1][ngram2] - if ngram2 in self._thesaurus[fid][ngram1] - else self._badscore - ), - ) - for fid in self._fileids - ] - - def scored_synonyms(self, ngram, fileid=None): - """ - Returns a list of scored synonyms (tuples of synonyms and scores) for the current ngram - - :param ngram: ngram to lookup - :type ngram: C{string} - :param fileid: thesaurus fileid to search in. If None, search all fileids. 
- :type fileid: C{string} - :return: If fileid is specified, list of tuples of scores and synonyms; otherwise, - list of tuples of fileids and lists, where inner lists consist of tuples of - scores and synonyms. - """ - if fileid: - return self._thesaurus[fileid][ngram].items() - else: - return [ - (fileid, self._thesaurus[fileid][ngram].items()) - for fileid in self._fileids - ] - - def synonyms(self, ngram, fileid=None): - """ - Returns a list of synonyms for the current ngram. - - :param ngram: ngram to lookup - :type ngram: C{string} - :param fileid: thesaurus fileid to search in. If None, search all fileids. - :type fileid: C{string} - :return: If fileid is specified, list of synonyms; otherwise, list of tuples of fileids and - lists, where inner lists contain synonyms. - """ - if fileid: - return self._thesaurus[fileid][ngram].keys() - else: - return [ - (fileid, self._thesaurus[fileid][ngram].keys()) - for fileid in self._fileids - ] - - def __contains__(self, ngram): - """ - Determines whether or not the given ngram is in the thesaurus. - - :param ngram: ngram to lookup - :type ngram: C{string} - :return: whether the given ngram is in the thesaurus. - """ - return reduce( - lambda accum, fileid: accum or (ngram in self._thesaurus[fileid]), - self._fileids, - False, - ) - - -###################################################################### -# Demo -###################################################################### - - -def demo(): - from nltk.corpus import lin_thesaurus as thes - - word1 = "business" - word2 = "enterprise" - print("Getting synonyms for " + word1) - print(thes.synonyms(word1)) - - print("Getting scored synonyms for " + word1) - print(thes.scored_synonyms(word1)) - - print("Getting synonyms from simN.lsp (noun subsection) for " + word1) - print(thes.synonyms(word1, fileid="simN.lsp")) - - print("Getting synonyms from simN.lsp (noun subsection) for " + word1) - print(thes.synonyms(word1, fileid="simN.lsp")) - - print(f"Similarity score for {word1} and {word2}:") - print(thes.similarity(word1, word2)) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/corpus/reader/markdown.py b/pipeline/nltk/corpus/reader/markdown.py deleted file mode 100644 index 8df4f924e25426dbe30ef2484f3a0cb4cb1a1740..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/markdown.py +++ /dev/null @@ -1,342 +0,0 @@ -from collections import namedtuple -from functools import partial, wraps - -from nltk.corpus.reader.api import CategorizedCorpusReader -from nltk.corpus.reader.plaintext import PlaintextCorpusReader -from nltk.corpus.reader.util import concat, read_blankline_block -from nltk.tokenize import blankline_tokenize, sent_tokenize, word_tokenize - - -def comma_separated_string_args(func): - """ - A decorator that allows a function to be called with - a single string of comma-separated values which become - individual function arguments. 
- """ - - @wraps(func) - def wrapper(*args, **kwargs): - _args = list() - for arg in args: - if isinstance(arg, str): - _args.append({part.strip() for part in arg.split(",")}) - elif isinstance(arg, list): - _args.append(set(arg)) - else: - _args.append(arg) - for name, value in kwargs.items(): - if isinstance(value, str): - kwargs[name] = {part.strip() for part in value.split(",")} - return func(*_args, **kwargs) - - return wrapper - - -def read_parse_blankline_block(stream, parser): - block = read_blankline_block(stream) - if block: - return [parser.render(block[0])] - return block - - -class MarkdownBlock: - def __init__(self, content): - self.content = content - self.truncate_at = 16 - - def __repr__(self): - return f"{self.__class__.__name__}(content={repr(str(self))})" - - def __str__(self): - return ( - f"{self.content[:self.truncate_at]}" - f"{'...' if len(self.content) > self.truncate_at else ''}" - ) - - @property - def raw(self): - return self.content - - @property - def words(self): - return word_tokenize(self.content) - - @property - def sents(self): - return [word_tokenize(sent) for sent in sent_tokenize(self.content)] - - @property - def paras(self): - return [ - [word_tokenize(sent) for sent in sent_tokenize(para)] - for para in blankline_tokenize(self.content) - ] - - -class CodeBlock(MarkdownBlock): - def __init__(self, language, *args): - self.language = language - super().__init__(*args) - - @property - def sents(self): - return [word_tokenize(line) for line in self.content.splitlines()] - - @property - def lines(self): - return self.content.splitlines() - - @property - def paras(self): - return [ - [word_tokenize(line) for line in para.splitlines()] - for para in blankline_tokenize(self.content) - ] - - -class MarkdownSection(MarkdownBlock): - def __init__(self, heading, level, *args): - self.heading = heading - self.level = level - super().__init__(*args) - - -Image = namedtuple("Image", "label, src, title") -Link = namedtuple("Link", "label, href, title") -List = namedtuple("List", "is_ordered, items") - - -class MarkdownCorpusReader(PlaintextCorpusReader): - def __init__(self, *args, parser=None, **kwargs): - from markdown_it import MarkdownIt - from mdit_plain.renderer import RendererPlain - from mdit_py_plugins.front_matter import front_matter_plugin - - self.parser = parser - if self.parser is None: - self.parser = MarkdownIt("commonmark", renderer_cls=RendererPlain) - self.parser.use(front_matter_plugin) - - kwargs.setdefault( - "para_block_reader", partial(read_parse_blankline_block, parser=self.parser) - ) - super().__init__(*args, **kwargs) - - # This override takes care of removing markup. - def _read_word_block(self, stream): - words = list() - for para in self._para_block_reader(stream): - words.extend(self._word_tokenizer.tokenize(para)) - return words - - -class CategorizedMarkdownCorpusReader(CategorizedCorpusReader, MarkdownCorpusReader): - """ - A reader for markdown corpora whose documents are divided into - categories based on their file identifiers. - - Based on nltk.corpus.reader.plaintext.CategorizedPlaintextCorpusReader: - https://www.nltk.org/_modules/nltk/corpus/reader/api.html#CategorizedCorpusReader - """ - - def __init__(self, *args, cat_field="tags", **kwargs): - """ - Initialize the corpus reader. Categorization arguments - (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to - the ``CategorizedCorpusReader`` constructor. The remaining arguments - are passed to the ``MarkdownCorpusReader`` constructor. 
- """ - cat_args = ["cat_pattern", "cat_map", "cat_file"] - if not any(arg in kwargs for arg in cat_args): - # Initialize with a blank map now, - # and try to build categories from document metadata later. - kwargs["cat_map"] = dict() - CategorizedCorpusReader.__init__(self, kwargs) - MarkdownCorpusReader.__init__(self, *args, **kwargs) - - # Map file IDs to categories if self._map exists but is still empty: - if self._map is not None and not self._map: - for file_id in self._fileids: - metadata = self.metadata(file_id) - if metadata: - self._map[file_id] = metadata[0].get(cat_field, []) - - ### Begin CategorizedCorpusReader Overrides - @comma_separated_string_args - def categories(self, fileids=None): - return super().categories(fileids) - - @comma_separated_string_args - def fileids(self, categories=None): - if categories is None: - return self._fileids - return super().fileids(categories) - - ### End CategorizedCorpusReader Overrides - - ### Begin MarkdownCorpusReader Overrides - @comma_separated_string_args - def raw(self, fileids=None, categories=None): - return super().raw(self._resolve(fileids, categories)) - - @comma_separated_string_args - def words(self, fileids=None, categories=None): - return super().words(self._resolve(fileids, categories)) - - @comma_separated_string_args - def sents(self, fileids=None, categories=None): - return super().sents(self._resolve(fileids, categories)) - - @comma_separated_string_args - def paras(self, fileids=None, categories=None): - return super().paras(self._resolve(fileids, categories)) - - ### End MarkdownCorpusReader Overrides - - def concatenated_view(self, reader, fileids, categories): - return concat( - [ - self.CorpusView(path, reader, encoding=enc) - for (path, enc) in self.abspaths( - self._resolve(fileids, categories), include_encoding=True - ) - ] - ) - - def metadata_reader(self, stream): - from yaml import safe_load - - return [ - safe_load(t.content) - for t in self.parser.parse(stream.read()) - if t.type == "front_matter" - ] - - @comma_separated_string_args - def metadata(self, fileids=None, categories=None): - return self.concatenated_view(self.metadata_reader, fileids, categories) - - def blockquote_reader(self, stream): - tokens = self.parser.parse(stream.read()) - opening_tokens = filter( - lambda t: t.level == 0 and t.type == "blockquote_open", tokens - ) - closing_tokens = filter( - lambda t: t.level == 0 and t.type == "blockquote_close", tokens - ) - blockquotes = list() - for o, c in zip(opening_tokens, closing_tokens): - opening_index = tokens.index(o) - closing_index = tokens.index(c, opening_index) - blockquotes.append(tokens[opening_index : closing_index + 1]) - return [ - MarkdownBlock( - self.parser.renderer.render(block, self.parser.options, env=None) - ) - for block in blockquotes - ] - - @comma_separated_string_args - def blockquotes(self, fileids=None, categories=None): - return self.concatenated_view(self.blockquote_reader, fileids, categories) - - def code_block_reader(self, stream): - return [ - CodeBlock( - t.info, - t.content, - ) - for t in self.parser.parse(stream.read()) - if t.level == 0 and t.type in ("fence", "code_block") - ] - - @comma_separated_string_args - def code_blocks(self, fileids=None, categories=None): - return self.concatenated_view(self.code_block_reader, fileids, categories) - - def image_reader(self, stream): - return [ - Image( - child_token.content, - child_token.attrGet("src"), - child_token.attrGet("title"), - ) - for inline_token in filter( - lambda t: t.type == "inline", 
self.parser.parse(stream.read()) - ) - for child_token in inline_token.children - if child_token.type == "image" - ] - - @comma_separated_string_args - def images(self, fileids=None, categories=None): - return self.concatenated_view(self.image_reader, fileids, categories) - - def link_reader(self, stream): - return [ - Link( - inline_token.children[i + 1].content, - child_token.attrGet("href"), - child_token.attrGet("title"), - ) - for inline_token in filter( - lambda t: t.type == "inline", self.parser.parse(stream.read()) - ) - for i, child_token in enumerate(inline_token.children) - if child_token.type == "link_open" - ] - - @comma_separated_string_args - def links(self, fileids=None, categories=None): - return self.concatenated_view(self.link_reader, fileids, categories) - - def list_reader(self, stream): - tokens = self.parser.parse(stream.read()) - opening_types = ("bullet_list_open", "ordered_list_open") - opening_tokens = filter( - lambda t: t.level == 0 and t.type in opening_types, tokens - ) - closing_types = ("bullet_list_close", "ordered_list_close") - closing_tokens = filter( - lambda t: t.level == 0 and t.type in closing_types, tokens - ) - list_blocks = list() - for o, c in zip(opening_tokens, closing_tokens): - opening_index = tokens.index(o) - closing_index = tokens.index(c, opening_index) - list_blocks.append(tokens[opening_index : closing_index + 1]) - return [ - List( - tokens[0].type == "ordered_list_open", - [t.content for t in tokens if t.content], - ) - for tokens in list_blocks - ] - - @comma_separated_string_args - def lists(self, fileids=None, categories=None): - return self.concatenated_view(self.list_reader, fileids, categories) - - def section_reader(self, stream): - section_blocks, block = list(), list() - in_heading = False - for t in self.parser.parse(stream.read()): - if t.level == 0 and t.type == "heading_open": - if block: - section_blocks.append(block) - block = list() - in_heading = True - if in_heading: - block.append(t) - return [ - MarkdownSection( - block[1].content, - block[0].markup.count("#"), - self.parser.renderer.render(block, self.parser.options, env=None), - ) - for block in section_blocks - ] - - @comma_separated_string_args - def sections(self, fileids=None, categories=None): - return self.concatenated_view(self.section_reader, fileids, categories) diff --git a/pipeline/nltk/corpus/reader/mte.py b/pipeline/nltk/corpus/reader/mte.py deleted file mode 100644 index 99190bed452095dc948e324ce5cc0f3c94c46505..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/mte.py +++ /dev/null @@ -1,397 +0,0 @@ -""" -A reader for corpora whose documents are in MTE format. -""" -import os -import re -from functools import reduce - -from nltk.corpus.reader import TaggedCorpusReader, concat -from nltk.corpus.reader.xmldocs import XMLCorpusView - - -def xpath(root, path, ns): - return root.findall(path, ns) - - -class MTECorpusView(XMLCorpusView): - """ - Class for lazy viewing the MTE Corpus. - """ - - def __init__(self, fileid, tagspec, elt_handler=None): - XMLCorpusView.__init__(self, fileid, tagspec, elt_handler) - - def read_block(self, stream, tagspec=None, elt_handler=None): - return list( - filter( - lambda x: x is not None, - XMLCorpusView.read_block(self, stream, tagspec, elt_handler), - ) - ) - - -class MTEFileReader: - """ - Class for loading the content of the multext-east corpus. It - parses the xml files and does some tag-filtering depending on the - given method parameters. 
- """ - - ns = { - "tei": "https://www.tei-c.org/ns/1.0", - "xml": "https://www.w3.org/XML/1998/namespace", - } - tag_ns = "{https://www.tei-c.org/ns/1.0}" - xml_ns = "{https://www.w3.org/XML/1998/namespace}" - word_path = "TEI/text/body/div/div/p/s/(w|c)" - sent_path = "TEI/text/body/div/div/p/s" - para_path = "TEI/text/body/div/div/p" - - def __init__(self, file_path): - self.__file_path = file_path - - @classmethod - def _word_elt(cls, elt, context): - return elt.text - - @classmethod - def _sent_elt(cls, elt, context): - return [cls._word_elt(w, None) for w in xpath(elt, "*", cls.ns)] - - @classmethod - def _para_elt(cls, elt, context): - return [cls._sent_elt(s, None) for s in xpath(elt, "*", cls.ns)] - - @classmethod - def _tagged_word_elt(cls, elt, context): - if "ana" not in elt.attrib: - return (elt.text, "") - - if cls.__tags == "" and cls.__tagset == "msd": - return (elt.text, elt.attrib["ana"]) - elif cls.__tags == "" and cls.__tagset == "universal": - return (elt.text, MTETagConverter.msd_to_universal(elt.attrib["ana"])) - else: - tags = re.compile("^" + re.sub("-", ".", cls.__tags) + ".*$") - if tags.match(elt.attrib["ana"]): - if cls.__tagset == "msd": - return (elt.text, elt.attrib["ana"]) - else: - return ( - elt.text, - MTETagConverter.msd_to_universal(elt.attrib["ana"]), - ) - else: - return None - - @classmethod - def _tagged_sent_elt(cls, elt, context): - return list( - filter( - lambda x: x is not None, - [cls._tagged_word_elt(w, None) for w in xpath(elt, "*", cls.ns)], - ) - ) - - @classmethod - def _tagged_para_elt(cls, elt, context): - return list( - filter( - lambda x: x is not None, - [cls._tagged_sent_elt(s, None) for s in xpath(elt, "*", cls.ns)], - ) - ) - - @classmethod - def _lemma_word_elt(cls, elt, context): - if "lemma" not in elt.attrib: - return (elt.text, "") - else: - return (elt.text, elt.attrib["lemma"]) - - @classmethod - def _lemma_sent_elt(cls, elt, context): - return [cls._lemma_word_elt(w, None) for w in xpath(elt, "*", cls.ns)] - - @classmethod - def _lemma_para_elt(cls, elt, context): - return [cls._lemma_sent_elt(s, None) for s in xpath(elt, "*", cls.ns)] - - def words(self): - return MTECorpusView( - self.__file_path, MTEFileReader.word_path, MTEFileReader._word_elt - ) - - def sents(self): - return MTECorpusView( - self.__file_path, MTEFileReader.sent_path, MTEFileReader._sent_elt - ) - - def paras(self): - return MTECorpusView( - self.__file_path, MTEFileReader.para_path, MTEFileReader._para_elt - ) - - def lemma_words(self): - return MTECorpusView( - self.__file_path, MTEFileReader.word_path, MTEFileReader._lemma_word_elt - ) - - def tagged_words(self, tagset, tags): - MTEFileReader.__tagset = tagset - MTEFileReader.__tags = tags - return MTECorpusView( - self.__file_path, MTEFileReader.word_path, MTEFileReader._tagged_word_elt - ) - - def lemma_sents(self): - return MTECorpusView( - self.__file_path, MTEFileReader.sent_path, MTEFileReader._lemma_sent_elt - ) - - def tagged_sents(self, tagset, tags): - MTEFileReader.__tagset = tagset - MTEFileReader.__tags = tags - return MTECorpusView( - self.__file_path, MTEFileReader.sent_path, MTEFileReader._tagged_sent_elt - ) - - def lemma_paras(self): - return MTECorpusView( - self.__file_path, MTEFileReader.para_path, MTEFileReader._lemma_para_elt - ) - - def tagged_paras(self, tagset, tags): - MTEFileReader.__tagset = tagset - MTEFileReader.__tags = tags - return MTECorpusView( - self.__file_path, MTEFileReader.para_path, MTEFileReader._tagged_para_elt - ) - - -class MTETagConverter: - """ - 
Class for converting msd tags to universal tags, more conversion - options are currently not implemented. - """ - - mapping_msd_universal = { - "A": "ADJ", - "S": "ADP", - "R": "ADV", - "C": "CONJ", - "D": "DET", - "N": "NOUN", - "M": "NUM", - "Q": "PRT", - "P": "PRON", - "V": "VERB", - ".": ".", - "-": "X", - } - - @staticmethod - def msd_to_universal(tag): - """ - This function converts the annotation from the Multex-East to the universal tagset - as described in Chapter 5 of the NLTK-Book - - Unknown Tags will be mapped to X. Punctuation marks are not supported in MSD tags, so - """ - indicator = tag[0] if not tag[0] == "#" else tag[1] - - if not indicator in MTETagConverter.mapping_msd_universal: - indicator = "-" - - return MTETagConverter.mapping_msd_universal[indicator] - - -class MTECorpusReader(TaggedCorpusReader): - """ - Reader for corpora following the TEI-p5 xml scheme, such as MULTEXT-East. - MULTEXT-East contains part-of-speech-tagged words with a quite precise tagging - scheme. These tags can be converted to the Universal tagset - """ - - def __init__(self, root=None, fileids=None, encoding="utf8"): - """ - Construct a new MTECorpusreader for a set of documents - located at the given root directory. Example usage: - - >>> root = '/...path to corpus.../' - >>> reader = MTECorpusReader(root, 'oana-*.xml', 'utf8') # doctest: +SKIP - - :param root: The root directory for this corpus. (default points to location in multext config file) - :param fileids: A list or regexp specifying the fileids in this corpus. (default is oana-en.xml) - :param encoding: The encoding of the given files (default is utf8) - """ - TaggedCorpusReader.__init__(self, root, fileids, encoding) - self._readme = "00README.txt" - - def __fileids(self, fileids): - if fileids is None: - fileids = self._fileids - elif isinstance(fileids, str): - fileids = [fileids] - # filter wrong userinput - fileids = filter(lambda x: x in self._fileids, fileids) - # filter multext-east sourcefiles that are not compatible to the teip5 specification - fileids = filter(lambda x: x not in ["oana-bg.xml", "oana-mk.xml"], fileids) - if not fileids: - print("No valid multext-east file specified") - return fileids - - def words(self, fileids=None): - """ - :param fileids: A list specifying the fileids that should be used. - :return: the given file(s) as a list of words and punctuation symbols. - :rtype: list(str) - """ - return concat( - [ - MTEFileReader(os.path.join(self._root, f)).words() - for f in self.__fileids(fileids) - ] - ) - - def sents(self, fileids=None): - """ - :param fileids: A list specifying the fileids that should be used. - :return: the given file(s) as a list of sentences or utterances, - each encoded as a list of word strings - :rtype: list(list(str)) - """ - return concat( - [ - MTEFileReader(os.path.join(self._root, f)).sents() - for f in self.__fileids(fileids) - ] - ) - - def paras(self, fileids=None): - """ - :param fileids: A list specifying the fileids that should be used. - :return: the given file(s) as a list of paragraphs, each encoded as a list - of sentences, which are in turn encoded as lists of word string - :rtype: list(list(list(str))) - """ - return concat( - [ - MTEFileReader(os.path.join(self._root, f)).paras() - for f in self.__fileids(fileids) - ] - ) - - def lemma_words(self, fileids=None): - """ - :param fileids: A list specifying the fileids that should be used. 
- :return: the given file(s) as a list of words, the corresponding lemmas - and punctuation symbols, encoded as tuples (word, lemma) - :rtype: list(tuple(str,str)) - """ - return concat( - [ - MTEFileReader(os.path.join(self._root, f)).lemma_words() - for f in self.__fileids(fileids) - ] - ) - - def tagged_words(self, fileids=None, tagset="msd", tags=""): - """ - :param fileids: A list specifying the fileids that should be used. - :param tagset: The tagset that should be used in the returned object, - either "universal" or "msd", "msd" is the default - :param tags: An MSD Tag that is used to filter all parts of the used corpus - that are not more precise or at least equal to the given tag - :return: the given file(s) as a list of tagged words and punctuation symbols - encoded as tuples (word, tag) - :rtype: list(tuple(str, str)) - """ - if tagset == "universal" or tagset == "msd": - return concat( - [ - MTEFileReader(os.path.join(self._root, f)).tagged_words( - tagset, tags - ) - for f in self.__fileids(fileids) - ] - ) - else: - print("Unknown tagset specified.") - - def lemma_sents(self, fileids=None): - """ - :param fileids: A list specifying the fileids that should be used. - :return: the given file(s) as a list of sentences or utterances, each - encoded as a list of tuples of the word and the corresponding - lemma (word, lemma) - :rtype: list(list(tuple(str, str))) - """ - return concat( - [ - MTEFileReader(os.path.join(self._root, f)).lemma_sents() - for f in self.__fileids(fileids) - ] - ) - - def tagged_sents(self, fileids=None, tagset="msd", tags=""): - """ - :param fileids: A list specifying the fileids that should be used. - :param tagset: The tagset that should be used in the returned object, - either "universal" or "msd", "msd" is the default - :param tags: An MSD Tag that is used to filter all parts of the used corpus - that are not more precise or at least equal to the given tag - :return: the given file(s) as a list of sentences or utterances, each - each encoded as a list of (word,tag) tuples - :rtype: list(list(tuple(str, str))) - """ - if tagset == "universal" or tagset == "msd": - return concat( - [ - MTEFileReader(os.path.join(self._root, f)).tagged_sents( - tagset, tags - ) - for f in self.__fileids(fileids) - ] - ) - else: - print("Unknown tagset specified.") - - def lemma_paras(self, fileids=None): - """ - :param fileids: A list specifying the fileids that should be used. - :return: the given file(s) as a list of paragraphs, each encoded as a - list of sentences, which are in turn encoded as a list of - tuples of the word and the corresponding lemma (word, lemma) - :rtype: list(List(List(tuple(str, str)))) - """ - return concat( - [ - MTEFileReader(os.path.join(self._root, f)).lemma_paras() - for f in self.__fileids(fileids) - ] - ) - - def tagged_paras(self, fileids=None, tagset="msd", tags=""): - """ - :param fileids: A list specifying the fileids that should be used. 
- :param tagset: The tagset that should be used in the returned object, - either "universal" or "msd", "msd" is the default - :param tags: An MSD Tag that is used to filter all parts of the used corpus - that are not more precise or at least equal to the given tag - :return: the given file(s) as a list of paragraphs, each encoded as a - list of sentences, which are in turn encoded as a list - of (word,tag) tuples - :rtype: list(list(list(tuple(str, str)))) - """ - if tagset == "universal" or tagset == "msd": - return concat( - [ - MTEFileReader(os.path.join(self._root, f)).tagged_paras( - tagset, tags - ) - for f in self.__fileids(fileids) - ] - ) - else: - print("Unknown tagset specified.") diff --git a/pipeline/nltk/corpus/reader/nkjp.py b/pipeline/nltk/corpus/reader/nkjp.py deleted file mode 100644 index 685485590727fb8231062eedba6727cf3dc45d81..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/nkjp.py +++ /dev/null @@ -1,487 +0,0 @@ -# Natural Language Toolkit: NKJP Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Gabriela Kaczka -# URL: -# For license information, see LICENSE.TXT - -import functools -import os -import re -import tempfile - -from nltk.corpus.reader.util import concat -from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView - - -def _parse_args(fun): - """ - Wraps function arguments: - if fileids not specified then function set NKJPCorpusReader paths. - """ - - @functools.wraps(fun) - def decorator(self, fileids=None, **kwargs): - if not fileids: - fileids = self._paths - return fun(self, fileids, **kwargs) - - return decorator - - -class NKJPCorpusReader(XMLCorpusReader): - WORDS_MODE = 0 - SENTS_MODE = 1 - HEADER_MODE = 2 - RAW_MODE = 3 - - def __init__(self, root, fileids=".*"): - """ - Corpus reader designed to work with National Corpus of Polish. - See http://nkjp.pl/ for more details about NKJP. - use example: - import nltk - import nkjp - from nkjp import NKJPCorpusReader - x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='') # obtain the whole corpus - x.header() - x.raw() - x.words() - x.tagged_words(tags=['subst', 'comp']) #Link to find more tags: nkjp.pl/poliqarp/help/ense2.html - x.sents() - x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='Wilk*') # obtain particular file(s) - x.header(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy']) - x.tagged_words(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'], tags=['subst', 'comp']) - """ - if isinstance(fileids, str): - XMLCorpusReader.__init__(self, root, fileids + ".*/header.xml") - else: - XMLCorpusReader.__init__( - self, root, [fileid + "/header.xml" for fileid in fileids] - ) - self._paths = self.get_paths() - - def get_paths(self): - return [ - os.path.join(str(self._root), f.split("header.xml")[0]) - for f in self._fileids - ] - - def fileids(self): - """ - Returns a list of file identifiers for the fileids that make up - this corpus. - """ - return [f.split("header.xml")[0] for f in self._fileids] - - def _view(self, filename, tags=None, **kwargs): - """ - Returns a view specialised for use with particular corpus file. 
- """ - mode = kwargs.pop("mode", NKJPCorpusReader.WORDS_MODE) - if mode is NKJPCorpusReader.WORDS_MODE: - return NKJPCorpus_Morph_View(filename, tags=tags) - elif mode is NKJPCorpusReader.SENTS_MODE: - return NKJPCorpus_Segmentation_View(filename, tags=tags) - elif mode is NKJPCorpusReader.HEADER_MODE: - return NKJPCorpus_Header_View(filename, tags=tags) - elif mode is NKJPCorpusReader.RAW_MODE: - return NKJPCorpus_Text_View( - filename, tags=tags, mode=NKJPCorpus_Text_View.RAW_MODE - ) - - else: - raise NameError("No such mode!") - - def add_root(self, fileid): - """ - Add root if necessary to specified fileid. - """ - if self.root in fileid: - return fileid - return self.root + fileid - - @_parse_args - def header(self, fileids=None, **kwargs): - """ - Returns header(s) of specified fileids. - """ - return concat( - [ - self._view( - self.add_root(fileid), mode=NKJPCorpusReader.HEADER_MODE, **kwargs - ).handle_query() - for fileid in fileids - ] - ) - - @_parse_args - def sents(self, fileids=None, **kwargs): - """ - Returns sentences in specified fileids. - """ - return concat( - [ - self._view( - self.add_root(fileid), mode=NKJPCorpusReader.SENTS_MODE, **kwargs - ).handle_query() - for fileid in fileids - ] - ) - - @_parse_args - def words(self, fileids=None, **kwargs): - """ - Returns words in specified fileids. - """ - - return concat( - [ - self._view( - self.add_root(fileid), mode=NKJPCorpusReader.WORDS_MODE, **kwargs - ).handle_query() - for fileid in fileids - ] - ) - - @_parse_args - def tagged_words(self, fileids=None, **kwargs): - """ - Call with specified tags as a list, e.g. tags=['subst', 'comp']. - Returns tagged words in specified fileids. - """ - tags = kwargs.pop("tags", []) - return concat( - [ - self._view( - self.add_root(fileid), - mode=NKJPCorpusReader.WORDS_MODE, - tags=tags, - **kwargs - ).handle_query() - for fileid in fileids - ] - ) - - @_parse_args - def raw(self, fileids=None, **kwargs): - """ - Returns words in specified fileids. - """ - return concat( - [ - self._view( - self.add_root(fileid), mode=NKJPCorpusReader.RAW_MODE, **kwargs - ).handle_query() - for fileid in fileids - ] - ) - - -class NKJPCorpus_Header_View(XMLCorpusView): - def __init__(self, filename, **kwargs): - """ - HEADER_MODE - A stream backed corpus view specialized for use with - header.xml files in NKJP corpus. 
- """ - self.tagspec = ".*/sourceDesc$" - XMLCorpusView.__init__(self, filename + "header.xml", self.tagspec) - - def handle_query(self): - self._open() - header = [] - while True: - segm = XMLCorpusView.read_block(self, self._stream) - if len(segm) == 0: - break - header.extend(segm) - self.close() - return header - - def handle_elt(self, elt, context): - titles = elt.findall("bibl/title") - title = [] - if titles: - title = "\n".join(title.text.strip() for title in titles) - - authors = elt.findall("bibl/author") - author = [] - if authors: - author = "\n".join(author.text.strip() for author in authors) - - dates = elt.findall("bibl/date") - date = [] - if dates: - date = "\n".join(date.text.strip() for date in dates) - - publishers = elt.findall("bibl/publisher") - publisher = [] - if publishers: - publisher = "\n".join(publisher.text.strip() for publisher in publishers) - - idnos = elt.findall("bibl/idno") - idno = [] - if idnos: - idno = "\n".join(idno.text.strip() for idno in idnos) - - notes = elt.findall("bibl/note") - note = [] - if notes: - note = "\n".join(note.text.strip() for note in notes) - - return { - "title": title, - "author": author, - "date": date, - "publisher": publisher, - "idno": idno, - "note": note, - } - - -class XML_Tool: - """ - Helper class creating xml file to one without references to nkjp: namespace. - That's needed because the XMLCorpusView assumes that one can find short substrings - of XML that are valid XML, which is not true if a namespace is declared at top level - """ - - def __init__(self, root, filename): - self.read_file = os.path.join(root, filename) - self.write_file = tempfile.NamedTemporaryFile(delete=False) - - def build_preprocessed_file(self): - try: - fr = open(self.read_file) - fw = self.write_file - line = " " - while len(line): - line = fr.readline() - x = re.split(r"nkjp:[^ ]* ", line) # in all files - ret = " ".join(x) - x = re.split("", ret) # in ann_segmentation.xml - ret = " ".join(x) - x = re.split("", ret) # in ann_segmentation.xml - ret = " ".join(x) - x = re.split("", ret) # in ann_segmentation.xml - ret = " ".join(x) - x = re.split("", ret) # in ann_segmentation.xml - ret = " ".join(x) - fw.write(ret) - fr.close() - fw.close() - return self.write_file.name - except Exception as e: - self.remove_preprocessed_file() - raise Exception from e - - def remove_preprocessed_file(self): - os.remove(self.write_file.name) - - -class NKJPCorpus_Segmentation_View(XMLCorpusView): - """ - A stream backed corpus view specialized for use with - ann_segmentation.xml files in NKJP corpus. 
- """ - - def __init__(self, filename, **kwargs): - self.tagspec = ".*p/.*s" - # intersperse NKJPCorpus_Text_View - self.text_view = NKJPCorpus_Text_View( - filename, mode=NKJPCorpus_Text_View.SENTS_MODE - ) - self.text_view.handle_query() - # xml preprocessing - self.xml_tool = XML_Tool(filename, "ann_segmentation.xml") - # base class init - XMLCorpusView.__init__( - self, self.xml_tool.build_preprocessed_file(), self.tagspec - ) - - def get_segm_id(self, example_word): - return example_word.split("(")[1].split(",")[0] - - def get_sent_beg(self, beg_word): - # returns index of beginning letter in sentence - return int(beg_word.split(",")[1]) - - def get_sent_end(self, end_word): - # returns index of end letter in sentence - splitted = end_word.split(")")[0].split(",") - return int(splitted[1]) + int(splitted[2]) - - def get_sentences(self, sent_segm): - # returns one sentence - id = self.get_segm_id(sent_segm[0]) - segm = self.text_view.segm_dict[id] # text segment - beg = self.get_sent_beg(sent_segm[0]) - end = self.get_sent_end(sent_segm[len(sent_segm) - 1]) - return segm[beg:end] - - def remove_choice(self, segm): - ret = [] - prev_txt_end = -1 - prev_txt_nr = -1 - for word in segm: - txt_nr = self.get_segm_id(word) - # get increasing sequence of ids: in case of choice get first possibility - if self.get_sent_beg(word) > prev_txt_end - 1 or prev_txt_nr != txt_nr: - ret.append(word) - prev_txt_end = self.get_sent_end(word) - prev_txt_nr = txt_nr - - return ret - - def handle_query(self): - try: - self._open() - sentences = [] - while True: - sent_segm = XMLCorpusView.read_block(self, self._stream) - if len(sent_segm) == 0: - break - for segm in sent_segm: - segm = self.remove_choice(segm) - sentences.append(self.get_sentences(segm)) - self.close() - self.xml_tool.remove_preprocessed_file() - return sentences - except Exception as e: - self.xml_tool.remove_preprocessed_file() - raise Exception from e - - def handle_elt(self, elt, context): - ret = [] - for seg in elt: - ret.append(seg.get("corresp")) - return ret - - -class NKJPCorpus_Text_View(XMLCorpusView): - """ - A stream backed corpus view specialized for use with - text.xml files in NKJP corpus. - """ - - SENTS_MODE = 0 - RAW_MODE = 1 - - def __init__(self, filename, **kwargs): - self.mode = kwargs.pop("mode", 0) - self.tagspec = ".*/div/ab" - self.segm_dict = dict() - # xml preprocessing - self.xml_tool = XML_Tool(filename, "text.xml") - # base class init - XMLCorpusView.__init__( - self, self.xml_tool.build_preprocessed_file(), self.tagspec - ) - - def handle_query(self): - try: - self._open() - x = self.read_block(self._stream) - self.close() - self.xml_tool.remove_preprocessed_file() - return x - except Exception as e: - self.xml_tool.remove_preprocessed_file() - raise Exception from e - - def read_block(self, stream, tagspec=None, elt_handler=None): - """ - Returns text as a list of sentences. 
- """ - txt = [] - while True: - segm = XMLCorpusView.read_block(self, stream) - if len(segm) == 0: - break - for part in segm: - txt.append(part) - - return [" ".join([segm for segm in txt])] - - def get_segm_id(self, elt): - for attr in elt.attrib: - if attr.endswith("id"): - return elt.get(attr) - - def handle_elt(self, elt, context): - # fill dictionary to use later in sents mode - if self.mode is NKJPCorpus_Text_View.SENTS_MODE: - self.segm_dict[self.get_segm_id(elt)] = elt.text - return elt.text - - -class NKJPCorpus_Morph_View(XMLCorpusView): - """ - A stream backed corpus view specialized for use with - ann_morphosyntax.xml files in NKJP corpus. - """ - - def __init__(self, filename, **kwargs): - self.tags = kwargs.pop("tags", None) - self.tagspec = ".*/seg/fs" - self.xml_tool = XML_Tool(filename, "ann_morphosyntax.xml") - XMLCorpusView.__init__( - self, self.xml_tool.build_preprocessed_file(), self.tagspec - ) - - def handle_query(self): - try: - self._open() - words = [] - while True: - segm = XMLCorpusView.read_block(self, self._stream) - if len(segm) == 0: - break - for part in segm: - if part is not None: - words.append(part) - self.close() - self.xml_tool.remove_preprocessed_file() - return words - except Exception as e: - self.xml_tool.remove_preprocessed_file() - raise Exception from e - - def handle_elt(self, elt, context): - word = "" - flag = False - is_not_interp = True - # if tags not specified, then always return word - if self.tags is None: - flag = True - - for child in elt: - - # get word - if "name" in child.keys() and child.attrib["name"] == "orth": - for symbol in child: - if symbol.tag == "string": - word = symbol.text - elif "name" in child.keys() and child.attrib["name"] == "interps": - for symbol in child: - if "type" in symbol.keys() and symbol.attrib["type"] == "lex": - for symbol2 in symbol: - if ( - "name" in symbol2.keys() - and symbol2.attrib["name"] == "ctag" - ): - for symbol3 in symbol2: - if ( - "value" in symbol3.keys() - and self.tags is not None - and symbol3.attrib["value"] in self.tags - ): - flag = True - elif ( - "value" in symbol3.keys() - and symbol3.attrib["value"] == "interp" - ): - is_not_interp = False - if flag and is_not_interp: - return word diff --git a/pipeline/nltk/corpus/reader/nombank.py b/pipeline/nltk/corpus/reader/nombank.py deleted file mode 100644 index ddee6206019c644968058e7cb6cac83f5076ade6..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/nombank.py +++ /dev/null @@ -1,466 +0,0 @@ -# Natural Language Toolkit: NomBank Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Authors: Paul Bedaride -# Edward Loper -# URL: -# For license information, see LICENSE.TXT - -from functools import total_ordering -from xml.etree import ElementTree - -from nltk.corpus.reader.api import * -from nltk.corpus.reader.util import * -from nltk.internals import raise_unorderable_types -from nltk.tree import Tree - - -class NombankCorpusReader(CorpusReader): - """ - Corpus reader for the nombank corpus, which augments the Penn - Treebank with information about the predicate argument structure - of every noun instance. The corpus consists of two parts: the - predicate-argument annotations themselves, and a set of "frameset - files" which define the argument labels used by the annotations, - on a per-noun basis. Each "frameset file" contains one or more - predicates, such as ``'turn'`` or ``'turn_on'``, each of which is - divided into coarse-grained word senses called "rolesets". 
For - each "roleset", the frameset file provides descriptions of the - argument roles, along with examples. - """ - - def __init__( - self, - root, - nomfile, - framefiles="", - nounsfile=None, - parse_fileid_xform=None, - parse_corpus=None, - encoding="utf8", - ): - """ - :param root: The root directory for this corpus. - :param nomfile: The name of the file containing the predicate- - argument annotations (relative to ``root``). - :param framefiles: A list or regexp specifying the frameset - fileids for this corpus. - :param parse_fileid_xform: A transform that should be applied - to the fileids in this corpus. This should be a function - of one argument (a fileid) that returns a string (the new - fileid). - :param parse_corpus: The corpus containing the parse trees - corresponding to this corpus. These parse trees are - necessary to resolve the tree pointers used by nombank. - """ - - # If framefiles is specified as a regexp, expand it. - if isinstance(framefiles, str): - self._fileids = find_corpus_fileids(root, framefiles) - self._fileids = list(framefiles) - # Initialize the corpus reader. - CorpusReader.__init__(self, root, framefiles, encoding) - - # Record our nom file & nouns file. - self._nomfile = nomfile - self._nounsfile = nounsfile - self._parse_fileid_xform = parse_fileid_xform - self._parse_corpus = parse_corpus - - def instances(self, baseform=None): - """ - :return: a corpus view that acts as a list of - ``NombankInstance`` objects, one for each noun in the corpus. - """ - kwargs = {} - if baseform is not None: - kwargs["instance_filter"] = lambda inst: inst.baseform == baseform - return StreamBackedCorpusView( - self.abspath(self._nomfile), - lambda stream: self._read_instance_block(stream, **kwargs), - encoding=self.encoding(self._nomfile), - ) - - def lines(self): - """ - :return: a corpus view that acts as a list of strings, one for - each line in the predicate-argument annotation file. - """ - return StreamBackedCorpusView( - self.abspath(self._nomfile), - read_line_block, - encoding=self.encoding(self._nomfile), - ) - - def roleset(self, roleset_id): - """ - :return: the xml description for the given roleset. - """ - baseform = roleset_id.split(".")[0] - baseform = baseform.replace("perc-sign", "%") - baseform = baseform.replace("oneslashonezero", "1/10").replace( - "1/10", "1-slash-10" - ) - framefile = "frames/%s.xml" % baseform - if framefile not in self.fileids(): - raise ValueError("Frameset file for %s not found" % roleset_id) - - # n.b.: The encoding for XML fileids is specified by the file - # itself; so we ignore self._encoding here. - with self.abspath(framefile).open() as fp: - etree = ElementTree.parse(fp).getroot() - for roleset in etree.findall("predicate/roleset"): - if roleset.attrib["id"] == roleset_id: - return roleset - raise ValueError(f"Roleset {roleset_id} not found in {framefile}") - - def rolesets(self, baseform=None): - """ - :return: list of xml descriptions for rolesets. - """ - if baseform is not None: - framefile = "frames/%s.xml" % baseform - if framefile not in self.fileids(): - raise ValueError("Frameset file for %s not found" % baseform) - framefiles = [framefile] - else: - framefiles = self.fileids() - - rsets = [] - for framefile in framefiles: - # n.b.: The encoding for XML fileids is specified by the file - # itself; so we ignore self._encoding here. 
- with self.abspath(framefile).open() as fp: - etree = ElementTree.parse(fp).getroot() - rsets.append(etree.findall("predicate/roleset")) - return LazyConcatenation(rsets) - - def nouns(self): - """ - :return: a corpus view that acts as a list of all noun lemmas - in this corpus (from the nombank.1.0.words file). - """ - return StreamBackedCorpusView( - self.abspath(self._nounsfile), - read_line_block, - encoding=self.encoding(self._nounsfile), - ) - - def _read_instance_block(self, stream, instance_filter=lambda inst: True): - block = [] - - # Read 100 at a time. - for i in range(100): - line = stream.readline().strip() - if line: - inst = NombankInstance.parse( - line, self._parse_fileid_xform, self._parse_corpus - ) - if instance_filter(inst): - block.append(inst) - - return block - - -###################################################################### -# { Nombank Instance & related datatypes -###################################################################### - - -class NombankInstance: - def __init__( - self, - fileid, - sentnum, - wordnum, - baseform, - sensenumber, - predicate, - predid, - arguments, - parse_corpus=None, - ): - - self.fileid = fileid - """The name of the file containing the parse tree for this - instance's sentence.""" - - self.sentnum = sentnum - """The sentence number of this sentence within ``fileid``. - Indexing starts from zero.""" - - self.wordnum = wordnum - """The word number of this instance's predicate within its - containing sentence. Word numbers are indexed starting from - zero, and include traces and other empty parse elements.""" - - self.baseform = baseform - """The baseform of the predicate.""" - - self.sensenumber = sensenumber - """The sense number of the predicate.""" - - self.predicate = predicate - """A ``NombankTreePointer`` indicating the position of this - instance's predicate within its containing sentence.""" - - self.predid = predid - """Identifier of the predicate.""" - - self.arguments = tuple(arguments) - """A list of tuples (argloc, argid), specifying the location - and identifier for each of the predicate's argument in the - containing sentence. Argument identifiers are strings such as - ``'ARG0'`` or ``'ARGM-TMP'``. This list does *not* contain - the predicate.""" - - self.parse_corpus = parse_corpus - """A corpus reader for the parse trees corresponding to the - instances in this nombank corpus.""" - - @property - def roleset(self): - """The name of the roleset used by this instance's predicate. 
- Use ``nombank.roleset() `` to - look up information about the roleset.""" - r = self.baseform.replace("%", "perc-sign") - r = r.replace("1/10", "1-slash-10").replace("1-slash-10", "oneslashonezero") - return f"{r}.{self.sensenumber}" - - def __repr__(self): - return "".format( - self.fileid, - self.sentnum, - self.wordnum, - ) - - def __str__(self): - s = "{} {} {} {} {}".format( - self.fileid, - self.sentnum, - self.wordnum, - self.baseform, - self.sensenumber, - ) - items = self.arguments + ((self.predicate, "rel"),) - for (argloc, argid) in sorted(items): - s += f" {argloc}-{argid}" - return s - - def _get_tree(self): - if self.parse_corpus is None: - return None - if self.fileid not in self.parse_corpus.fileids(): - return None - return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum] - - tree = property( - _get_tree, - doc=""" - The parse tree corresponding to this instance, or None if - the corresponding tree is not available.""", - ) - - @staticmethod - def parse(s, parse_fileid_xform=None, parse_corpus=None): - pieces = s.split() - if len(pieces) < 6: - raise ValueError("Badly formatted nombank line: %r" % s) - - # Divide the line into its basic pieces. - (fileid, sentnum, wordnum, baseform, sensenumber) = pieces[:5] - - args = pieces[5:] - rel = [args.pop(i) for i, p in enumerate(args) if "-rel" in p] - if len(rel) != 1: - raise ValueError("Badly formatted nombank line: %r" % s) - - # Apply the fileid selector, if any. - if parse_fileid_xform is not None: - fileid = parse_fileid_xform(fileid) - - # Convert sentence & word numbers to ints. - sentnum = int(sentnum) - wordnum = int(wordnum) - - # Parse the predicate location. - - predloc, predid = rel[0].split("-", 1) - predicate = NombankTreePointer.parse(predloc) - - # Parse the arguments. - arguments = [] - for arg in args: - argloc, argid = arg.split("-", 1) - arguments.append((NombankTreePointer.parse(argloc), argid)) - - # Put it all together. - return NombankInstance( - fileid, - sentnum, - wordnum, - baseform, - sensenumber, - predicate, - predid, - arguments, - parse_corpus, - ) - - -class NombankPointer: - """ - A pointer used by nombank to identify one or more constituents in - a parse tree. ``NombankPointer`` is an abstract base class with - three concrete subclasses: - - - ``NombankTreePointer`` is used to point to single constituents. - - ``NombankSplitTreePointer`` is used to point to 'split' - constituents, which consist of a sequence of two or more - ``NombankTreePointer`` pointers. - - ``NombankChainTreePointer`` is used to point to entire trace - chains in a tree. It consists of a sequence of pieces, which - can be ``NombankTreePointer`` or ``NombankSplitTreePointer`` pointers. - """ - - def __init__(self): - if self.__class__ == NombankPointer: - raise NotImplementedError() - - -class NombankChainTreePointer(NombankPointer): - def __init__(self, pieces): - self.pieces = pieces - """A list of the pieces that make up this chain. Elements may - be either ``NombankSplitTreePointer`` or - ``NombankTreePointer`` pointers.""" - - def __str__(self): - return "*".join("%s" % p for p in self.pieces) - - def __repr__(self): - return "" % self - - def select(self, tree): - if tree is None: - raise ValueError("Parse tree not available") - return Tree("*CHAIN*", [p.select(tree) for p in self.pieces]) - - -class NombankSplitTreePointer(NombankPointer): - def __init__(self, pieces): - self.pieces = pieces - """A list of the pieces that make up this chain. 
Elements are - all ``NombankTreePointer`` pointers.""" - - def __str__(self): - return ",".join("%s" % p for p in self.pieces) - - def __repr__(self): - return "" % self - - def select(self, tree): - if tree is None: - raise ValueError("Parse tree not available") - return Tree("*SPLIT*", [p.select(tree) for p in self.pieces]) - - -@total_ordering -class NombankTreePointer(NombankPointer): - """ - wordnum:height*wordnum:height*... - wordnum:height, - - """ - - def __init__(self, wordnum, height): - self.wordnum = wordnum - self.height = height - - @staticmethod - def parse(s): - # Deal with chains (xx*yy*zz) - pieces = s.split("*") - if len(pieces) > 1: - return NombankChainTreePointer( - [NombankTreePointer.parse(elt) for elt in pieces] - ) - - # Deal with split args (xx,yy,zz) - pieces = s.split(",") - if len(pieces) > 1: - return NombankSplitTreePointer( - [NombankTreePointer.parse(elt) for elt in pieces] - ) - - # Deal with normal pointers. - pieces = s.split(":") - if len(pieces) != 2: - raise ValueError("bad nombank pointer %r" % s) - return NombankTreePointer(int(pieces[0]), int(pieces[1])) - - def __str__(self): - return f"{self.wordnum}:{self.height}" - - def __repr__(self): - return "NombankTreePointer(%d, %d)" % (self.wordnum, self.height) - - def __eq__(self, other): - while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)): - other = other.pieces[0] - - if not isinstance(other, NombankTreePointer): - return self is other - - return self.wordnum == other.wordnum and self.height == other.height - - def __ne__(self, other): - return not self == other - - def __lt__(self, other): - while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)): - other = other.pieces[0] - - if not isinstance(other, NombankTreePointer): - return id(self) < id(other) - - return (self.wordnum, -self.height) < (other.wordnum, -other.height) - - def select(self, tree): - if tree is None: - raise ValueError("Parse tree not available") - return tree[self.treepos(tree)] - - def treepos(self, tree): - """ - Convert this pointer to a standard 'tree position' pointer, - given that it points to the given tree. - """ - if tree is None: - raise ValueError("Parse tree not available") - stack = [tree] - treepos = [] - - wordnum = 0 - while True: - # tree node: - if isinstance(stack[-1], Tree): - # Select the next child. - if len(treepos) < len(stack): - treepos.append(0) - else: - treepos[-1] += 1 - # Update the stack. - if treepos[-1] < len(stack[-1]): - stack.append(stack[-1][treepos[-1]]) - else: - # End of node's child list: pop up a level. 
- stack.pop() - treepos.pop() - # word node: - else: - if wordnum == self.wordnum: - return tuple(treepos[: len(treepos) - self.height - 1]) - else: - wordnum += 1 - stack.pop() diff --git a/pipeline/nltk/corpus/reader/nps_chat.py b/pipeline/nltk/corpus/reader/nps_chat.py deleted file mode 100644 index 0bcf51dc66954866ad665a54ba926fc9c8a33116..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/nps_chat.py +++ /dev/null @@ -1,90 +0,0 @@ -# Natural Language Toolkit: NPS Chat Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# URL: -# For license information, see LICENSE.TXT - -import re -import textwrap - -from nltk.corpus.reader.api import * -from nltk.corpus.reader.util import * -from nltk.corpus.reader.xmldocs import * -from nltk.internals import ElementWrapper -from nltk.tag import map_tag -from nltk.util import LazyConcatenation - - -class NPSChatCorpusReader(XMLCorpusReader): - def __init__(self, root, fileids, wrap_etree=False, tagset=None): - XMLCorpusReader.__init__(self, root, fileids, wrap_etree) - self._tagset = tagset - - def xml_posts(self, fileids=None): - if self._wrap_etree: - return concat( - [ - XMLCorpusView(fileid, "Session/Posts/Post", self._wrap_elt) - for fileid in self.abspaths(fileids) - ] - ) - else: - return concat( - [ - XMLCorpusView(fileid, "Session/Posts/Post") - for fileid in self.abspaths(fileids) - ] - ) - - def posts(self, fileids=None): - return concat( - [ - XMLCorpusView( - fileid, "Session/Posts/Post/terminals", self._elt_to_words - ) - for fileid in self.abspaths(fileids) - ] - ) - - def tagged_posts(self, fileids=None, tagset=None): - def reader(elt, handler): - return self._elt_to_tagged_words(elt, handler, tagset) - - return concat( - [ - XMLCorpusView(fileid, "Session/Posts/Post/terminals", reader) - for fileid in self.abspaths(fileids) - ] - ) - - def words(self, fileids=None): - return LazyConcatenation(self.posts(fileids)) - - def tagged_words(self, fileids=None, tagset=None): - return LazyConcatenation(self.tagged_posts(fileids, tagset)) - - def _wrap_elt(self, elt, handler): - return ElementWrapper(elt) - - def _elt_to_words(self, elt, handler): - return [self._simplify_username(t.attrib["word"]) for t in elt.findall("t")] - - def _elt_to_tagged_words(self, elt, handler, tagset=None): - tagged_post = [ - (self._simplify_username(t.attrib["word"]), t.attrib["pos"]) - for t in elt.findall("t") - ] - if tagset and tagset != self._tagset: - tagged_post = [ - (w, map_tag(self._tagset, tagset, t)) for (w, t) in tagged_post - ] - return tagged_post - - @staticmethod - def _simplify_username(word): - if "User" in word: - word = "U" + word.split("User", 1)[1] - elif isinstance(word, bytes): - word = word.decode("ascii") - return word diff --git a/pipeline/nltk/corpus/reader/opinion_lexicon.py b/pipeline/nltk/corpus/reader/opinion_lexicon.py deleted file mode 100644 index 87be7c97e6151c8ce19e64e2f8ac6683918e3aad..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/opinion_lexicon.py +++ /dev/null @@ -1,125 +0,0 @@ -# Natural Language Toolkit: Opinion Lexicon Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Pierpaolo Pantone <24alsecondo@gmail.com> -# URL: -# For license information, see LICENSE.TXT - -""" -CorpusReader for the Opinion Lexicon. - -Opinion Lexicon information -=========================== - -Authors: Minqing Hu and Bing Liu, 2004. 
- Department of Computer Science - University of Illinois at Chicago - -Contact: Bing Liu, liub@cs.uic.edu - https://www.cs.uic.edu/~liub - -Distributed with permission. - -Related papers: - -- Minqing Hu and Bing Liu. "Mining and summarizing customer reviews". - Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery - & Data Mining (KDD-04), Aug 22-25, 2004, Seattle, Washington, USA. - -- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and - Comparing Opinions on the Web". Proceedings of the 14th International World - Wide Web conference (WWW-2005), May 10-14, 2005, Chiba, Japan. -""" - -from nltk.corpus.reader import WordListCorpusReader -from nltk.corpus.reader.api import * - - -class IgnoreReadmeCorpusView(StreamBackedCorpusView): - """ - This CorpusView is used to skip the initial readme block of the corpus. - """ - - def __init__(self, *args, **kwargs): - StreamBackedCorpusView.__init__(self, *args, **kwargs) - # open self._stream - self._open() - # skip the readme block - read_blankline_block(self._stream) - # Set the initial position to the current stream position - self._filepos = [self._stream.tell()] - - -class OpinionLexiconCorpusReader(WordListCorpusReader): - """ - Reader for Liu and Hu opinion lexicon. Blank lines and readme are ignored. - - >>> from nltk.corpus import opinion_lexicon - >>> opinion_lexicon.words() - ['2-faced', '2-faces', 'abnormal', 'abolish', ...] - - The OpinionLexiconCorpusReader provides shortcuts to retrieve positive/negative - words: - - >>> opinion_lexicon.negative() - ['2-faced', '2-faces', 'abnormal', 'abolish', ...] - - Note that words from `words()` method are sorted by file id, not alphabetically: - - >>> opinion_lexicon.words()[0:10] # doctest: +NORMALIZE_WHITESPACE - ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably', - 'abominate', 'abomination', 'abort', 'aborted'] - >>> sorted(opinion_lexicon.words())[0:10] # doctest: +NORMALIZE_WHITESPACE - ['2-faced', '2-faces', 'a+', 'abnormal', 'abolish', 'abominable', 'abominably', - 'abominate', 'abomination', 'abort'] - """ - - CorpusView = IgnoreReadmeCorpusView - - def words(self, fileids=None): - """ - Return all words in the opinion lexicon. Note that these words are not - sorted in alphabetical order. - - :param fileids: a list or regexp specifying the ids of the files whose - words have to be returned. - :return: the given file(s) as a list of words and punctuation symbols. - :rtype: list(str) - """ - if fileids is None: - fileids = self._fileids - elif isinstance(fileids, str): - fileids = [fileids] - return concat( - [ - self.CorpusView(path, self._read_word_block, encoding=enc) - for (path, enc, fileid) in self.abspaths(fileids, True, True) - ] - ) - - def positive(self): - """ - Return all positive words in alphabetical order. - - :return: a list of positive words. - :rtype: list(str) - """ - return self.words("positive-words.txt") - - def negative(self): - """ - Return all negative words in alphabetical order. - - :return: a list of negative words. - :rtype: list(str) - """ - return self.words("negative-words.txt") - - def _read_word_block(self, stream): - words = [] - for i in range(20): # Read 20 lines at a time. 
- line = stream.readline() - if not line: - continue - words.append(line.strip()) - return words diff --git a/pipeline/nltk/corpus/reader/panlex_lite.py b/pipeline/nltk/corpus/reader/panlex_lite.py deleted file mode 100644 index 59492992353ca876eea00f63e3759f14ec5b0e02..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/panlex_lite.py +++ /dev/null @@ -1,174 +0,0 @@ -# Natural Language Toolkit: PanLex Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: David Kamholz -# URL: -# For license information, see LICENSE.TXT - -""" -CorpusReader for PanLex Lite, a stripped down version of PanLex distributed -as an SQLite database. See the README.txt in the panlex_lite corpus directory -for more information on PanLex Lite. -""" - -import os -import sqlite3 - -from nltk.corpus.reader.api import CorpusReader - - -class PanLexLiteCorpusReader(CorpusReader): - MEANING_Q = """ - SELECT dnx2.mn, dnx2.uq, dnx2.ap, dnx2.ui, ex2.tt, ex2.lv - FROM dnx - JOIN ex ON (ex.ex = dnx.ex) - JOIN dnx dnx2 ON (dnx2.mn = dnx.mn) - JOIN ex ex2 ON (ex2.ex = dnx2.ex) - WHERE dnx.ex != dnx2.ex AND ex.tt = ? AND ex.lv = ? - ORDER BY dnx2.uq DESC - """ - - TRANSLATION_Q = """ - SELECT s.tt, sum(s.uq) AS trq FROM ( - SELECT ex2.tt, max(dnx.uq) AS uq - FROM dnx - JOIN ex ON (ex.ex = dnx.ex) - JOIN dnx dnx2 ON (dnx2.mn = dnx.mn) - JOIN ex ex2 ON (ex2.ex = dnx2.ex) - WHERE dnx.ex != dnx2.ex AND ex.lv = ? AND ex.tt = ? AND ex2.lv = ? - GROUP BY ex2.tt, dnx.ui - ) s - GROUP BY s.tt - ORDER BY trq DESC, s.tt - """ - - def __init__(self, root): - self._c = sqlite3.connect(os.path.join(root, "db.sqlite")).cursor() - - self._uid_lv = {} - self._lv_uid = {} - - for row in self._c.execute("SELECT uid, lv FROM lv"): - self._uid_lv[row[0]] = row[1] - self._lv_uid[row[1]] = row[0] - - def language_varieties(self, lc=None): - """ - Return a list of PanLex language varieties. - - :param lc: ISO 639 alpha-3 code. If specified, filters returned varieties - by this code. If unspecified, all varieties are returned. - :return: the specified language varieties as a list of tuples. The first - element is the language variety's seven-character uniform identifier, - and the second element is its default name. - :rtype: list(tuple) - """ - - if lc is None: - return self._c.execute("SELECT uid, tt FROM lv ORDER BY uid").fetchall() - else: - return self._c.execute( - "SELECT uid, tt FROM lv WHERE lc = ? ORDER BY uid", (lc,) - ).fetchall() - - def meanings(self, expr_uid, expr_tt): - """ - Return a list of meanings for an expression. - - :param expr_uid: the expression's language variety, as a seven-character - uniform identifier. - :param expr_tt: the expression's text. - :return: a list of Meaning objects. - :rtype: list(Meaning) - """ - - expr_lv = self._uid_lv[expr_uid] - - mn_info = {} - - for i in self._c.execute(self.MEANING_Q, (expr_tt, expr_lv)): - mn = i[0] - uid = self._lv_uid[i[5]] - - if not mn in mn_info: - mn_info[mn] = { - "uq": i[1], - "ap": i[2], - "ui": i[3], - "ex": {expr_uid: [expr_tt]}, - } - - if not uid in mn_info[mn]["ex"]: - mn_info[mn]["ex"][uid] = [] - - mn_info[mn]["ex"][uid].append(i[4]) - - return [Meaning(mn, mn_info[mn]) for mn in mn_info] - - def translations(self, from_uid, from_tt, to_uid): - """ - Return a list of translations for an expression into a single language - variety. - - :param from_uid: the source expression's language variety, as a - seven-character uniform identifier. - :param from_tt: the source expression's text. 
- :param to_uid: the target language variety, as a seven-character - uniform identifier. - :return: a list of translation tuples. The first element is the expression - text and the second element is the translation quality. - :rtype: list(tuple) - """ - - from_lv = self._uid_lv[from_uid] - to_lv = self._uid_lv[to_uid] - - return self._c.execute(self.TRANSLATION_Q, (from_lv, from_tt, to_lv)).fetchall() - - -class Meaning(dict): - """ - Represents a single PanLex meaning. A meaning is a translation set derived - from a single source. - """ - - def __init__(self, mn, attr): - super().__init__(**attr) - self["mn"] = mn - - def id(self): - """ - :return: the meaning's id. - :rtype: int - """ - return self["mn"] - - def quality(self): - """ - :return: the meaning's source's quality (0=worst, 9=best). - :rtype: int - """ - return self["uq"] - - def source(self): - """ - :return: the meaning's source id. - :rtype: int - """ - return self["ap"] - - def source_group(self): - """ - :return: the meaning's source group id. - :rtype: int - """ - return self["ui"] - - def expressions(self): - """ - :return: the meaning's expressions as a dictionary whose keys are language - variety uniform identifiers and whose values are lists of expression - texts. - :rtype: dict - """ - return self["ex"] diff --git a/pipeline/nltk/corpus/reader/panlex_swadesh.py b/pipeline/nltk/corpus/reader/panlex_swadesh.py deleted file mode 100644 index 182960f2ebc4b3e2411e3980ce4e445412af9bcc..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/panlex_swadesh.py +++ /dev/null @@ -1,95 +0,0 @@ -# Natural Language Toolkit: Word List Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# Edward Loper -# URL: -# For license information, see LICENSE.TXT - - -import re -from collections import defaultdict, namedtuple - -from nltk.corpus.reader.api import * -from nltk.corpus.reader.util import * -from nltk.corpus.reader.wordlist import WordListCorpusReader -from nltk.tokenize import line_tokenize - -PanlexLanguage = namedtuple( - "PanlexLanguage", - [ - "panlex_uid", # (1) PanLex UID - "iso639", # (2) ISO 639 language code - "iso639_type", # (3) ISO 639 language type, see README - "script", # (4) normal scripts of expressions - "name", # (5) PanLex default name - "langvar_uid", # (6) UID of the language variety in which the default name is an expression - ], -) - - -class PanlexSwadeshCorpusReader(WordListCorpusReader): - """ - This is a class to read the PanLex Swadesh list from - - David Kamholz, Jonathan Pool, and Susan M. Colowick (2014). - PanLex: Building a Resource for Panlingual Lexical Translation. - In LREC. http://www.lrec-conf.org/proceedings/lrec2014/pdf/1029_Paper.pdf - - License: CC0 1.0 Universal - https://creativecommons.org/publicdomain/zero/1.0/legalcode - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # Find the swadesh size using the fileids' path. - self.swadesh_size = re.match(r"swadesh([0-9].*)\/", self.fileids()[0]).group(1) - self._languages = {lang.panlex_uid: lang for lang in self.get_languages()} - self._macro_langauges = self.get_macrolanguages() - - def license(self): - return "CC0 1.0 Universal" - - def language_codes(self): - return self._languages.keys() - - def get_languages(self): - for line in self.raw(f"langs{self.swadesh_size}.txt").split("\n"): - if not line.strip(): # Skip empty lines. 
- continue - yield PanlexLanguage(*line.strip().split("\t")) - - def get_macrolanguages(self): - macro_langauges = defaultdict(list) - for lang in self._languages.values(): - macro_langauges[lang.iso639].append(lang.panlex_uid) - return macro_langauges - - def words_by_lang(self, lang_code): - """ - :return: a list of list(str) - """ - fileid = f"swadesh{self.swadesh_size}/{lang_code}.txt" - return [concept.split("\t") for concept in self.words(fileid)] - - def words_by_iso639(self, iso63_code): - """ - :return: a list of list(str) - """ - fileids = [ - f"swadesh{self.swadesh_size}/{lang_code}.txt" - for lang_code in self._macro_langauges[iso63_code] - ] - return [ - concept.split("\t") for fileid in fileids for concept in self.words(fileid) - ] - - def entries(self, fileids=None): - """ - :return: a tuple of words for the specified fileids. - """ - if not fileids: - fileids = self.fileids() - - wordlists = [self.words(f) for f in fileids] - return list(zip(*wordlists)) diff --git a/pipeline/nltk/corpus/reader/pl196x.py b/pipeline/nltk/corpus/reader/pl196x.py deleted file mode 100644 index e59d297c0100f46b484b02bfc125532e4ca9d8ad..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/pl196x.py +++ /dev/null @@ -1,375 +0,0 @@ -# Natural Language Toolkit: -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Piotr Kasprzyk -# URL: -# For license information, see LICENSE.TXT - -from nltk.corpus.reader.api import * -from nltk.corpus.reader.xmldocs import XMLCorpusReader - -PARA = re.compile(r"]*){0,1}>(.*?)

    ") -SENT = re.compile(r"]*){0,1}>(.*?)
    ") - -TAGGEDWORD = re.compile(r"<([wc](?: [^>]*){0,1}>)(.*?)") -WORD = re.compile(r"<[wc](?: [^>]*){0,1}>(.*?)") - -TYPE = re.compile(r'type="(.*?)"') -ANA = re.compile(r'ana="(.*?)"') - -TEXTID = re.compile(r'text id="(.*?)"') - - -class TEICorpusView(StreamBackedCorpusView): - def __init__( - self, - corpus_file, - tagged, - group_by_sent, - group_by_para, - tagset=None, - head_len=0, - textids=None, - ): - - self._tagged = tagged - self._textids = textids - - self._group_by_sent = group_by_sent - self._group_by_para = group_by_para - # WARNING -- skip header - StreamBackedCorpusView.__init__(self, corpus_file, startpos=head_len) - - _pagesize = 4096 - - def read_block(self, stream): - block = stream.readlines(self._pagesize) - block = concat(block) - while (block.count(" block.count("")) or block.count( - "") + len("") - block = block[:beg] + block[beg + end :] - - output = [] - for para_str in PARA.findall(block): - para = [] - for sent_str in SENT.findall(para_str): - if not self._tagged: - sent = WORD.findall(sent_str) - else: - sent = list(map(self._parse_tag, TAGGEDWORD.findall(sent_str))) - if self._group_by_sent: - para.append(sent) - else: - para.extend(sent) - if self._group_by_para: - output.append(para) - else: - output.extend(para) - return output - - def _parse_tag(self, tag_word_tuple): - (tag, word) = tag_word_tuple - if tag.startswith("w"): - tag = ANA.search(tag).group(1) - else: # tag.startswith('c') - tag = TYPE.search(tag).group(1) - return word, tag - - -class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader): - head_len = 2770 - - def __init__(self, *args, **kwargs): - if "textid_file" in kwargs: - self._textids = kwargs["textid_file"] - else: - self._textids = None - - XMLCorpusReader.__init__(self, *args) - CategorizedCorpusReader.__init__(self, kwargs) - - self._init_textids() - - def _init_textids(self): - self._f2t = defaultdict(list) - self._t2f = defaultdict(list) - if self._textids is not None: - with open(self._textids) as fp: - for line in fp: - line = line.strip() - file_id, text_ids = line.split(" ", 1) - if file_id not in self.fileids(): - raise ValueError( - "In text_id mapping file %s: %s not found" - % (self._textids, file_id) - ) - for text_id in text_ids.split(self._delimiter): - self._add_textids(file_id, text_id) - - def _add_textids(self, file_id, text_id): - self._f2t[file_id].append(text_id) - self._t2f[text_id].append(file_id) - - def _resolve(self, fileids, categories, textids=None): - tmp = None - if ( - len( - list( - filter( - lambda accessor: accessor is None, - (fileids, categories, textids), - ) - ) - ) - != 1 - ): - - raise ValueError( - "Specify exactly one of: fileids, " "categories or textids" - ) - - if fileids is not None: - return fileids, None - - if categories is not None: - return self.fileids(categories), None - - if textids is not None: - if isinstance(textids, str): - textids = [textids] - files = sum((self._t2f[t] for t in textids), []) - tdict = dict() - for f in files: - tdict[f] = set(self._f2t[f]) & set(textids) - return files, tdict - - def decode_tag(self, tag): - # to be implemented - return tag - - def textids(self, fileids=None, categories=None): - """ - In the pl196x corpus each category is stored in single - file and thus both methods provide identical functionality. In order - to accommodate finer granularity, a non-standard textids() method was - implemented. All the main functions can be supplied with a list - of required chunks---giving much more control to the user. 
- """ - fileids, _ = self._resolve(fileids, categories) - if fileids is None: - return sorted(self._t2f) - - if isinstance(fileids, str): - fileids = [fileids] - return sorted(sum((self._f2t[d] for d in fileids), [])) - - def words(self, fileids=None, categories=None, textids=None): - fileids, textids = self._resolve(fileids, categories, textids) - if fileids is None: - fileids = self._fileids - elif isinstance(fileids, str): - fileids = [fileids] - - if textids: - return concat( - [ - TEICorpusView( - self.abspath(fileid), - False, - False, - False, - head_len=self.head_len, - textids=textids[fileid], - ) - for fileid in fileids - ] - ) - else: - return concat( - [ - TEICorpusView( - self.abspath(fileid), - False, - False, - False, - head_len=self.head_len, - ) - for fileid in fileids - ] - ) - - def sents(self, fileids=None, categories=None, textids=None): - fileids, textids = self._resolve(fileids, categories, textids) - if fileids is None: - fileids = self._fileids - elif isinstance(fileids, str): - fileids = [fileids] - - if textids: - return concat( - [ - TEICorpusView( - self.abspath(fileid), - False, - True, - False, - head_len=self.head_len, - textids=textids[fileid], - ) - for fileid in fileids - ] - ) - else: - return concat( - [ - TEICorpusView( - self.abspath(fileid), False, True, False, head_len=self.head_len - ) - for fileid in fileids - ] - ) - - def paras(self, fileids=None, categories=None, textids=None): - fileids, textids = self._resolve(fileids, categories, textids) - if fileids is None: - fileids = self._fileids - elif isinstance(fileids, str): - fileids = [fileids] - - if textids: - return concat( - [ - TEICorpusView( - self.abspath(fileid), - False, - True, - True, - head_len=self.head_len, - textids=textids[fileid], - ) - for fileid in fileids - ] - ) - else: - return concat( - [ - TEICorpusView( - self.abspath(fileid), False, True, True, head_len=self.head_len - ) - for fileid in fileids - ] - ) - - def tagged_words(self, fileids=None, categories=None, textids=None): - fileids, textids = self._resolve(fileids, categories, textids) - if fileids is None: - fileids = self._fileids - elif isinstance(fileids, str): - fileids = [fileids] - - if textids: - return concat( - [ - TEICorpusView( - self.abspath(fileid), - True, - False, - False, - head_len=self.head_len, - textids=textids[fileid], - ) - for fileid in fileids - ] - ) - else: - return concat( - [ - TEICorpusView( - self.abspath(fileid), True, False, False, head_len=self.head_len - ) - for fileid in fileids - ] - ) - - def tagged_sents(self, fileids=None, categories=None, textids=None): - fileids, textids = self._resolve(fileids, categories, textids) - if fileids is None: - fileids = self._fileids - elif isinstance(fileids, str): - fileids = [fileids] - - if textids: - return concat( - [ - TEICorpusView( - self.abspath(fileid), - True, - True, - False, - head_len=self.head_len, - textids=textids[fileid], - ) - for fileid in fileids - ] - ) - else: - return concat( - [ - TEICorpusView( - self.abspath(fileid), True, True, False, head_len=self.head_len - ) - for fileid in fileids - ] - ) - - def tagged_paras(self, fileids=None, categories=None, textids=None): - fileids, textids = self._resolve(fileids, categories, textids) - if fileids is None: - fileids = self._fileids - elif isinstance(fileids, str): - fileids = [fileids] - - if textids: - return concat( - [ - TEICorpusView( - self.abspath(fileid), - True, - True, - True, - head_len=self.head_len, - textids=textids[fileid], - ) - for fileid in fileids - ] - ) - 
else: - return concat( - [ - TEICorpusView( - self.abspath(fileid), True, True, True, head_len=self.head_len - ) - for fileid in fileids - ] - ) - - def xml(self, fileids=None, categories=None): - fileids, _ = self._resolve(fileids, categories) - if len(fileids) == 1: - return XMLCorpusReader.xml(self, fileids[0]) - else: - raise TypeError("Expected a single file") diff --git a/pipeline/nltk/corpus/reader/plaintext.py b/pipeline/nltk/corpus/reader/plaintext.py deleted file mode 100644 index f096f3ecb0ef7196950071723393656ec91aa363..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/plaintext.py +++ /dev/null @@ -1,227 +0,0 @@ -# Natural Language Toolkit: Plaintext Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# Edward Loper -# Nitin Madnani -# URL: -# For license information, see LICENSE.TXT - -""" -A reader for corpora that consist of plaintext documents. -""" - -import nltk.data -from nltk.corpus.reader.api import * -from nltk.corpus.reader.util import * -from nltk.tokenize import * - - -class PlaintextCorpusReader(CorpusReader): - """ - Reader for corpora that consist of plaintext documents. Paragraphs - are assumed to be split using blank lines. Sentences and words can - be tokenized using the default tokenizers, or by custom tokenizers - specified as parameters to the constructor. - - This corpus reader can be customized (e.g., to skip preface - sections of specific document formats) by creating a subclass and - overriding the ``CorpusView`` class variable. - """ - - CorpusView = StreamBackedCorpusView - """The corpus view class used by this reader. Subclasses of - ``PlaintextCorpusReader`` may specify alternative corpus view - classes (e.g., to skip the preface sections of documents.)""" - - def __init__( - self, - root, - fileids, - word_tokenizer=WordPunctTokenizer(), - sent_tokenizer=nltk.data.LazyLoader("tokenizers/punkt/english.pickle"), - para_block_reader=read_blankline_block, - encoding="utf8", - ): - r""" - Construct a new plaintext corpus reader for a set of documents - located at the given root directory. Example usage: - - >>> root = '/usr/local/share/nltk_data/corpora/webtext/' - >>> reader = PlaintextCorpusReader(root, '.*\.txt') # doctest: +SKIP - - :param root: The root directory for this corpus. - :param fileids: A list or regexp specifying the fileids in this corpus. - :param word_tokenizer: Tokenizer for breaking sentences or - paragraphs into words. - :param sent_tokenizer: Tokenizer for breaking paragraphs - into words. - :param para_block_reader: The block reader used to divide the - corpus into paragraph blocks. - """ - CorpusReader.__init__(self, root, fileids, encoding) - self._word_tokenizer = word_tokenizer - self._sent_tokenizer = sent_tokenizer - self._para_block_reader = para_block_reader - - def words(self, fileids=None): - """ - :return: the given file(s) as a list of words - and punctuation symbols. - :rtype: list(str) - """ - return concat( - [ - self.CorpusView(path, self._read_word_block, encoding=enc) - for (path, enc, fileid) in self.abspaths(fileids, True, True) - ] - ) - - def sents(self, fileids=None): - """ - :return: the given file(s) as a list of - sentences or utterances, each encoded as a list of word - strings. 
- :rtype: list(list(str)) - """ - if self._sent_tokenizer is None: - raise ValueError("No sentence tokenizer for this corpus") - - return concat( - [ - self.CorpusView(path, self._read_sent_block, encoding=enc) - for (path, enc, fileid) in self.abspaths(fileids, True, True) - ] - ) - - def paras(self, fileids=None): - """ - :return: the given file(s) as a list of - paragraphs, each encoded as a list of sentences, which are - in turn encoded as lists of word strings. - :rtype: list(list(list(str))) - """ - if self._sent_tokenizer is None: - raise ValueError("No sentence tokenizer for this corpus") - - return concat( - [ - self.CorpusView(path, self._read_para_block, encoding=enc) - for (path, enc, fileid) in self.abspaths(fileids, True, True) - ] - ) - - def _read_word_block(self, stream): - words = [] - for i in range(20): # Read 20 lines at a time. - words.extend(self._word_tokenizer.tokenize(stream.readline())) - return words - - def _read_sent_block(self, stream): - sents = [] - for para in self._para_block_reader(stream): - sents.extend( - [ - self._word_tokenizer.tokenize(sent) - for sent in self._sent_tokenizer.tokenize(para) - ] - ) - return sents - - def _read_para_block(self, stream): - paras = [] - for para in self._para_block_reader(stream): - paras.append( - [ - self._word_tokenizer.tokenize(sent) - for sent in self._sent_tokenizer.tokenize(para) - ] - ) - return paras - - -class CategorizedPlaintextCorpusReader(CategorizedCorpusReader, PlaintextCorpusReader): - """ - A reader for plaintext corpora whose documents are divided into - categories based on their file identifiers. - """ - - def __init__(self, *args, **kwargs): - """ - Initialize the corpus reader. Categorization arguments - (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to - the ``CategorizedCorpusReader`` constructor. The remaining arguments - are passed to the ``PlaintextCorpusReader`` constructor. - """ - CategorizedCorpusReader.__init__(self, kwargs) - PlaintextCorpusReader.__init__(self, *args, **kwargs) - - -# FIXME: Is there a better way? How to not hardcode this? -# Possibly, add a language kwargs to CategorizedPlaintextCorpusReader to -# override the `sent_tokenizer`. -class PortugueseCategorizedPlaintextCorpusReader(CategorizedPlaintextCorpusReader): - def __init__(self, *args, **kwargs): - CategorizedCorpusReader.__init__(self, kwargs) - kwargs["sent_tokenizer"] = nltk.data.LazyLoader( - "tokenizers/punkt/portuguese.pickle" - ) - PlaintextCorpusReader.__init__(self, *args, **kwargs) - - -class EuroparlCorpusReader(PlaintextCorpusReader): - - """ - Reader for Europarl corpora that consist of plaintext documents. - Documents are divided into chapters instead of paragraphs as - for regular plaintext documents. Chapters are separated using blank - lines. Everything is inherited from ``PlaintextCorpusReader`` except - that: - - - Since the corpus is pre-processed and pre-tokenized, the - word tokenizer should just split the line at whitespaces. - - For the same reason, the sentence tokenizer should just - split the paragraph at line breaks. - - There is a new 'chapters()' method that returns chapters instead - instead of paragraphs. - - The 'paras()' method inherited from PlaintextCorpusReader is - made non-functional to remove any confusion between chapters - and paragraphs for Europarl. - """ - - def _read_word_block(self, stream): - words = [] - for i in range(20): # Read 20 lines at a time. 
- words.extend(stream.readline().split()) - return words - - def _read_sent_block(self, stream): - sents = [] - for para in self._para_block_reader(stream): - sents.extend([sent.split() for sent in para.splitlines()]) - return sents - - def _read_para_block(self, stream): - paras = [] - for para in self._para_block_reader(stream): - paras.append([sent.split() for sent in para.splitlines()]) - return paras - - def chapters(self, fileids=None): - """ - :return: the given file(s) as a list of - chapters, each encoded as a list of sentences, which are - in turn encoded as lists of word strings. - :rtype: list(list(list(str))) - """ - return concat( - [ - self.CorpusView(fileid, self._read_para_block, encoding=enc) - for (fileid, enc) in self.abspaths(fileids, True) - ] - ) - - def paras(self, fileids=None): - raise NotImplementedError( - "The Europarl corpus reader does not support paragraphs. Please use chapters() instead." - ) diff --git a/pipeline/nltk/corpus/reader/ppattach.py b/pipeline/nltk/corpus/reader/ppattach.py deleted file mode 100644 index 0006e640e9ef30cb50fbdee621b13f2f78b484dd..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/ppattach.py +++ /dev/null @@ -1,95 +0,0 @@ -# Natural Language Toolkit: PP Attachment Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# Edward Loper -# URL: -# For license information, see LICENSE.TXT - -""" -Read lines from the Prepositional Phrase Attachment Corpus. - -The PP Attachment Corpus contains several files having the format: - -sentence_id verb noun1 preposition noun2 attachment - -For example: - -42960 gives authority to administration V -46742 gives inventors of microchip N - -The PP attachment is to the verb phrase (V) or noun phrase (N), i.e.: - -(VP gives (NP authority) (PP to administration)) -(VP gives (NP inventors (PP of microchip))) - -The corpus contains the following files: - -training: training set -devset: development test set, used for algorithm development. -test: test set, used to report results -bitstrings: word classes derived from Mutual Information Clustering for the Wall Street Journal. - -Ratnaparkhi, Adwait (1994). A Maximum Entropy Model for Prepositional -Phrase Attachment. Proceedings of the ARPA Human Language Technology -Conference. [http://www.cis.upenn.edu/~adwait/papers/hlt94.ps] - -The PP Attachment Corpus is distributed with NLTK with the permission -of the author. 
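A brief illustrative sketch (assuming the corpus data has been fetched, e.g. via
``nltk.download("ppattach")``; the fileids are "training", "devset" and "test"):

    >>> from nltk.corpus import ppattach                                 # doctest: +SKIP
    >>> inst = ppattach.attachments("training")[0]                       # doctest: +SKIP
    >>> inst.verb, inst.noun1, inst.prep, inst.noun2, inst.attachment    # doctest: +SKIP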
-""" - -from nltk.corpus.reader.api import * -from nltk.corpus.reader.util import * - - -class PPAttachment: - def __init__(self, sent, verb, noun1, prep, noun2, attachment): - self.sent = sent - self.verb = verb - self.noun1 = noun1 - self.prep = prep - self.noun2 = noun2 - self.attachment = attachment - - def __repr__(self): - return ( - "PPAttachment(sent=%r, verb=%r, noun1=%r, prep=%r, " - "noun2=%r, attachment=%r)" - % (self.sent, self.verb, self.noun1, self.prep, self.noun2, self.attachment) - ) - - -class PPAttachmentCorpusReader(CorpusReader): - """ - sentence_id verb noun1 preposition noun2 attachment - """ - - def attachments(self, fileids): - return concat( - [ - StreamBackedCorpusView(fileid, self._read_obj_block, encoding=enc) - for (fileid, enc) in self.abspaths(fileids, True) - ] - ) - - def tuples(self, fileids): - return concat( - [ - StreamBackedCorpusView(fileid, self._read_tuple_block, encoding=enc) - for (fileid, enc) in self.abspaths(fileids, True) - ] - ) - - def _read_tuple_block(self, stream): - line = stream.readline() - if line: - return [tuple(line.split())] - else: - return [] - - def _read_obj_block(self, stream): - line = stream.readline() - if line: - return [PPAttachment(*line.split())] - else: - return [] diff --git a/pipeline/nltk/corpus/reader/propbank.py b/pipeline/nltk/corpus/reader/propbank.py deleted file mode 100644 index c254a8416f2c1bb38f684819e43bae76a4308eeb..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/propbank.py +++ /dev/null @@ -1,520 +0,0 @@ -# Natural Language Toolkit: PropBank Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# URL: -# For license information, see LICENSE.TXT - -import re -from functools import total_ordering -from xml.etree import ElementTree - -from nltk.corpus.reader.api import * -from nltk.corpus.reader.util import * -from nltk.internals import raise_unorderable_types -from nltk.tree import Tree - - -class PropbankCorpusReader(CorpusReader): - """ - Corpus reader for the propbank corpus, which augments the Penn - Treebank with information about the predicate argument structure - of every verb instance. The corpus consists of two parts: the - predicate-argument annotations themselves, and a set of "frameset - files" which define the argument labels used by the annotations, - on a per-verb basis. Each "frameset file" contains one or more - predicates, such as ``'turn'`` or ``'turn_on'``, each of which is - divided into coarse-grained word senses called "rolesets". For - each "roleset", the frameset file provides descriptions of the - argument roles, along with examples. - """ - - def __init__( - self, - root, - propfile, - framefiles="", - verbsfile=None, - parse_fileid_xform=None, - parse_corpus=None, - encoding="utf8", - ): - """ - :param root: The root directory for this corpus. - :param propfile: The name of the file containing the predicate- - argument annotations (relative to ``root``). - :param framefiles: A list or regexp specifying the frameset - fileids for this corpus. - :param parse_fileid_xform: A transform that should be applied - to the fileids in this corpus. This should be a function - of one argument (a fileid) that returns a string (the new - fileid). - :param parse_corpus: The corpus containing the parse trees - corresponding to this corpus. These parse trees are - necessary to resolve the tree pointers used by propbank. - """ - # If framefiles is specified as a regexp, expand it. 
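        # (illustrative) e.g. framefiles=r"frames/.*\.xml" is expanded by
        # find_corpus_fileids() into the individual frames/<lemma>.xml fileids.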
- if isinstance(framefiles, str): - framefiles = find_corpus_fileids(root, framefiles) - framefiles = list(framefiles) - # Initialize the corpus reader. - CorpusReader.__init__(self, root, [propfile, verbsfile] + framefiles, encoding) - - # Record our frame fileids & prop file. - self._propfile = propfile - self._framefiles = framefiles - self._verbsfile = verbsfile - self._parse_fileid_xform = parse_fileid_xform - self._parse_corpus = parse_corpus - - def instances(self, baseform=None): - """ - :return: a corpus view that acts as a list of - ``PropBankInstance`` objects, one for each noun in the corpus. - """ - kwargs = {} - if baseform is not None: - kwargs["instance_filter"] = lambda inst: inst.baseform == baseform - return StreamBackedCorpusView( - self.abspath(self._propfile), - lambda stream: self._read_instance_block(stream, **kwargs), - encoding=self.encoding(self._propfile), - ) - - def lines(self): - """ - :return: a corpus view that acts as a list of strings, one for - each line in the predicate-argument annotation file. - """ - return StreamBackedCorpusView( - self.abspath(self._propfile), - read_line_block, - encoding=self.encoding(self._propfile), - ) - - def roleset(self, roleset_id): - """ - :return: the xml description for the given roleset. - """ - baseform = roleset_id.split(".")[0] - framefile = "frames/%s.xml" % baseform - if framefile not in self._framefiles: - raise ValueError("Frameset file for %s not found" % roleset_id) - - # n.b.: The encoding for XML fileids is specified by the file - # itself; so we ignore self._encoding here. - with self.abspath(framefile).open() as fp: - etree = ElementTree.parse(fp).getroot() - for roleset in etree.findall("predicate/roleset"): - if roleset.attrib["id"] == roleset_id: - return roleset - raise ValueError(f"Roleset {roleset_id} not found in {framefile}") - - def rolesets(self, baseform=None): - """ - :return: list of xml descriptions for rolesets. - """ - if baseform is not None: - framefile = "frames/%s.xml" % baseform - if framefile not in self._framefiles: - raise ValueError("Frameset file for %s not found" % baseform) - framefiles = [framefile] - else: - framefiles = self._framefiles - - rsets = [] - for framefile in framefiles: - # n.b.: The encoding for XML fileids is specified by the file - # itself; so we ignore self._encoding here. - with self.abspath(framefile).open() as fp: - etree = ElementTree.parse(fp).getroot() - rsets.append(etree.findall("predicate/roleset")) - return LazyConcatenation(rsets) - - def verbs(self): - """ - :return: a corpus view that acts as a list of all verb lemmas - in this corpus (from the verbs.txt file). - """ - return StreamBackedCorpusView( - self.abspath(self._verbsfile), - read_line_block, - encoding=self.encoding(self._verbsfile), - ) - - def _read_instance_block(self, stream, instance_filter=lambda inst: True): - block = [] - - # Read 100 at a time. 
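        # StreamBackedCorpusView calls this block reader repeatedly, so returning
        # a bounded block per call keeps instances() lazy instead of loading the
        # whole annotation file at once.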
- for i in range(100): - line = stream.readline().strip() - if line: - inst = PropbankInstance.parse( - line, self._parse_fileid_xform, self._parse_corpus - ) - if instance_filter(inst): - block.append(inst) - - return block - - -###################################################################### -# { Propbank Instance & related datatypes -###################################################################### - - -class PropbankInstance: - def __init__( - self, - fileid, - sentnum, - wordnum, - tagger, - roleset, - inflection, - predicate, - arguments, - parse_corpus=None, - ): - - self.fileid = fileid - """The name of the file containing the parse tree for this - instance's sentence.""" - - self.sentnum = sentnum - """The sentence number of this sentence within ``fileid``. - Indexing starts from zero.""" - - self.wordnum = wordnum - """The word number of this instance's predicate within its - containing sentence. Word numbers are indexed starting from - zero, and include traces and other empty parse elements.""" - - self.tagger = tagger - """An identifier for the tagger who tagged this instance; or - ``'gold'`` if this is an adjuticated instance.""" - - self.roleset = roleset - """The name of the roleset used by this instance's predicate. - Use ``propbank.roleset() `` to - look up information about the roleset.""" - - self.inflection = inflection - """A ``PropbankInflection`` object describing the inflection of - this instance's predicate.""" - - self.predicate = predicate - """A ``PropbankTreePointer`` indicating the position of this - instance's predicate within its containing sentence.""" - - self.arguments = tuple(arguments) - """A list of tuples (argloc, argid), specifying the location - and identifier for each of the predicate's argument in the - containing sentence. Argument identifiers are strings such as - ``'ARG0'`` or ``'ARGM-TMP'``. This list does *not* contain - the predicate.""" - - self.parse_corpus = parse_corpus - """A corpus reader for the parse trees corresponding to the - instances in this propbank corpus.""" - - @property - def baseform(self): - """The baseform of the predicate.""" - return self.roleset.split(".")[0] - - @property - def sensenumber(self): - """The sense number of the predicate.""" - return self.roleset.split(".")[1] - - @property - def predid(self): - """Identifier of the predicate.""" - return "rel" - - def __repr__(self): - return "".format( - self.fileid, - self.sentnum, - self.wordnum, - ) - - def __str__(self): - s = "{} {} {} {} {} {}".format( - self.fileid, - self.sentnum, - self.wordnum, - self.tagger, - self.roleset, - self.inflection, - ) - items = self.arguments + ((self.predicate, "rel"),) - for (argloc, argid) in sorted(items): - s += f" {argloc}-{argid}" - return s - - def _get_tree(self): - if self.parse_corpus is None: - return None - if self.fileid not in self.parse_corpus.fileids(): - return None - return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum] - - tree = property( - _get_tree, - doc=""" - The parse tree corresponding to this instance, or None if - the corresponding tree is not available.""", - ) - - @staticmethod - def parse(s, parse_fileid_xform=None, parse_corpus=None): - pieces = s.split() - if len(pieces) < 7: - raise ValueError("Badly formatted propbank line: %r" % s) - - # Divide the line into its basic pieces. 
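        # An annotation line has roughly this shape (values are illustrative only):
        #   wsj/00/wsj_0003.mrg 13 6 gold say.01 vp--a 6:0-rel 0:2-ARG0 7:1-ARG1
        # i.e. fileid sentnum wordnum tagger roleset inflection, followed by the
        # "-rel" pointer and the argument pointers.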
- (fileid, sentnum, wordnum, tagger, roleset, inflection) = pieces[:6] - rel = [p for p in pieces[6:] if p.endswith("-rel")] - args = [p for p in pieces[6:] if not p.endswith("-rel")] - if len(rel) != 1: - raise ValueError("Badly formatted propbank line: %r" % s) - - # Apply the fileid selector, if any. - if parse_fileid_xform is not None: - fileid = parse_fileid_xform(fileid) - - # Convert sentence & word numbers to ints. - sentnum = int(sentnum) - wordnum = int(wordnum) - - # Parse the inflection - inflection = PropbankInflection.parse(inflection) - - # Parse the predicate location. - predicate = PropbankTreePointer.parse(rel[0][:-4]) - - # Parse the arguments. - arguments = [] - for arg in args: - argloc, argid = arg.split("-", 1) - arguments.append((PropbankTreePointer.parse(argloc), argid)) - - # Put it all together. - return PropbankInstance( - fileid, - sentnum, - wordnum, - tagger, - roleset, - inflection, - predicate, - arguments, - parse_corpus, - ) - - -class PropbankPointer: - """ - A pointer used by propbank to identify one or more constituents in - a parse tree. ``PropbankPointer`` is an abstract base class with - three concrete subclasses: - - - ``PropbankTreePointer`` is used to point to single constituents. - - ``PropbankSplitTreePointer`` is used to point to 'split' - constituents, which consist of a sequence of two or more - ``PropbankTreePointer`` pointers. - - ``PropbankChainTreePointer`` is used to point to entire trace - chains in a tree. It consists of a sequence of pieces, which - can be ``PropbankTreePointer`` or ``PropbankSplitTreePointer`` pointers. - """ - - def __init__(self): - if self.__class__ == PropbankPointer: - raise NotImplementedError() - - -class PropbankChainTreePointer(PropbankPointer): - def __init__(self, pieces): - self.pieces = pieces - """A list of the pieces that make up this chain. Elements may - be either ``PropbankSplitTreePointer`` or - ``PropbankTreePointer`` pointers.""" - - def __str__(self): - return "*".join("%s" % p for p in self.pieces) - - def __repr__(self): - return "" % self - - def select(self, tree): - if tree is None: - raise ValueError("Parse tree not available") - return Tree("*CHAIN*", [p.select(tree) for p in self.pieces]) - - -class PropbankSplitTreePointer(PropbankPointer): - def __init__(self, pieces): - self.pieces = pieces - """A list of the pieces that make up this chain. Elements are - all ``PropbankTreePointer`` pointers.""" - - def __str__(self): - return ",".join("%s" % p for p in self.pieces) - - def __repr__(self): - return "" % self - - def select(self, tree): - if tree is None: - raise ValueError("Parse tree not available") - return Tree("*SPLIT*", [p.select(tree) for p in self.pieces]) - - -@total_ordering -class PropbankTreePointer(PropbankPointer): - """ - wordnum:height*wordnum:height*... - wordnum:height, - - """ - - def __init__(self, wordnum, height): - self.wordnum = wordnum - self.height = height - - @staticmethod - def parse(s): - # Deal with chains (xx*yy*zz) - pieces = s.split("*") - if len(pieces) > 1: - return PropbankChainTreePointer( - [PropbankTreePointer.parse(elt) for elt in pieces] - ) - - # Deal with split args (xx,yy,zz) - pieces = s.split(",") - if len(pieces) > 1: - return PropbankSplitTreePointer( - [PropbankTreePointer.parse(elt) for elt in pieces] - ) - - # Deal with normal pointers. 
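        # (illustrative) a plain pointer like "5:2" is wordnum 5 with tree height 2;
        # the chain ("*") and split (",") forms were already handled above.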
- pieces = s.split(":") - if len(pieces) != 2: - raise ValueError("bad propbank pointer %r" % s) - return PropbankTreePointer(int(pieces[0]), int(pieces[1])) - - def __str__(self): - return f"{self.wordnum}:{self.height}" - - def __repr__(self): - return "PropbankTreePointer(%d, %d)" % (self.wordnum, self.height) - - def __eq__(self, other): - while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)): - other = other.pieces[0] - - if not isinstance(other, PropbankTreePointer): - return self is other - - return self.wordnum == other.wordnum and self.height == other.height - - def __ne__(self, other): - return not self == other - - def __lt__(self, other): - while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)): - other = other.pieces[0] - - if not isinstance(other, PropbankTreePointer): - return id(self) < id(other) - - return (self.wordnum, -self.height) < (other.wordnum, -other.height) - - def select(self, tree): - if tree is None: - raise ValueError("Parse tree not available") - return tree[self.treepos(tree)] - - def treepos(self, tree): - """ - Convert this pointer to a standard 'tree position' pointer, - given that it points to the given tree. - """ - if tree is None: - raise ValueError("Parse tree not available") - stack = [tree] - treepos = [] - - wordnum = 0 - while True: - # tree node: - if isinstance(stack[-1], Tree): - # Select the next child. - if len(treepos) < len(stack): - treepos.append(0) - else: - treepos[-1] += 1 - # Update the stack. - if treepos[-1] < len(stack[-1]): - stack.append(stack[-1][treepos[-1]]) - else: - # End of node's child list: pop up a level. - stack.pop() - treepos.pop() - # word node: - else: - if wordnum == self.wordnum: - return tuple(treepos[: len(treepos) - self.height - 1]) - else: - wordnum += 1 - stack.pop() - - -class PropbankInflection: - # { Inflection Form - INFINITIVE = "i" - GERUND = "g" - PARTICIPLE = "p" - FINITE = "v" - # { Inflection Tense - FUTURE = "f" - PAST = "p" - PRESENT = "n" - # { Inflection Aspect - PERFECT = "p" - PROGRESSIVE = "o" - PERFECT_AND_PROGRESSIVE = "b" - # { Inflection Person - THIRD_PERSON = "3" - # { Inflection Voice - ACTIVE = "a" - PASSIVE = "p" - # { Inflection - NONE = "-" - # } - - def __init__(self, form="-", tense="-", aspect="-", person="-", voice="-"): - self.form = form - self.tense = tense - self.aspect = aspect - self.person = person - self.voice = voice - - def __str__(self): - return self.form + self.tense + self.aspect + self.person + self.voice - - def __repr__(self): - return "" % self - - _VALIDATE = re.compile(r"[igpv\-][fpn\-][pob\-][3\-][ap\-]$") - - @staticmethod - def parse(s): - if not isinstance(s, str): - raise TypeError("expected a string") - if len(s) != 5 or not PropbankInflection._VALIDATE.match(s): - raise ValueError("Bad propbank inflection string %r" % s) - return PropbankInflection(*s) diff --git a/pipeline/nltk/corpus/reader/pros_cons.py b/pipeline/nltk/corpus/reader/pros_cons.py deleted file mode 100644 index 31f1b02f701bc68a652af9617751d78b1c04d56d..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/pros_cons.py +++ /dev/null @@ -1,133 +0,0 @@ -# Natural Language Toolkit: Pros and Cons Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Pierpaolo Pantone <24alsecondo@gmail.com> -# URL: -# For license information, see LICENSE.TXT - -""" -CorpusReader for the Pros and Cons dataset. 
- -- Pros and Cons dataset information - - -Contact: Bing Liu, liub@cs.uic.edu - https://www.cs.uic.edu/~liub - -Distributed with permission. - -Related papers: - -- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences". - Proceedings of the 22nd International Conference on Computational Linguistics - (Coling-2008), Manchester, 18-22 August, 2008. - -- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and Comparing - Opinions on the Web". Proceedings of the 14th international World Wide Web - conference (WWW-2005), May 10-14, 2005, in Chiba, Japan. -""" -import re - -from nltk.corpus.reader.api import * -from nltk.tokenize import * - - -class ProsConsCorpusReader(CategorizedCorpusReader, CorpusReader): - """ - Reader for the Pros and Cons sentence dataset. - - >>> from nltk.corpus import pros_cons - >>> pros_cons.sents(categories='Cons') # doctest: +NORMALIZE_WHITESPACE - [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy', - 'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'], - ...] - >>> pros_cons.words('IntegratedPros.txt') - ['Easy', 'to', 'use', ',', 'economical', '!', ...] - """ - - CorpusView = StreamBackedCorpusView - - def __init__( - self, - root, - fileids, - word_tokenizer=WordPunctTokenizer(), - encoding="utf8", - **kwargs - ): - """ - :param root: The root directory for the corpus. - :param fileids: a list or regexp specifying the fileids in the corpus. - :param word_tokenizer: a tokenizer for breaking sentences or paragraphs - into words. Default: `WhitespaceTokenizer` - :param encoding: the encoding that should be used to read the corpus. - :param kwargs: additional parameters passed to CategorizedCorpusReader. - """ - - CorpusReader.__init__(self, root, fileids, encoding) - CategorizedCorpusReader.__init__(self, kwargs) - self._word_tokenizer = word_tokenizer - - def sents(self, fileids=None, categories=None): - """ - Return all sentences in the corpus or in the specified files/categories. - - :param fileids: a list or regexp specifying the ids of the files whose - sentences have to be returned. - :param categories: a list specifying the categories whose sentences - have to be returned. - :return: the given file(s) as a list of sentences. Each sentence is - tokenized using the specified word_tokenizer. - :rtype: list(list(str)) - """ - fileids = self._resolve(fileids, categories) - if fileids is None: - fileids = self._fileids - elif isinstance(fileids, str): - fileids = [fileids] - return concat( - [ - self.CorpusView(path, self._read_sent_block, encoding=enc) - for (path, enc, fileid) in self.abspaths(fileids, True, True) - ] - ) - - def words(self, fileids=None, categories=None): - """ - Return all words and punctuation symbols in the corpus or in the specified - files/categories. - - :param fileids: a list or regexp specifying the ids of the files whose - words have to be returned. - :param categories: a list specifying the categories whose words have - to be returned. - :return: the given file(s) as a list of words and punctuation symbols. - :rtype: list(str) - """ - fileids = self._resolve(fileids, categories) - if fileids is None: - fileids = self._fileids - elif isinstance(fileids, str): - fileids = [fileids] - return concat( - [ - self.CorpusView(path, self._read_word_block, encoding=enc) - for (path, enc, fileid) in self.abspaths(fileids, True, True) - ] - ) - - def _read_sent_block(self, stream): - sents = [] - for i in range(20): # Read 20 lines at a time. 
- line = stream.readline() - if not line: - continue - sent = re.match(r"^(?!\n)\s*<(Pros|Cons)>(.*)", line) - if sent: - sents.append(self._word_tokenizer.tokenize(sent.group(2).strip())) - return sents - - def _read_word_block(self, stream): - words = [] - for sent in self._read_sent_block(stream): - words.extend(sent) - return words diff --git a/pipeline/nltk/corpus/reader/reviews.py b/pipeline/nltk/corpus/reader/reviews.py deleted file mode 100644 index 5f52425c0f7c260f62d7d953b90d241a6c00a2b8..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/reviews.py +++ /dev/null @@ -1,331 +0,0 @@ -# Natural Language Toolkit: Product Reviews Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Pierpaolo Pantone <24alsecondo@gmail.com> -# URL: -# For license information, see LICENSE.TXT - -""" -CorpusReader for reviews corpora (syntax based on Customer Review Corpus). - -Customer Review Corpus information -================================== - -Annotated by: Minqing Hu and Bing Liu, 2004. - Department of Computer Science - University of Illinois at Chicago - -Contact: Bing Liu, liub@cs.uic.edu - https://www.cs.uic.edu/~liub - -Distributed with permission. - -The "product_reviews_1" and "product_reviews_2" datasets respectively contain -annotated customer reviews of 5 and 9 products from amazon.com. - -Related papers: - -- Minqing Hu and Bing Liu. "Mining and summarizing customer reviews". - Proceedings of the ACM SIGKDD International Conference on Knowledge - Discovery & Data Mining (KDD-04), 2004. - -- Minqing Hu and Bing Liu. "Mining Opinion Features in Customer Reviews". - Proceedings of Nineteeth National Conference on Artificial Intelligence - (AAAI-2004), 2004. - -- Xiaowen Ding, Bing Liu and Philip S. Yu. "A Holistic Lexicon-Based Appraoch to - Opinion Mining." Proceedings of First ACM International Conference on Web - Search and Data Mining (WSDM-2008), Feb 11-12, 2008, Stanford University, - Stanford, California, USA. - -Symbols used in the annotated reviews: - - :[t]: the title of the review: Each [t] tag starts a review. - :xxxx[+|-n]: xxxx is a product feature. - :[+n]: Positive opinion, n is the opinion strength: 3 strongest, and 1 weakest. - Note that the strength is quite subjective. - You may want ignore it, but only considering + and - - :[-n]: Negative opinion - :##: start of each sentence. Each line is a sentence. - :[u]: feature not appeared in the sentence. - :[p]: feature not appeared in the sentence. Pronoun resolution is needed. - :[s]: suggestion or recommendation. - :[cc]: comparison with a competing product from a different brand. - :[cs]: comparison with a competing product from the same brand. - -Note: Some of the files (e.g. "ipod.txt", "Canon PowerShot SD500.txt") do not - provide separation between different reviews. This is due to the fact that - the dataset was specifically designed for aspect/feature-based sentiment - analysis, for which sentence-level annotation is sufficient. For document- - level classification and analysis, this peculiarity should be taken into - consideration. -""" - -import re - -from nltk.corpus.reader.api import * -from nltk.tokenize import * - -TITLE = re.compile(r"^\[t\](.*)$") # [t] Title -FEATURES = re.compile( - r"((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)\d)\]" -) # find 'feature' in feature[+3] -NOTES = re.compile(r"\[(?!t)(p|u|s|cc|cs)\]") # find 'p' in camera[+2][p] -SENT = re.compile(r"##(.*)$") # find tokenized sentence - - -class Review: - """ - A Review is the main block of a ReviewsCorpusReader. 
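# [Editor's note] Illustrative sketch, not part of the deleted file: applying
# the TITLE-less annotation patterns defined above (FEATURES / NOTES / SENT)
# to one invented line in the Customer Review format, using only the stdlib.
import re

FEATURES = re.compile(r"((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)\d)\]")
NOTES = re.compile(r"\[(?!t)(p|u|s|cc|cs)\]")
SENT = re.compile(r"##(.*)$")

line = "picture quality[+2][u]##the picture quality is excellent ."
print(re.findall(FEATURES, line))  # [('picture quality', '+2')]
print(re.findall(NOTES, line))     # ['u']
print(re.findall(SENT, line))      # ['the picture quality is excellent .']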
- """ - - def __init__(self, title=None, review_lines=None): - """ - :param title: the title of the review. - :param review_lines: the list of the ReviewLines that belong to the Review. - """ - self.title = title - if review_lines is None: - self.review_lines = [] - else: - self.review_lines = review_lines - - def add_line(self, review_line): - """ - Add a line (ReviewLine) to the review. - - :param review_line: a ReviewLine instance that belongs to the Review. - """ - assert isinstance(review_line, ReviewLine) - self.review_lines.append(review_line) - - def features(self): - """ - Return a list of features in the review. Each feature is a tuple made of - the specific item feature and the opinion strength about that feature. - - :return: all features of the review as a list of tuples (feat, score). - :rtype: list(tuple) - """ - features = [] - for review_line in self.review_lines: - features.extend(review_line.features) - return features - - def sents(self): - """ - Return all tokenized sentences in the review. - - :return: all sentences of the review as lists of tokens. - :rtype: list(list(str)) - """ - return [review_line.sent for review_line in self.review_lines] - - def __repr__(self): - return 'Review(title="{}", review_lines={})'.format( - self.title, self.review_lines - ) - - -class ReviewLine: - """ - A ReviewLine represents a sentence of the review, together with (optional) - annotations of its features and notes about the reviewed item. - """ - - def __init__(self, sent, features=None, notes=None): - self.sent = sent - if features is None: - self.features = [] - else: - self.features = features - - if notes is None: - self.notes = [] - else: - self.notes = notes - - def __repr__(self): - return "ReviewLine(features={}, notes={}, sent={})".format( - self.features, self.notes, self.sent - ) - - -class ReviewsCorpusReader(CorpusReader): - """ - Reader for the Customer Review Data dataset by Hu, Liu (2004). - Note: we are not applying any sentence tokenization at the moment, just word - tokenization. - - >>> from nltk.corpus import product_reviews_1 - >>> camera_reviews = product_reviews_1.reviews('Canon_G3.txt') - >>> review = camera_reviews[0] - >>> review.sents()[0] # doctest: +NORMALIZE_WHITESPACE - ['i', 'recently', 'purchased', 'the', 'canon', 'powershot', 'g3', 'and', 'am', - 'extremely', 'satisfied', 'with', 'the', 'purchase', '.'] - >>> review.features() # doctest: +NORMALIZE_WHITESPACE - [('canon powershot g3', '+3'), ('use', '+2'), ('picture', '+2'), - ('picture quality', '+1'), ('picture quality', '+1'), ('camera', '+2'), - ('use', '+2'), ('feature', '+1'), ('picture quality', '+3'), ('use', '+1'), - ('option', '+1')] - - We can also reach the same information directly from the stream: - - >>> product_reviews_1.features('Canon_G3.txt') - [('canon powershot g3', '+3'), ('use', '+2'), ...] - - We can compute stats for specific product features: - - >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture']) - >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture']) - >>> mean = tot / n_reviews - >>> print(n_reviews, tot, mean) - 15 24 1.6 - """ - - CorpusView = StreamBackedCorpusView - - def __init__( - self, root, fileids, word_tokenizer=WordPunctTokenizer(), encoding="utf8" - ): - """ - :param root: The root directory for the corpus. - :param fileids: a list or regexp specifying the fileids in the corpus. 
- :param word_tokenizer: a tokenizer for breaking sentences or paragraphs - into words. Default: `WordPunctTokenizer` - :param encoding: the encoding that should be used to read the corpus. - """ - - CorpusReader.__init__(self, root, fileids, encoding) - self._word_tokenizer = word_tokenizer - self._readme = "README.txt" - - def features(self, fileids=None): - """ - Return a list of features. Each feature is a tuple made of the specific - item feature and the opinion strength about that feature. - - :param fileids: a list or regexp specifying the ids of the files whose - features have to be returned. - :return: all features for the item(s) in the given file(s). - :rtype: list(tuple) - """ - if fileids is None: - fileids = self._fileids - elif isinstance(fileids, str): - fileids = [fileids] - return concat( - [ - self.CorpusView(fileid, self._read_features, encoding=enc) - for (fileid, enc) in self.abspaths(fileids, True) - ] - ) - - def reviews(self, fileids=None): - """ - Return all the reviews as a list of Review objects. If `fileids` is - specified, return all the reviews from each of the specified files. - - :param fileids: a list or regexp specifying the ids of the files whose - reviews have to be returned. - :return: the given file(s) as a list of reviews. - """ - if fileids is None: - fileids = self._fileids - return concat( - [ - self.CorpusView(fileid, self._read_review_block, encoding=enc) - for (fileid, enc) in self.abspaths(fileids, True) - ] - ) - - def sents(self, fileids=None): - """ - Return all sentences in the corpus or in the specified files. - - :param fileids: a list or regexp specifying the ids of the files whose - sentences have to be returned. - :return: the given file(s) as a list of sentences, each encoded as a - list of word strings. - :rtype: list(list(str)) - """ - return concat( - [ - self.CorpusView(path, self._read_sent_block, encoding=enc) - for (path, enc, fileid) in self.abspaths(fileids, True, True) - ] - ) - - def words(self, fileids=None): - """ - Return all words and punctuation symbols in the corpus or in the specified - files. - - :param fileids: a list or regexp specifying the ids of the files whose - words have to be returned. - :return: the given file(s) as a list of words and punctuation symbols. - :rtype: list(str) - """ - return concat( - [ - self.CorpusView(path, self._read_word_block, encoding=enc) - for (path, enc, fileid) in self.abspaths(fileids, True, True) - ] - ) - - def _read_features(self, stream): - features = [] - for i in range(20): - line = stream.readline() - if not line: - return features - features.extend(re.findall(FEATURES, line)) - return features - - def _read_review_block(self, stream): - while True: - line = stream.readline() - if not line: - return [] # end of file. - title_match = re.match(TITLE, line) - if title_match: - review = Review( - title=title_match.group(1).strip() - ) # We create a new review - break - - # Scan until we find another line matching the regexp, or EOF. - while True: - oldpos = stream.tell() - line = stream.readline() - # End of file: - if not line: - return [review] - # Start of a new review: backup to just before it starts, and - # return the review we've already collected. - if re.match(TITLE, line): - stream.seek(oldpos) - return [review] - # Anything else is part of the review line. 
- feats = re.findall(FEATURES, line) - notes = re.findall(NOTES, line) - sent = re.findall(SENT, line) - if sent: - sent = self._word_tokenizer.tokenize(sent[0]) - review_line = ReviewLine(sent=sent, features=feats, notes=notes) - review.add_line(review_line) - - def _read_sent_block(self, stream): - sents = [] - for review in self._read_review_block(stream): - sents.extend([sent for sent in review.sents()]) - return sents - - def _read_word_block(self, stream): - words = [] - for i in range(20): # Read 20 lines at a time. - line = stream.readline() - sent = re.findall(SENT, line) - if sent: - words.extend(self._word_tokenizer.tokenize(sent[0])) - return words diff --git a/pipeline/nltk/corpus/reader/rte.py b/pipeline/nltk/corpus/reader/rte.py deleted file mode 100644 index 98261fae9adf04ecf6938c966ec3cae4fcc775a2..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/rte.py +++ /dev/null @@ -1,146 +0,0 @@ -# Natural Language Toolkit: RTE Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Ewan Klein -# URL: -# For license information, see LICENSE.TXT - -""" -Corpus reader for the Recognizing Textual Entailment (RTE) Challenge Corpora. - -The files were taken from the RTE1, RTE2 and RTE3 datasets and the files -were regularized. - -Filenames are of the form rte*_dev.xml and rte*_test.xml. The latter are the -gold standard annotated files. - -Each entailment corpus is a list of 'text'/'hypothesis' pairs. The following -example is taken from RTE3:: - - - - The sale was made to pay Yukos' US$ 27.5 billion tax bill, - Yuganskneftegaz was originally sold for US$ 9.4 billion to a little known - company Baikalfinansgroup which was later bought by the Russian - state-owned oil company Rosneft . - - Baikalfinansgroup was sold to Rosneft. - - -In order to provide globally unique IDs for each pair, a new attribute -``challenge`` has been added to the root element ``entailment-corpus`` of each -file, taking values 1, 2 or 3. The GID is formatted 'm-n', where 'm' is the -challenge number and 'n' is the pair ID. -""" -from nltk.corpus.reader.api import * -from nltk.corpus.reader.util import * -from nltk.corpus.reader.xmldocs import * - - -def norm(value_string): - """ - Normalize the string value in an RTE pair's ``value`` or ``entailment`` - attribute as an integer (1, 0). - - :param value_string: the label used to classify a text/hypothesis pair - :type value_string: str - :rtype: int - """ - - valdict = {"TRUE": 1, "FALSE": 0, "YES": 1, "NO": 0} - return valdict[value_string.upper()] - - -class RTEPair: - """ - Container for RTE text-hypothesis pairs. - - The entailment relation is signalled by the ``value`` attribute in RTE1, and by - ``entailment`` in RTE2 and RTE3. These both get mapped on to the ``entailment`` - attribute of this class. 
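# [Editor's note] Illustrative sketch, not part of the deleted file: reading
# text/hypothesis pairs through nltk's bundled RTE corpus (requires
# nltk.download('rte')); the fileid below follows the rte*_dev.xml scheme
# described above and is an assumed example.
from nltk.corpus import rte

pair = rte.pairs(["rte3_dev.xml"])[0]
print(pair.gid, pair.value)   # globally unique id and the normalized 0/1 label
print(pair.text)
print(pair.hyp)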
- """ - - def __init__( - self, - pair, - challenge=None, - id=None, - text=None, - hyp=None, - value=None, - task=None, - length=None, - ): - """ - :param challenge: version of the RTE challenge (i.e., RTE1, RTE2 or RTE3) - :param id: identifier for the pair - :param text: the text component of the pair - :param hyp: the hypothesis component of the pair - :param value: classification label for the pair - :param task: attribute for the particular NLP task that the data was drawn from - :param length: attribute for the length of the text of the pair - """ - self.challenge = challenge - self.id = pair.attrib["id"] - self.gid = f"{self.challenge}-{self.id}" - self.text = pair[0].text - self.hyp = pair[1].text - - if "value" in pair.attrib: - self.value = norm(pair.attrib["value"]) - elif "entailment" in pair.attrib: - self.value = norm(pair.attrib["entailment"]) - else: - self.value = value - if "task" in pair.attrib: - self.task = pair.attrib["task"] - else: - self.task = task - if "length" in pair.attrib: - self.length = pair.attrib["length"] - else: - self.length = length - - def __repr__(self): - if self.challenge: - return f"" - else: - return "" % self.id - - -class RTECorpusReader(XMLCorpusReader): - """ - Corpus reader for corpora in RTE challenges. - - This is just a wrapper around the XMLCorpusReader. See module docstring above for the expected - structure of input documents. - """ - - def _read_etree(self, doc): - """ - Map the XML input into an RTEPair. - - This uses the ``getiterator()`` method from the ElementTree package to - find all the ```` elements. - - :param doc: a parsed XML document - :rtype: list(RTEPair) - """ - try: - challenge = doc.attrib["challenge"] - except KeyError: - challenge = None - pairiter = doc.iter("pair") - return [RTEPair(pair, challenge=challenge) for pair in pairiter] - - def pairs(self, fileids): - """ - Build a list of RTEPairs from a RTE corpus. - - :param fileids: a list of RTE corpus fileids - :type: list - :rtype: list(RTEPair) - """ - if isinstance(fileids, str): - fileids = [fileids] - return concat([self._read_etree(self.xml(fileid)) for fileid in fileids]) diff --git a/pipeline/nltk/corpus/reader/semcor.py b/pipeline/nltk/corpus/reader/semcor.py deleted file mode 100644 index c44474280deda5087069e7c398eaab79656f97b3..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/semcor.py +++ /dev/null @@ -1,296 +0,0 @@ -# Natural Language Toolkit: SemCor Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Nathan Schneider -# URL: -# For license information, see LICENSE.TXT - -""" -Corpus reader for the SemCor Corpus. -""" - -__docformat__ = "epytext en" - -from nltk.corpus.reader.api import * -from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView -from nltk.tree import Tree - - -class SemcorCorpusReader(XMLCorpusReader): - """ - Corpus reader for the SemCor Corpus. - For access to the complete XML data structure, use the ``xml()`` - method. For access to simple word lists and tagged word lists, use - ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``. - """ - - def __init__(self, root, fileids, wordnet, lazy=True): - XMLCorpusReader.__init__(self, root, fileids) - self._lazy = lazy - self._wordnet = wordnet - - def words(self, fileids=None): - """ - :return: the given file(s) as a list of words and punctuation symbols. 
- :rtype: list(str) - """ - return self._items(fileids, "word", False, False, False) - - def chunks(self, fileids=None): - """ - :return: the given file(s) as a list of chunks, - each of which is a list of words and punctuation symbols - that form a unit. - :rtype: list(list(str)) - """ - return self._items(fileids, "chunk", False, False, False) - - def tagged_chunks(self, fileids=None, tag=("pos" or "sem" or "both")): - """ - :return: the given file(s) as a list of tagged chunks, represented - in tree form. - :rtype: list(Tree) - - :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'` - to indicate the kind of tags to include. Semantic tags consist of - WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity - without a specific entry in WordNet. (Named entities of type 'other' - have no lemma. Other chunks not in WordNet have no semantic tag. - Punctuation tokens have `None` for their part of speech tag.) - """ - return self._items(fileids, "chunk", False, tag != "sem", tag != "pos") - - def sents(self, fileids=None): - """ - :return: the given file(s) as a list of sentences, each encoded - as a list of word strings. - :rtype: list(list(str)) - """ - return self._items(fileids, "word", True, False, False) - - def chunk_sents(self, fileids=None): - """ - :return: the given file(s) as a list of sentences, each encoded - as a list of chunks. - :rtype: list(list(list(str))) - """ - return self._items(fileids, "chunk", True, False, False) - - def tagged_sents(self, fileids=None, tag=("pos" or "sem" or "both")): - """ - :return: the given file(s) as a list of sentences. Each sentence - is represented as a list of tagged chunks (in tree form). - :rtype: list(list(Tree)) - - :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'` - to indicate the kind of tags to include. Semantic tags consist of - WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity - without a specific entry in WordNet. (Named entities of type 'other' - have no lemma. Other chunks not in WordNet have no semantic tag. - Punctuation tokens have `None` for their part of speech tag.) - """ - return self._items(fileids, "chunk", True, tag != "sem", tag != "pos") - - def _items(self, fileids, unit, bracket_sent, pos_tag, sem_tag): - if unit == "word" and not bracket_sent: - # the result of the SemcorWordView may be a multiword unit, so the - # LazyConcatenation will make sure the sentence is flattened - _ = lambda *args: LazyConcatenation( - (SemcorWordView if self._lazy else self._words)(*args) - ) - else: - _ = SemcorWordView if self._lazy else self._words - return concat( - [ - _(fileid, unit, bracket_sent, pos_tag, sem_tag, self._wordnet) - for fileid in self.abspaths(fileids) - ] - ) - - def _words(self, fileid, unit, bracket_sent, pos_tag, sem_tag): - """ - Helper used to implement the view methods -- returns a list of - tokens, (segmented) words, chunks, or sentences. The tokens - and chunks may optionally be tagged (with POS and sense - information). - - :param fileid: The name of the underlying file. - :param unit: One of `'token'`, `'word'`, or `'chunk'`. - :param bracket_sent: If true, include sentence bracketing. - :param pos_tag: Whether to include part-of-speech tags. - :param sem_tag: Whether to include semantic tags, namely WordNet lemma - and OOV named entity status. 
- """ - assert unit in ("token", "word", "chunk") - result = [] - - xmldoc = ElementTree.parse(fileid).getroot() - for xmlsent in xmldoc.findall(".//s"): - sent = [] - for xmlword in _all_xmlwords_in(xmlsent): - itm = SemcorCorpusReader._word( - xmlword, unit, pos_tag, sem_tag, self._wordnet - ) - if unit == "word": - sent.extend(itm) - else: - sent.append(itm) - - if bracket_sent: - result.append(SemcorSentence(xmlsent.attrib["snum"], sent)) - else: - result.extend(sent) - - assert None not in result - return result - - @staticmethod - def _word(xmlword, unit, pos_tag, sem_tag, wordnet): - tkn = xmlword.text - if not tkn: - tkn = "" # fixes issue 337? - - lemma = xmlword.get("lemma", tkn) # lemma or NE class - lexsn = xmlword.get("lexsn") # lex_sense (locator for the lemma's sense) - if lexsn is not None: - sense_key = lemma + "%" + lexsn - wnpos = ("n", "v", "a", "r", "s")[ - int(lexsn.split(":")[0]) - 1 - ] # see http://wordnet.princeton.edu/man/senseidx.5WN.html - else: - sense_key = wnpos = None - redef = xmlword.get( - "rdf", tkn - ) # redefinition--this indicates the lookup string - # does not exactly match the enclosed string, e.g. due to typographical adjustments - # or discontinuity of a multiword expression. If a redefinition has occurred, - # the "rdf" attribute holds its inflected form and "lemma" holds its lemma. - # For NEs, "rdf", "lemma", and "pn" all hold the same value (the NE class). - sensenum = xmlword.get("wnsn") # WordNet sense number - isOOVEntity = "pn" in xmlword.keys() # a "personal name" (NE) not in WordNet - pos = xmlword.get( - "pos" - ) # part of speech for the whole chunk (None for punctuation) - - if unit == "token": - if not pos_tag and not sem_tag: - itm = tkn - else: - itm = ( - (tkn,) - + ((pos,) if pos_tag else ()) - + ((lemma, wnpos, sensenum, isOOVEntity) if sem_tag else ()) - ) - return itm - else: - ww = tkn.split("_") # TODO: case where punctuation intervenes in MWE - if unit == "word": - return ww - else: - if sensenum is not None: - try: - sense = wordnet.lemma_from_key(sense_key) # Lemma object - except Exception: - # cannot retrieve the wordnet.Lemma object. possible reasons: - # (a) the wordnet corpus is not downloaded; - # (b) a nonexistent sense is annotated: e.g., such.s.00 triggers: - # nltk.corpus.reader.wordnet.WordNetError: No synset found for key u'such%5:00:01:specified:00' - # solution: just use the lemma name as a string - try: - sense = "%s.%s.%02d" % ( - lemma, - wnpos, - int(sensenum), - ) # e.g.: reach.v.02 - except ValueError: - sense = ( - lemma + "." + wnpos + "." + sensenum - ) # e.g. the sense number may be "2;1" - - bottom = [Tree(pos, ww)] if pos_tag else ww - - if sem_tag and isOOVEntity: - if sensenum is not None: - return Tree(sense, [Tree("NE", bottom)]) - else: # 'other' NE - return Tree("NE", bottom) - elif sem_tag and sensenum is not None: - return Tree(sense, bottom) - elif pos_tag: - return bottom[0] - else: - return bottom # chunk as a list - - -def _all_xmlwords_in(elt, result=None): - if result is None: - result = [] - for child in elt: - if child.tag in ("wf", "punc"): - result.append(child) - else: - _all_xmlwords_in(child, result) - return result - - -class SemcorSentence(list): - """ - A list of words, augmented by an attribute ``num`` used to record - the sentence identifier (the ``n`` attribute from the XML). 
- """ - - def __init__(self, num, items): - self.num = num - list.__init__(self, items) - - -class SemcorWordView(XMLCorpusView): - """ - A stream backed corpus view specialized for use with the BNC corpus. - """ - - def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag, wordnet): - """ - :param fileid: The name of the underlying file. - :param unit: One of `'token'`, `'word'`, or `'chunk'`. - :param bracket_sent: If true, include sentence bracketing. - :param pos_tag: Whether to include part-of-speech tags. - :param sem_tag: Whether to include semantic tags, namely WordNet lemma - and OOV named entity status. - """ - if bracket_sent: - tagspec = ".*/s" - else: - tagspec = ".*/s/(punc|wf)" - - self._unit = unit - self._sent = bracket_sent - self._pos_tag = pos_tag - self._sem_tag = sem_tag - self._wordnet = wordnet - - XMLCorpusView.__init__(self, fileid, tagspec) - - def handle_elt(self, elt, context): - if self._sent: - return self.handle_sent(elt) - else: - return self.handle_word(elt) - - def handle_word(self, elt): - return SemcorCorpusReader._word( - elt, self._unit, self._pos_tag, self._sem_tag, self._wordnet - ) - - def handle_sent(self, elt): - sent = [] - for child in elt: - if child.tag in ("wf", "punc"): - itm = self.handle_word(child) - if self._unit == "word": - sent.extend(itm) - else: - sent.append(itm) - else: - raise ValueError("Unexpected element %s" % child.tag) - return SemcorSentence(elt.attrib["snum"], sent) diff --git a/pipeline/nltk/corpus/reader/senseval.py b/pipeline/nltk/corpus/reader/senseval.py deleted file mode 100644 index 99f09fe9f486f7770bddb290550f844898aef966..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/senseval.py +++ /dev/null @@ -1,196 +0,0 @@ -# Natural Language Toolkit: Senseval 2 Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Trevor Cohn -# Steven Bird (modifications) -# URL: -# For license information, see LICENSE.TXT - -""" -Read from the Senseval 2 Corpus. - -SENSEVAL [http://www.senseval.org/] -Evaluation exercises for Word Sense Disambiguation. -Organized by ACL-SIGLEX [https://www.siglex.org/] - -Prepared by Ted Pedersen , University of Minnesota, -https://www.d.umn.edu/~tpederse/data.html -Distributed with permission. - -The NLTK version of the Senseval 2 files uses well-formed XML. -Each instance of the ambiguous words "hard", "interest", "line", and "serve" -is tagged with a sense identifier, and supplied with context. 
-""" - -import re -from xml.etree import ElementTree - -from nltk.corpus.reader.api import * -from nltk.corpus.reader.util import * -from nltk.tokenize import * - - -class SensevalInstance: - def __init__(self, word, position, context, senses): - self.word = word - self.senses = tuple(senses) - self.position = position - self.context = context - - def __repr__(self): - return "SensevalInstance(word=%r, position=%r, " "context=%r, senses=%r)" % ( - self.word, - self.position, - self.context, - self.senses, - ) - - -class SensevalCorpusReader(CorpusReader): - def instances(self, fileids=None): - return concat( - [ - SensevalCorpusView(fileid, enc) - for (fileid, enc) in self.abspaths(fileids, True) - ] - ) - - def _entry(self, tree): - elts = [] - for lexelt in tree.findall("lexelt"): - for inst in lexelt.findall("instance"): - sense = inst[0].attrib["senseid"] - context = [(w.text, w.attrib["pos"]) for w in inst[1]] - elts.append((sense, context)) - return elts - - -class SensevalCorpusView(StreamBackedCorpusView): - def __init__(self, fileid, encoding): - StreamBackedCorpusView.__init__(self, fileid, encoding=encoding) - - self._word_tokenizer = WhitespaceTokenizer() - self._lexelt_starts = [0] # list of streampos - self._lexelts = [None] # list of lexelt names - - def read_block(self, stream): - # Decide which lexical element we're in. - lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell()) - 1 - lexelt = self._lexelts[lexelt_num] - - instance_lines = [] - in_instance = False - while True: - line = stream.readline() - if line == "": - assert instance_lines == [] - return [] - - # Start of a lexical element? - if line.lstrip().startswith(" has no 'item=...' - lexelt = m.group(1)[1:-1] - if lexelt_num < len(self._lexelts): - assert lexelt == self._lexelts[lexelt_num] - else: - self._lexelts.append(lexelt) - self._lexelt_starts.append(stream.tell()) - - # Start of an instance? - if line.lstrip().startswith("" - elif cword.tag == "wf": - context.append((cword.text, cword.attrib["pos"])) - elif cword.tag == "s": - pass # Sentence boundary marker. - - else: - print("ACK", cword.tag) - assert False, "expected CDATA or or " - if cword.tail: - context += self._word_tokenizer.tokenize(cword.tail) - else: - assert False, "unexpected tag %s" % child.tag - return SensevalInstance(lexelt, position, context, senses) - - -def _fixXML(text): - """ - Fix the various issues with Senseval pseudo-XML. - """ - # <~> or <^> => ~ or ^ - text = re.sub(r"<([~\^])>", r"\1", text) - # fix lone & - text = re.sub(r"(\s+)\&(\s+)", r"\1&\2", text) - # fix """ - text = re.sub(r'"""', "'\"'", text) - # fix => - text = re.sub(r'(<[^<]*snum=)([^">]+)>', r'\1"\2"/>', text) - # fix foreign word tag - text = re.sub(r"<\&frasl>\s*]*>", "FRASL", text) - # remove <&I .> - text = re.sub(r"<\&I[^>]*>", "", text) - # fix <{word}> - text = re.sub(r"<{([^}]+)}>", r"\1", text) - # remove <@>,
    - text = re.sub(r"<(@|/?p)>", r"", text) - # remove <&M .> and <&T .> and <&Ms .> - text = re.sub(r"<&\w+ \.>", r"", text) - # remove lines - text = re.sub(r"]*>", r"", text) - # remove <[hi]> and <[/p]> etc - text = re.sub(r"<\[\/?[^>]+\]*>", r"", text) - # take the thing out of the brackets: <…> - text = re.sub(r"<(\&\w+;)>", r"\1", text) - # and remove the & for those patterns that aren't regular XML - text = re.sub(r"&(?!amp|gt|lt|apos|quot)", r"", text) - # fix 'abc ' style tags - now abc - text = re.sub( - r'[ \t]*([^<>\s]+?)[ \t]*', r' \1', text - ) - text = re.sub(r'\s*"\s*', " \"", text) - return text diff --git a/pipeline/nltk/corpus/reader/sentiwordnet.py b/pipeline/nltk/corpus/reader/sentiwordnet.py deleted file mode 100644 index 42426100da71cf1d6b23353a22ce2e074837424d..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/sentiwordnet.py +++ /dev/null @@ -1,136 +0,0 @@ -# Natural Language Toolkit: SentiWordNet -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Christopher Potts -# URL: -# For license information, see LICENSE.TXT - -""" -An NLTK interface for SentiWordNet - -SentiWordNet is a lexical resource for opinion mining. -SentiWordNet assigns to each synset of WordNet three -sentiment scores: positivity, negativity, and objectivity. - -For details about SentiWordNet see: -http://sentiwordnet.isti.cnr.it/ - - >>> from nltk.corpus import sentiwordnet as swn - >>> print(swn.senti_synset('breakdown.n.03')) - - >>> list(swn.senti_synsets('slow')) - [SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'),\ - SentiSynset('slow.v.03'), SentiSynset('slow.a.01'),\ - SentiSynset('slow.a.02'), SentiSynset('dense.s.04'),\ - SentiSynset('slow.a.04'), SentiSynset('boring.s.01'),\ - SentiSynset('dull.s.08'), SentiSynset('slowly.r.01'),\ - SentiSynset('behind.r.03')] - >>> happy = swn.senti_synsets('happy', 'a') - >>> happy0 = list(happy)[0] - >>> happy0.pos_score() - 0.875 - >>> happy0.neg_score() - 0.0 - >>> happy0.obj_score() - 0.125 -""" - -import re - -from nltk.corpus.reader import CorpusReader - - -class SentiWordNetCorpusReader(CorpusReader): - def __init__(self, root, fileids, encoding="utf-8"): - """ - Construct a new SentiWordNet Corpus Reader, using data from - the specified file. 
- """ - super().__init__(root, fileids, encoding=encoding) - if len(self._fileids) != 1: - raise ValueError("Exactly one file must be specified") - self._db = {} - self._parse_src_file() - - def _parse_src_file(self): - lines = self.open(self._fileids[0]).read().splitlines() - lines = filter((lambda x: not re.search(r"^\s*#", x)), lines) - for i, line in enumerate(lines): - fields = [field.strip() for field in re.split(r"\t+", line)] - try: - pos, offset, pos_score, neg_score, synset_terms, gloss = fields - except BaseException as e: - raise ValueError(f"Line {i} formatted incorrectly: {line}\n") from e - if pos and offset: - offset = int(offset) - self._db[(pos, offset)] = (float(pos_score), float(neg_score)) - - def senti_synset(self, *vals): - from nltk.corpus import wordnet as wn - - if tuple(vals) in self._db: - pos_score, neg_score = self._db[tuple(vals)] - pos, offset = vals - if pos == "s": - pos = "a" - synset = wn.synset_from_pos_and_offset(pos, offset) - return SentiSynset(pos_score, neg_score, synset) - else: - synset = wn.synset(vals[0]) - pos = synset.pos() - if pos == "s": - pos = "a" - offset = synset.offset() - if (pos, offset) in self._db: - pos_score, neg_score = self._db[(pos, offset)] - return SentiSynset(pos_score, neg_score, synset) - else: - return None - - def senti_synsets(self, string, pos=None): - from nltk.corpus import wordnet as wn - - sentis = [] - synset_list = wn.synsets(string, pos) - for synset in synset_list: - sentis.append(self.senti_synset(synset.name())) - sentis = filter(lambda x: x, sentis) - return sentis - - def all_senti_synsets(self): - from nltk.corpus import wordnet as wn - - for key, fields in self._db.items(): - pos, offset = key - pos_score, neg_score = fields - synset = wn.synset_from_pos_and_offset(pos, offset) - yield SentiSynset(pos_score, neg_score, synset) - - -class SentiSynset: - def __init__(self, pos_score, neg_score, synset): - self._pos_score = pos_score - self._neg_score = neg_score - self._obj_score = 1.0 - (self._pos_score + self._neg_score) - self.synset = synset - - def pos_score(self): - return self._pos_score - - def neg_score(self): - return self._neg_score - - def obj_score(self): - return self._obj_score - - def __str__(self): - """Prints just the Pos/Neg scores for now.""" - s = "<" - s += self.synset.name() + ": " - s += "PosScore=%s " % self._pos_score - s += "NegScore=%s" % self._neg_score - s += ">" - return s - - def __repr__(self): - return "Senti" + repr(self.synset) diff --git a/pipeline/nltk/corpus/reader/sinica_treebank.py b/pipeline/nltk/corpus/reader/sinica_treebank.py deleted file mode 100644 index 6aa7f5ec9f34114c499721650bbb307413dd7804..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/sinica_treebank.py +++ /dev/null @@ -1,75 +0,0 @@ -# Natural Language Toolkit: Sinica Treebank Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# URL: -# For license information, see LICENSE.TXT - -""" -Sinica Treebank Corpus Sample - -http://rocling.iis.sinica.edu.tw/CKIP/engversion/treebank.htm - -10,000 parsed sentences, drawn from the Academia Sinica Balanced -Corpus of Modern Chinese. Parse tree notation is based on -Information-based Case Grammar. 
Tagset documentation is available -at https://www.sinica.edu.tw/SinicaCorpus/modern_e_wordtype.html - -Language and Knowledge Processing Group, Institute of Information -Science, Academia Sinica - -The data is distributed with the Natural Language Toolkit under the terms of -the Creative Commons Attribution-NonCommercial-ShareAlike License -[https://creativecommons.org/licenses/by-nc-sa/2.5/]. - -References: - -Feng-Yi Chen, Pi-Fang Tsai, Keh-Jiann Chen, and Chu-Ren Huang (1999) -The Construction of Sinica Treebank. Computational Linguistics and -Chinese Language Processing, 4, pp 87-104. - -Huang Chu-Ren, Keh-Jiann Chen, Feng-Yi Chen, Keh-Jiann Chen, Zhao-Ming -Gao, and Kuang-Yu Chen. 2000. Sinica Treebank: Design Criteria, -Annotation Guidelines, and On-line Interface. Proceedings of 2nd -Chinese Language Processing Workshop, Association for Computational -Linguistics. - -Chen Keh-Jiann and Yu-Ming Hsieh (2004) Chinese Treebanks and Grammar -Extraction, Proceedings of IJCNLP-04, pp560-565. -""" - -from nltk.corpus.reader.api import * -from nltk.corpus.reader.util import * -from nltk.tag import map_tag -from nltk.tree import sinica_parse - -IDENTIFIER = re.compile(r"^#\S+\s") -APPENDIX = re.compile(r"(?<=\))#.*$") -TAGWORD = re.compile(r":([^:()|]+):([^:()|]+)") -WORD = re.compile(r":[^:()|]+:([^:()|]+)") - - -class SinicaTreebankCorpusReader(SyntaxCorpusReader): - """ - Reader for the sinica treebank. - """ - - def _read_block(self, stream): - sent = stream.readline() - sent = IDENTIFIER.sub("", sent) - sent = APPENDIX.sub("", sent) - return [sent] - - def _parse(self, sent): - return sinica_parse(sent) - - def _tag(self, sent, tagset=None): - tagged_sent = [(w, t) for (t, w) in TAGWORD.findall(sent)] - if tagset and tagset != self._tagset: - tagged_sent = [ - (w, map_tag(self._tagset, tagset, t)) for (w, t) in tagged_sent - ] - return tagged_sent - - def _word(self, sent): - return WORD.findall(sent) diff --git a/pipeline/nltk/corpus/reader/string_category.py b/pipeline/nltk/corpus/reader/string_category.py deleted file mode 100644 index b4ae423eb920d6d86c0fce8a43881f7bdeaf5b35..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/string_category.py +++ /dev/null @@ -1,56 +0,0 @@ -# Natural Language Toolkit: String Category Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# Edward Loper -# URL: -# For license information, see LICENSE.TXT - -""" -Read tuples from a corpus consisting of categorized strings. -For example, from the question classification corpus: - -NUM:dist How far is it from Denver to Aspen ? -LOC:city What county is Modesto , California in ? -HUM:desc Who was Galileo ? -DESC:def What is an atom ? -NUM:date When did Hawaii become a state ? -""" - -from nltk.corpus.reader.api import * - -# based on PPAttachmentCorpusReader -from nltk.corpus.reader.util import * - - -# [xx] Should the order of the tuple be reversed -- in most other places -# in nltk, we use the form (data, tag) -- e.g., tagged words and -# labeled texts for classifiers. -class StringCategoryCorpusReader(CorpusReader): - def __init__(self, root, fileids, delimiter=" ", encoding="utf8"): - """ - :param root: The root directory for this corpus. - :param fileids: A list or regexp specifying the fileids in this corpus. 
- :param delimiter: Field delimiter - """ - CorpusReader.__init__(self, root, fileids, encoding) - self._delimiter = delimiter - - def tuples(self, fileids=None): - if fileids is None: - fileids = self._fileids - elif isinstance(fileids, str): - fileids = [fileids] - return concat( - [ - StreamBackedCorpusView(fileid, self._read_tuple_block, encoding=enc) - for (fileid, enc) in self.abspaths(fileids, True) - ] - ) - - def _read_tuple_block(self, stream): - line = stream.readline().strip() - if line: - return [tuple(line.split(self._delimiter, 1))] - else: - return [] diff --git a/pipeline/nltk/corpus/reader/switchboard.py b/pipeline/nltk/corpus/reader/switchboard.py deleted file mode 100644 index f6a396fb137ccf17c990f41268f77e176380acb1..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/switchboard.py +++ /dev/null @@ -1,125 +0,0 @@ -# Natural Language Toolkit: Switchboard Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# URL: -# For license information, see LICENSE.TXT -import re - -from nltk.corpus.reader.api import * -from nltk.corpus.reader.util import * -from nltk.tag import map_tag, str2tuple - - -class SwitchboardTurn(list): - """ - A specialized list object used to encode switchboard utterances. - The elements of the list are the words in the utterance; and two - attributes, ``speaker`` and ``id``, are provided to retrieve the - spearker identifier and utterance id. Note that utterance ids - are only unique within a given discourse. - """ - - def __init__(self, words, speaker, id): - list.__init__(self, words) - self.speaker = speaker - self.id = int(id) - - def __repr__(self): - if len(self) == 0: - text = "" - elif isinstance(self[0], tuple): - text = " ".join("%s/%s" % w for w in self) - else: - text = " ".join(self) - return f"<{self.speaker}.{self.id}: {text!r}>" - - -class SwitchboardCorpusReader(CorpusReader): - _FILES = ["tagged"] - # Use the "tagged" file even for non-tagged data methods, since - # it's tokenized. - - def __init__(self, root, tagset=None): - CorpusReader.__init__(self, root, self._FILES) - self._tagset = tagset - - def words(self): - return StreamBackedCorpusView(self.abspath("tagged"), self._words_block_reader) - - def tagged_words(self, tagset=None): - def tagged_words_block_reader(stream): - return self._tagged_words_block_reader(stream, tagset) - - return StreamBackedCorpusView(self.abspath("tagged"), tagged_words_block_reader) - - def turns(self): - return StreamBackedCorpusView(self.abspath("tagged"), self._turns_block_reader) - - def tagged_turns(self, tagset=None): - def tagged_turns_block_reader(stream): - return self._tagged_turns_block_reader(stream, tagset) - - return StreamBackedCorpusView(self.abspath("tagged"), tagged_turns_block_reader) - - def discourses(self): - return StreamBackedCorpusView( - self.abspath("tagged"), self._discourses_block_reader - ) - - def tagged_discourses(self, tagset=False): - def tagged_discourses_block_reader(stream): - return self._tagged_discourses_block_reader(stream, tagset) - - return StreamBackedCorpusView( - self.abspath("tagged"), tagged_discourses_block_reader - ) - - def _discourses_block_reader(self, stream): - # returns at most 1 discourse. (The other methods depend on this.) - return [ - [ - self._parse_utterance(u, include_tag=False) - for b in read_blankline_block(stream) - for u in b.split("\n") - if u.strip() - ] - ] - - def _tagged_discourses_block_reader(self, stream, tagset=None): - # returns at most 1 discourse. 
(The other methods depend on this.) - return [ - [ - self._parse_utterance(u, include_tag=True, tagset=tagset) - for b in read_blankline_block(stream) - for u in b.split("\n") - if u.strip() - ] - ] - - def _turns_block_reader(self, stream): - return self._discourses_block_reader(stream)[0] - - def _tagged_turns_block_reader(self, stream, tagset=None): - return self._tagged_discourses_block_reader(stream, tagset)[0] - - def _words_block_reader(self, stream): - return sum(self._discourses_block_reader(stream)[0], []) - - def _tagged_words_block_reader(self, stream, tagset=None): - return sum(self._tagged_discourses_block_reader(stream, tagset)[0], []) - - _UTTERANCE_RE = re.compile(r"(\w+)\.(\d+)\:\s*(.*)") - _SEP = "/" - - def _parse_utterance(self, utterance, include_tag, tagset=None): - m = self._UTTERANCE_RE.match(utterance) - if m is None: - raise ValueError("Bad utterance %r" % utterance) - speaker, id, text = m.groups() - words = [str2tuple(s, self._SEP) for s in text.split()] - if not include_tag: - words = [w for (w, t) in words] - elif tagset and tagset != self._tagset: - words = [(w, map_tag(self._tagset, tagset, t)) for (w, t) in words] - return SwitchboardTurn(words, speaker, id) diff --git a/pipeline/nltk/corpus/reader/tagged.py b/pipeline/nltk/corpus/reader/tagged.py deleted file mode 100644 index 2dcfe1b6ff5487a57da8b5e8a9b919eba8b3b6e3..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/tagged.py +++ /dev/null @@ -1,354 +0,0 @@ -# Natural Language Toolkit: Tagged Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird -# Jacob Perkins -# URL: -# For license information, see LICENSE.TXT - -""" -A reader for corpora whose documents contain part-of-speech-tagged words. -""" - -import os - -from nltk.corpus.reader.api import * -from nltk.corpus.reader.timit import read_timit_block -from nltk.corpus.reader.util import * -from nltk.tag import map_tag, str2tuple -from nltk.tokenize import * - - -class TaggedCorpusReader(CorpusReader): - """ - Reader for simple part-of-speech tagged corpora. Paragraphs are - assumed to be split using blank lines. Sentences and words can be - tokenized using the default tokenizers, or by custom tokenizers - specified as parameters to the constructor. Words are parsed - using ``nltk.tag.str2tuple``. By default, ``'/'`` is used as the - separator. I.e., words should have the form:: - - word1/tag1 word2/tag2 word3/tag3 ... - - But custom separators may be specified as parameters to the - constructor. Part of speech tags are case-normalized to upper - case. - """ - - def __init__( - self, - root, - fileids, - sep="/", - word_tokenizer=WhitespaceTokenizer(), - sent_tokenizer=RegexpTokenizer("\n", gaps=True), - para_block_reader=read_blankline_block, - encoding="utf8", - tagset=None, - ): - """ - Construct a new Tagged Corpus reader for a set of documents - located at the given root directory. Example usage: - - >>> root = '/...path to corpus.../' - >>> reader = TaggedCorpusReader(root, '.*', '.txt') # doctest: +SKIP - - :param root: The root directory for this corpus. - :param fileids: A list or regexp specifying the fileids in this corpus. - """ - CorpusReader.__init__(self, root, fileids, encoding) - self._sep = sep - self._word_tokenizer = word_tokenizer - self._sent_tokenizer = sent_tokenizer - self._para_block_reader = para_block_reader - self._tagset = tagset - - def words(self, fileids=None): - """ - :return: the given file(s) as a list of words - and punctuation symbols. 
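# [Editor's note] Illustrative sketch, not part of the deleted file: the
# word1/tag1 convention described above is parsed with nltk.tag.str2tuple,
# which is also what this reader uses internally.
from nltk.tag import str2tuple

print(str2tuple("fly/NN"))           # ('fly', 'NN')
print(str2tuple("fly|NN", sep="|"))  # custom separator, as with the `sep` parameter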
- :rtype: list(str) - """ - return concat( - [ - TaggedCorpusView( - fileid, - enc, - False, - False, - False, - self._sep, - self._word_tokenizer, - self._sent_tokenizer, - self._para_block_reader, - None, - ) - for (fileid, enc) in self.abspaths(fileids, True) - ] - ) - - def sents(self, fileids=None): - """ - :return: the given file(s) as a list of - sentences or utterances, each encoded as a list of word - strings. - :rtype: list(list(str)) - """ - return concat( - [ - TaggedCorpusView( - fileid, - enc, - False, - True, - False, - self._sep, - self._word_tokenizer, - self._sent_tokenizer, - self._para_block_reader, - None, - ) - for (fileid, enc) in self.abspaths(fileids, True) - ] - ) - - def paras(self, fileids=None): - """ - :return: the given file(s) as a list of - paragraphs, each encoded as a list of sentences, which are - in turn encoded as lists of word strings. - :rtype: list(list(list(str))) - """ - return concat( - [ - TaggedCorpusView( - fileid, - enc, - False, - True, - True, - self._sep, - self._word_tokenizer, - self._sent_tokenizer, - self._para_block_reader, - None, - ) - for (fileid, enc) in self.abspaths(fileids, True) - ] - ) - - def tagged_words(self, fileids=None, tagset=None): - """ - :return: the given file(s) as a list of tagged - words and punctuation symbols, encoded as tuples - ``(word,tag)``. - :rtype: list(tuple(str,str)) - """ - if tagset and tagset != self._tagset: - tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t) - else: - tag_mapping_function = None - return concat( - [ - TaggedCorpusView( - fileid, - enc, - True, - False, - False, - self._sep, - self._word_tokenizer, - self._sent_tokenizer, - self._para_block_reader, - tag_mapping_function, - ) - for (fileid, enc) in self.abspaths(fileids, True) - ] - ) - - def tagged_sents(self, fileids=None, tagset=None): - """ - :return: the given file(s) as a list of - sentences, each encoded as a list of ``(word,tag)`` tuples. - - :rtype: list(list(tuple(str,str))) - """ - if tagset and tagset != self._tagset: - tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t) - else: - tag_mapping_function = None - return concat( - [ - TaggedCorpusView( - fileid, - enc, - True, - True, - False, - self._sep, - self._word_tokenizer, - self._sent_tokenizer, - self._para_block_reader, - tag_mapping_function, - ) - for (fileid, enc) in self.abspaths(fileids, True) - ] - ) - - def tagged_paras(self, fileids=None, tagset=None): - """ - :return: the given file(s) as a list of - paragraphs, each encoded as a list of sentences, which are - in turn encoded as lists of ``(word,tag)`` tuples. - :rtype: list(list(list(tuple(str,str)))) - """ - if tagset and tagset != self._tagset: - tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t) - else: - tag_mapping_function = None - return concat( - [ - TaggedCorpusView( - fileid, - enc, - True, - True, - True, - self._sep, - self._word_tokenizer, - self._sent_tokenizer, - self._para_block_reader, - tag_mapping_function, - ) - for (fileid, enc) in self.abspaths(fileids, True) - ] - ) - - -class CategorizedTaggedCorpusReader(CategorizedCorpusReader, TaggedCorpusReader): - """ - A reader for part-of-speech tagged corpora whose documents are - divided into categories based on their file identifiers. - """ - - def __init__(self, *args, **kwargs): - """ - Initialize the corpus reader. Categorization arguments - (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to - the ``CategorizedCorpusReader`` constructor. 
The remaining arguments - are passed to the ``TaggedCorpusReader``. - """ - CategorizedCorpusReader.__init__(self, kwargs) - TaggedCorpusReader.__init__(self, *args, **kwargs) - - def tagged_words(self, fileids=None, categories=None, tagset=None): - return super().tagged_words(self._resolve(fileids, categories), tagset) - - def tagged_sents(self, fileids=None, categories=None, tagset=None): - return super().tagged_sents(self._resolve(fileids, categories), tagset) - - def tagged_paras(self, fileids=None, categories=None, tagset=None): - return super().tagged_paras(self._resolve(fileids, categories), tagset) - - -class TaggedCorpusView(StreamBackedCorpusView): - """ - A specialized corpus view for tagged documents. It can be - customized via flags to divide the tagged corpus documents up by - sentence or paragraph, and to include or omit part of speech tags. - ``TaggedCorpusView`` objects are typically created by - ``TaggedCorpusReader`` (not directly by nltk users). - """ - - def __init__( - self, - corpus_file, - encoding, - tagged, - group_by_sent, - group_by_para, - sep, - word_tokenizer, - sent_tokenizer, - para_block_reader, - tag_mapping_function=None, - ): - self._tagged = tagged - self._group_by_sent = group_by_sent - self._group_by_para = group_by_para - self._sep = sep - self._word_tokenizer = word_tokenizer - self._sent_tokenizer = sent_tokenizer - self._para_block_reader = para_block_reader - self._tag_mapping_function = tag_mapping_function - StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding) - - def read_block(self, stream): - """Reads one paragraph at a time.""" - block = [] - for para_str in self._para_block_reader(stream): - para = [] - for sent_str in self._sent_tokenizer.tokenize(para_str): - sent = [ - str2tuple(s, self._sep) - for s in self._word_tokenizer.tokenize(sent_str) - ] - if self._tag_mapping_function: - sent = [(w, self._tag_mapping_function(t)) for (w, t) in sent] - if not self._tagged: - sent = [w for (w, t) in sent] - if self._group_by_sent: - para.append(sent) - else: - para.extend(sent) - if self._group_by_para: - block.append(para) - else: - block.extend(para) - return block - - -# needs to implement simplified tags -class MacMorphoCorpusReader(TaggedCorpusReader): - """ - A corpus reader for the MAC_MORPHO corpus. Each line contains a - single tagged word, using '_' as a separator. Sentence boundaries - are based on the end-sentence tag ('_.'). Paragraph information - is not included in the corpus, so each paragraph returned by - ``self.paras()`` and ``self.tagged_paras()`` contains a single - sentence. - """ - - def __init__(self, root, fileids, encoding="utf8", tagset=None): - TaggedCorpusReader.__init__( - self, - root, - fileids, - sep="_", - word_tokenizer=LineTokenizer(), - sent_tokenizer=RegexpTokenizer(".*\n"), - para_block_reader=self._read_block, - encoding=encoding, - tagset=tagset, - ) - - def _read_block(self, stream): - return read_regexp_block(stream, r".*", r".*_\.") - - -class TimitTaggedCorpusReader(TaggedCorpusReader): - """ - A corpus reader for tagged sentences that are included in the TIMIT corpus. 
- """ - - def __init__(self, *args, **kwargs): - TaggedCorpusReader.__init__( - self, para_block_reader=read_timit_block, *args, **kwargs - ) - - def paras(self): - raise NotImplementedError("use sents() instead") - - def tagged_paras(self): - raise NotImplementedError("use tagged_sents() instead") diff --git a/pipeline/nltk/corpus/reader/timit.py b/pipeline/nltk/corpus/reader/timit.py deleted file mode 100644 index e399ac2ff31fd39c5dfc9ac9e9de0bc29d1f1842..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/timit.py +++ /dev/null @@ -1,510 +0,0 @@ -# Natural Language Toolkit: TIMIT Corpus Reader -# -# Copyright (C) 2001-2007 NLTK Project -# Author: Haejoong Lee -# Steven Bird -# Jacob Perkins -# URL: -# For license information, see LICENSE.TXT - -# [xx] this docstring is out-of-date: -""" -Read tokens, phonemes and audio data from the NLTK TIMIT Corpus. - -This corpus contains selected portion of the TIMIT corpus. - - - 16 speakers from 8 dialect regions - - 1 male and 1 female from each dialect region - - total 130 sentences (10 sentences per speaker. Note that some - sentences are shared among other speakers, especially sa1 and sa2 - are spoken by all speakers.) - - total 160 recording of sentences (10 recordings per speaker) - - audio format: NIST Sphere, single channel, 16kHz sampling, - 16 bit sample, PCM encoding - - -Module contents -=============== - -The timit corpus reader provides 4 functions and 4 data items. - - - utterances - - List of utterances in the corpus. There are total 160 utterances, - each of which corresponds to a unique utterance of a speaker. - Here's an example of an utterance identifier in the list:: - - dr1-fvmh0/sx206 - - _---- _--- - | | | | | - | | | | | - | | | | `--- sentence number - | | | `----- sentence type (a:all, i:shared, x:exclusive) - | | `--------- speaker ID - | `------------ sex (m:male, f:female) - `-------------- dialect region (1..8) - - - speakers - - List of speaker IDs. An example of speaker ID:: - - dr1-fvmh0 - - Note that if you split an item ID with colon and take the first element of - the result, you will get a speaker ID. - - >>> itemid = 'dr1-fvmh0/sx206' - >>> spkrid , sentid = itemid.split('/') - >>> spkrid - 'dr1-fvmh0' - - The second element of the result is a sentence ID. - - - dictionary() - - Phonetic dictionary of words contained in this corpus. This is a Python - dictionary from words to phoneme lists. - - - spkrinfo() - - Speaker information table. It's a Python dictionary from speaker IDs to - records of 10 fields. Speaker IDs the same as the ones in timie.speakers. - Each record is a dictionary from field names to values, and the fields are - as follows:: - - id speaker ID as defined in the original TIMIT speaker info table - sex speaker gender (M:male, F:female) - dr speaker dialect region (1:new england, 2:northern, - 3:north midland, 4:south midland, 5:southern, 6:new york city, - 7:western, 8:army brat (moved around)) - use corpus type (TRN:training, TST:test) - in this sample corpus only TRN is available - recdate recording date - birthdate speaker birth date - ht speaker height - race speaker race (WHT:white, BLK:black, AMR:american indian, - SPN:spanish-american, ORN:oriental,???:unknown) - edu speaker education level (HS:high school, AS:associate degree, - BS:bachelor's degree (BS or BA), MS:master's degree (MS or MA), - PHD:doctorate degree (PhD,JD,MD), ??:unknown) - comments comments by the recorder - -The 4 functions are as follows. 
- - - tokenized(sentences=items, offset=False) - - Given a list of items, returns an iterator of a list of word lists, - each of which corresponds to an item (sentence). If offset is set to True, - each element of the word list is a tuple of word(string), start offset and - end offset, where offset is represented as a number of 16kHz samples. - - - phonetic(sentences=items, offset=False) - - Given a list of items, returns an iterator of a list of phoneme lists, - each of which corresponds to an item (sentence). If offset is set to True, - each element of the phoneme list is a tuple of word(string), start offset - and end offset, where offset is represented as a number of 16kHz samples. - - - audiodata(item, start=0, end=None) - - Given an item, returns a chunk of audio samples formatted into a string. - When the function is called, if start and end are omitted, the entire - samples of the recording will be returned. If only end is omitted, - samples from the start offset to the end of the recording will be returned. - - - play(data) - - Play the given audio samples. The audio samples can be obtained from the - timit.audiodata function. - -""" -import sys -import time - -from nltk.corpus.reader.api import * -from nltk.internals import import_from_stdlib -from nltk.tree import Tree - - -class TimitCorpusReader(CorpusReader): - """ - Reader for the TIMIT corpus (or any other corpus with the same - file layout and use of file formats). The corpus root directory - should contain the following files: - - - timitdic.txt: dictionary of standard transcriptions - - spkrinfo.txt: table of speaker information - - In addition, the root directory should contain one subdirectory - for each speaker, containing three files for each utterance: - - - .txt: text content of utterances - - .wrd: tokenized text content of utterances - - .phn: phonetic transcription of utterances - - .wav: utterance sound file - """ - - _FILE_RE = r"(\w+-\w+/\w+\.(phn|txt|wav|wrd))|" + r"timitdic\.txt|spkrinfo\.txt" - """A regexp matching fileids that are used by this corpus reader.""" - _UTTERANCE_RE = r"\w+-\w+/\w+\.txt" - - def __init__(self, root, encoding="utf8"): - """ - Construct a new TIMIT corpus reader in the given directory. - :param root: The root directory for this corpus. - """ - # Ensure that wave files don't get treated as unicode data: - if isinstance(encoding, str): - encoding = [(r".*\.wav", None), (".*", encoding)] - - CorpusReader.__init__( - self, root, find_corpus_fileids(root, self._FILE_RE), encoding=encoding - ) - - self._utterances = [ - name[:-4] for name in find_corpus_fileids(root, self._UTTERANCE_RE) - ] - """A list of the utterance identifiers for all utterances in - this corpus.""" - - self._speakerinfo = None - self._root = root - self.speakers = sorted({u.split("/")[0] for u in self._utterances}) - - def fileids(self, filetype=None): - """ - Return a list of file identifiers for the files that make up - this corpus. - - :param filetype: If specified, then ``filetype`` indicates that - only the files that have the given type should be - returned. 
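A short sketch of the bookkeeping in the deleted `TimitCorpusReader.__init__` above: utterance ids are the `.txt` fileids minus their extension, and the speaker list is derived from them (the fileids below are made up):

```python
# Illustrative version of the utterance/speaker derivation shown above.
txt_fileids = ["dr1-fvmh0/sa1.txt", "dr1-fvmh0/sx206.txt", "dr2-mabc0/sa1.txt"]

utterances = [name[:-4] for name in txt_fileids]            # strip ".txt"
speakers = sorted({u.split("/")[0] for u in utterances})

print(utterances)   # ['dr1-fvmh0/sa1', 'dr1-fvmh0/sx206', 'dr2-mabc0/sa1']
print(speakers)     # ['dr1-fvmh0', 'dr2-mabc0']
```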
Accepted values are: ``txt``, ``wrd``, ``phn``, - ``wav``, or ``metadata``, - """ - if filetype is None: - return CorpusReader.fileids(self) - elif filetype in ("txt", "wrd", "phn", "wav"): - return [f"{u}.{filetype}" for u in self._utterances] - elif filetype == "metadata": - return ["timitdic.txt", "spkrinfo.txt"] - else: - raise ValueError("Bad value for filetype: %r" % filetype) - - def utteranceids( - self, dialect=None, sex=None, spkrid=None, sent_type=None, sentid=None - ): - """ - :return: A list of the utterance identifiers for all - utterances in this corpus, or for the given speaker, dialect - region, gender, sentence type, or sentence number, if - specified. - """ - if isinstance(dialect, str): - dialect = [dialect] - if isinstance(sex, str): - sex = [sex] - if isinstance(spkrid, str): - spkrid = [spkrid] - if isinstance(sent_type, str): - sent_type = [sent_type] - if isinstance(sentid, str): - sentid = [sentid] - - utterances = self._utterances[:] - if dialect is not None: - utterances = [u for u in utterances if u[2] in dialect] - if sex is not None: - utterances = [u for u in utterances if u[4] in sex] - if spkrid is not None: - utterances = [u for u in utterances if u[:9] in spkrid] - if sent_type is not None: - utterances = [u for u in utterances if u[11] in sent_type] - if sentid is not None: - utterances = [u for u in utterances if u[10:] in spkrid] - return utterances - - def transcription_dict(self): - """ - :return: A dictionary giving the 'standard' transcription for - each word. - """ - _transcriptions = {} - with self.open("timitdic.txt") as fp: - for line in fp: - if not line.strip() or line[0] == ";": - continue - m = re.match(r"\s*(\S+)\s+/(.*)/\s*$", line) - if not m: - raise ValueError("Bad line: %r" % line) - _transcriptions[m.group(1)] = m.group(2).split() - return _transcriptions - - def spkrid(self, utterance): - return utterance.split("/")[0] - - def sentid(self, utterance): - return utterance.split("/")[1] - - def utterance(self, spkrid, sentid): - return f"{spkrid}/{sentid}" - - def spkrutteranceids(self, speaker): - """ - :return: A list of all utterances associated with a given - speaker. - """ - return [ - utterance - for utterance in self._utterances - if utterance.startswith(speaker + "/") - ] - - def spkrinfo(self, speaker): - """ - :return: A dictionary mapping .. something. - """ - if speaker in self._utterances: - speaker = self.spkrid(speaker) - - if self._speakerinfo is None: - self._speakerinfo = {} - with self.open("spkrinfo.txt") as fp: - for line in fp: - if not line.strip() or line[0] == ";": - continue - rec = line.strip().split(None, 9) - key = f"dr{rec[2]}-{rec[1].lower()}{rec[0].lower()}" - self._speakerinfo[key] = SpeakerInfo(*rec) - - return self._speakerinfo[speaker] - - def phones(self, utterances=None): - results = [] - for fileid in self._utterance_fileids(utterances, ".phn"): - with self.open(fileid) as fp: - for line in fp: - if line.strip(): - results.append(line.split()[-1]) - return results - - def phone_times(self, utterances=None): - """ - offset is represented as a number of 16kHz samples! 
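A self-contained sketch of the `timitdic.txt` parsing done by the deleted `transcription_dict()` above, reusing its regexp; the sample entry lines are invented but follow the format that regexp implies:

```python
import re

sample = """\
; comment lines start with ';'
cake  /k ey1 k/
knife /n ay1 f/
"""

transcriptions = {}
for line in sample.splitlines():
    if not line.strip() or line[0] == ";":
        continue
    m = re.match(r"\s*(\S+)\s+/(.*)/\s*$", line)
    if not m:
        raise ValueError("Bad line: %r" % line)
    transcriptions[m.group(1)] = m.group(2).split()

print(transcriptions)
# {'cake': ['k', 'ey1', 'k'], 'knife': ['n', 'ay1', 'f']}
```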
- """ - results = [] - for fileid in self._utterance_fileids(utterances, ".phn"): - with self.open(fileid) as fp: - for line in fp: - if line.strip(): - results.append( - ( - line.split()[2], - int(line.split()[0]), - int(line.split()[1]), - ) - ) - return results - - def words(self, utterances=None): - results = [] - for fileid in self._utterance_fileids(utterances, ".wrd"): - with self.open(fileid) as fp: - for line in fp: - if line.strip(): - results.append(line.split()[-1]) - return results - - def word_times(self, utterances=None): - results = [] - for fileid in self._utterance_fileids(utterances, ".wrd"): - with self.open(fileid) as fp: - for line in fp: - if line.strip(): - results.append( - ( - line.split()[2], - int(line.split()[0]), - int(line.split()[1]), - ) - ) - return results - - def sents(self, utterances=None): - results = [] - for fileid in self._utterance_fileids(utterances, ".wrd"): - with self.open(fileid) as fp: - results.append([line.split()[-1] for line in fp if line.strip()]) - return results - - def sent_times(self, utterances=None): - # TODO: Check this - return [ - ( - line.split(None, 2)[-1].strip(), - int(line.split()[0]), - int(line.split()[1]), - ) - for fileid in self._utterance_fileids(utterances, ".txt") - for line in self.open(fileid) - if line.strip() - ] - - def phone_trees(self, utterances=None): - if utterances is None: - utterances = self._utterances - if isinstance(utterances, str): - utterances = [utterances] - - trees = [] - for utterance in utterances: - word_times = self.word_times(utterance) - phone_times = self.phone_times(utterance) - sent_times = self.sent_times(utterance) - - while sent_times: - (sent, sent_start, sent_end) = sent_times.pop(0) - trees.append(Tree("S", [])) - while ( - word_times and phone_times and phone_times[0][2] <= word_times[0][1] - ): - trees[-1].append(phone_times.pop(0)[0]) - while word_times and word_times[0][2] <= sent_end: - (word, word_start, word_end) = word_times.pop(0) - trees[-1].append(Tree(word, [])) - while phone_times and phone_times[0][2] <= word_end: - trees[-1][-1].append(phone_times.pop(0)[0]) - while phone_times and phone_times[0][2] <= sent_end: - trees[-1].append(phone_times.pop(0)[0]) - return trees - - # [xx] NOTE: This is currently broken -- we're assuming that the - # fileids are WAV fileids (aka RIFF), but they're actually NIST SPHERE - # fileids. - def wav(self, utterance, start=0, end=None): - # nltk.chunk conflicts with the stdlib module 'chunk' - wave = import_from_stdlib("wave") - - w = wave.open(self.open(utterance + ".wav"), "rb") - - if end is None: - end = w.getnframes() - - # Skip past frames before start, then read the frames we want - w.readframes(start) - frames = w.readframes(end - start) - - # Open a new temporary file -- the wave module requires - # an actual file, and won't work w/ stringio. :( - tf = tempfile.TemporaryFile() - out = wave.open(tf, "w") - - # Write the parameters & data to the new file. - out.setparams(w.getparams()) - out.writeframes(frames) - out.close() - - # Read the data back from the file, and return it. The - # file will automatically be deleted when we return. 
- tf.seek(0) - return tf.read() - - def audiodata(self, utterance, start=0, end=None): - assert end is None or end > start - headersize = 44 - with self.open(utterance + ".wav") as fp: - if end is None: - data = fp.read() - else: - data = fp.read(headersize + end * 2) - return data[headersize + start * 2 :] - - def _utterance_fileids(self, utterances, extension): - if utterances is None: - utterances = self._utterances - if isinstance(utterances, str): - utterances = [utterances] - return [f"{u}{extension}" for u in utterances] - - def play(self, utterance, start=0, end=None): - """ - Play the given audio sample. - - :param utterance: The utterance id of the sample to play - """ - # Method 1: os audio dev. - try: - import ossaudiodev - - try: - dsp = ossaudiodev.open("w") - dsp.setfmt(ossaudiodev.AFMT_S16_LE) - dsp.channels(1) - dsp.speed(16000) - dsp.write(self.audiodata(utterance, start, end)) - dsp.close() - except OSError as e: - print( - ( - "can't acquire the audio device; please " - "activate your audio device." - ), - file=sys.stderr, - ) - print("system error message:", str(e), file=sys.stderr) - return - except ImportError: - pass - - # Method 2: pygame - try: - # FIXME: this won't work under python 3 - import pygame.mixer - import StringIO - - pygame.mixer.init(16000) - f = StringIO.StringIO(self.wav(utterance, start, end)) - pygame.mixer.Sound(f).play() - while pygame.mixer.get_busy(): - time.sleep(0.01) - return - except ImportError: - pass - - # Method 3: complain. :) - print( - ("you must install pygame or ossaudiodev " "for audio playback."), - file=sys.stderr, - ) - - -class SpeakerInfo: - def __init__( - self, id, sex, dr, use, recdate, birthdate, ht, race, edu, comments=None - ): - self.id = id - self.sex = sex - self.dr = dr - self.use = use - self.recdate = recdate - self.birthdate = birthdate - self.ht = ht - self.race = race - self.edu = edu - self.comments = comments - - def __repr__(self): - attribs = "id sex dr use recdate birthdate ht race edu comments" - args = [f"{attr}={getattr(self, attr)!r}" for attr in attribs.split()] - return "SpeakerInfo(%s)" % (", ".join(args)) - - -def read_timit_block(stream): - """ - Block reader for timit tagged sentences, which are preceded by a sentence - number that will be ignored. - """ - line = stream.readline() - if not line: - return [] - n, sent = line.split(" ", 1) - return [sent] diff --git a/pipeline/nltk/corpus/reader/toolbox.py b/pipeline/nltk/corpus/reader/toolbox.py deleted file mode 100644 index 5684ea0b90129223ada6e7dc62fd6a6708e90960..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/toolbox.py +++ /dev/null @@ -1,76 +0,0 @@ -# Natural Language Toolkit: Toolbox Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Greg Aumann -# Stuart Robinson -# Steven Bird -# URL: -# For license information, see LICENSE.TXT - -""" -Module for reading, writing and manipulating -Toolbox databases and settings fileids. 
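The deleted `audiodata()` above skips a 44-byte header and treats each 16-bit sample as 2 bytes. A minimal sketch of that sample-to-byte arithmetic:

```python
# Byte arithmetic mirroring audiodata(): headersize=44, 2 bytes per 16-bit sample.
HEADER_SIZE = 44
BYTES_PER_SAMPLE = 2

def sample_byte_range(start, end):
    """Return (begin, length) in bytes for samples in the half-open range [start, end)."""
    begin = HEADER_SIZE + start * BYTES_PER_SAMPLE
    length = (end - start) * BYTES_PER_SAMPLE
    return begin, length

print(sample_byte_range(0, 16000))      # (44, 32000)     -- one second at 16 kHz
print(sample_byte_range(16000, 32000))  # (32044, 32000)  -- the next second
```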
-""" - -from nltk.corpus.reader.api import * -from nltk.corpus.reader.util import * -from nltk.toolbox import ToolboxData - - -class ToolboxCorpusReader(CorpusReader): - def xml(self, fileids, key=None): - return concat( - [ - ToolboxData(path, enc).parse(key=key) - for (path, enc) in self.abspaths(fileids, True) - ] - ) - - def fields( - self, - fileids, - strip=True, - unwrap=True, - encoding="utf8", - errors="strict", - unicode_fields=None, - ): - return concat( - [ - list( - ToolboxData(fileid, enc).fields( - strip, unwrap, encoding, errors, unicode_fields - ) - ) - for (fileid, enc) in self.abspaths(fileids, include_encoding=True) - ] - ) - - # should probably be done lazily: - def entries(self, fileids, **kwargs): - if "key" in kwargs: - key = kwargs["key"] - del kwargs["key"] - else: - key = "lx" # the default key in MDF - entries = [] - for marker, contents in self.fields(fileids, **kwargs): - if marker == key: - entries.append((contents, [])) - else: - try: - entries[-1][-1].append((marker, contents)) - except IndexError: - pass - return entries - - def words(self, fileids, key="lx"): - return [contents for marker, contents in self.fields(fileids) if marker == key] - - -def demo(): - pass - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/corpus/reader/twitter.py b/pipeline/nltk/corpus/reader/twitter.py deleted file mode 100644 index a54c6654f0d95aefa3e1bfb55402be505981607e..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/twitter.py +++ /dev/null @@ -1,136 +0,0 @@ -# Natural Language Toolkit: Twitter Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Ewan Klein -# URL: -# For license information, see LICENSE.TXT - -""" -A reader for corpora that consist of Tweets. It is assumed that the Tweets -have been serialised into line-delimited JSON. -""" - -import json -import os - -from nltk.corpus.reader.api import CorpusReader -from nltk.corpus.reader.util import StreamBackedCorpusView, ZipFilePathPointer, concat -from nltk.tokenize import TweetTokenizer - - -class TwitterCorpusReader(CorpusReader): - r""" - Reader for corpora that consist of Tweets represented as a list of line-delimited JSON. - - Individual Tweets can be tokenized using the default tokenizer, or by a - custom tokenizer specified as a parameter to the constructor. - - Construct a new Tweet corpus reader for a set of documents - located at the given root directory. - - If you made your own tweet collection in a directory called - `twitter-files`, then you can initialise the reader as:: - - from nltk.corpus import TwitterCorpusReader - reader = TwitterCorpusReader(root='/path/to/twitter-files', '.*\.json') - - However, the recommended approach is to set the relevant directory as the - value of the environmental variable `TWITTER`, and then invoke the reader - as follows:: - - root = os.environ['TWITTER'] - reader = TwitterCorpusReader(root, '.*\.json') - - If you want to work directly with the raw Tweets, the `json` library can - be used:: - - import json - for tweet in reader.docs(): - print(json.dumps(tweet, indent=1, sort_keys=True)) - - """ - - CorpusView = StreamBackedCorpusView - """ - The corpus view class used by this reader. - """ - - def __init__( - self, root, fileids=None, word_tokenizer=TweetTokenizer(), encoding="utf8" - ): - """ - :param root: The root directory for this corpus. - :param fileids: A list or regexp specifying the fileids in this corpus. 
- :param word_tokenizer: Tokenizer for breaking the text of Tweets into - smaller units, including but not limited to words. - """ - CorpusReader.__init__(self, root, fileids, encoding) - - for path in self.abspaths(self._fileids): - if isinstance(path, ZipFilePathPointer): - pass - elif os.path.getsize(path) == 0: - raise ValueError(f"File {path} is empty") - """Check that all user-created corpus files are non-empty.""" - - self._word_tokenizer = word_tokenizer - - def docs(self, fileids=None): - """ - Returns the full Tweet objects, as specified by `Twitter - documentation on Tweets - `_ - - :return: the given file(s) as a list of dictionaries deserialised - from JSON. - :rtype: list(dict) - """ - return concat( - [ - self.CorpusView(path, self._read_tweets, encoding=enc) - for (path, enc, fileid) in self.abspaths(fileids, True, True) - ] - ) - - def strings(self, fileids=None): - """ - Returns only the text content of Tweets in the file(s) - - :return: the given file(s) as a list of Tweets. - :rtype: list(str) - """ - fulltweets = self.docs(fileids) - tweets = [] - for jsono in fulltweets: - try: - text = jsono["text"] - if isinstance(text, bytes): - text = text.decode(self.encoding) - tweets.append(text) - except KeyError: - pass - return tweets - - def tokenized(self, fileids=None): - """ - :return: the given file(s) as a list of the text content of Tweets as - as a list of words, screenanames, hashtags, URLs and punctuation symbols. - - :rtype: list(list(str)) - """ - tweets = self.strings(fileids) - tokenizer = self._word_tokenizer - return [tokenizer.tokenize(t) for t in tweets] - - def _read_tweets(self, stream): - """ - Assumes that each line in ``stream`` is a JSON-serialised object. - """ - tweets = [] - for i in range(10): - line = stream.readline() - if not line: - return tweets - tweet = json.loads(line) - tweets.append(tweet) - return tweets diff --git a/pipeline/nltk/corpus/reader/udhr.py b/pipeline/nltk/corpus/reader/udhr.py deleted file mode 100644 index e6309ff4559659ff9b97bf679b563bcb957d18f1..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/udhr.py +++ /dev/null @@ -1,75 +0,0 @@ -""" -UDHR corpus reader. It mostly deals with encodings. 
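A sketch of the line-delimited JSON handling in the deleted `TwitterCorpusReader` above: `_read_tweets()` deserialises up to ten lines per block and `strings()` keeps the `text` field. The tweets below are fabricated stand-ins:

```python
import io
import json

stream = io.StringIO(
    '{"id": 1, "text": "first tweet"}\n'
    '{"id": 2, "text": "second tweet"}\n'
)

def read_tweet_block(stream, block_size=10):
    tweets = []
    for _ in range(block_size):
        line = stream.readline()
        if not line:
            break
        tweets.append(json.loads(line))
    return tweets

block = read_tweet_block(stream)
print([tweet["text"] for tweet in block])   # ['first tweet', 'second tweet']
```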
-""" - -from nltk.corpus.reader.plaintext import PlaintextCorpusReader -from nltk.corpus.reader.util import find_corpus_fileids - - -class UdhrCorpusReader(PlaintextCorpusReader): - - ENCODINGS = [ - (".*-Latin1$", "latin-1"), - (".*-Hebrew$", "hebrew"), - (".*-Arabic$", "cp1256"), - ("Czech_Cesky-UTF8", "cp1250"), # yeah - ("Polish-Latin2", "cp1250"), - ("Polish_Polski-Latin2", "cp1250"), - (".*-Cyrillic$", "cyrillic"), - (".*-SJIS$", "SJIS"), - (".*-GB2312$", "GB2312"), - (".*-Latin2$", "ISO-8859-2"), - (".*-Greek$", "greek"), - (".*-UTF8$", "utf-8"), - ("Hungarian_Magyar-Unicode", "utf-16-le"), - ("Amahuaca", "latin1"), - ("Turkish_Turkce-Turkish", "latin5"), - ("Lithuanian_Lietuviskai-Baltic", "latin4"), - ("Japanese_Nihongo-EUC", "EUC-JP"), - ("Japanese_Nihongo-JIS", "iso2022_jp"), - ("Chinese_Mandarin-HZ", "hz"), - (r"Abkhaz\-Cyrillic\+Abkh", "cp1251"), - ] - - SKIP = { - # The following files are not fully decodable because they - # were truncated at wrong bytes: - "Burmese_Myanmar-UTF8", - "Japanese_Nihongo-JIS", - "Chinese_Mandarin-HZ", - "Chinese_Mandarin-UTF8", - "Gujarati-UTF8", - "Hungarian_Magyar-Unicode", - "Lao-UTF8", - "Magahi-UTF8", - "Marathi-UTF8", - "Tamil-UTF8", - # Unfortunately, encodings required for reading - # the following files are not supported by Python: - "Vietnamese-VPS", - "Vietnamese-VIQR", - "Vietnamese-TCVN", - "Magahi-Agra", - "Bhojpuri-Agra", - "Esperanto-T61", # latin3 raises an exception - # The following files are encoded for specific fonts: - "Burmese_Myanmar-WinResearcher", - "Armenian-DallakHelv", - "Tigrinya_Tigrigna-VG2Main", - "Amharic-Afenegus6..60375", # ? - "Navaho_Dine-Navajo-Navaho-font", - # What are these? - "Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117", - "Azeri_Azerbaijani_Latin-Az.Times.Lat0117", - # The following files are unintended: - "Czech-Latin2-err", - "Russian_Russky-UTF8~", - } - - def __init__(self, root="udhr"): - fileids = find_corpus_fileids(root, r"(?!README|\.).*") - super().__init__( - root, - [fileid for fileid in fileids if fileid not in self.SKIP], - encoding=self.ENCODINGS, - ) diff --git a/pipeline/nltk/corpus/reader/util.py b/pipeline/nltk/corpus/reader/util.py deleted file mode 100644 index 0934f1705952b4c00d8884da76c8e052c5a23d58..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/util.py +++ /dev/null @@ -1,867 +0,0 @@ -# Natural Language Toolkit: Corpus Reader Utilities -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# Edward Loper -# URL: -# For license information, see LICENSE.TXT - -import bisect -import os -import pickle -import re -import tempfile -from functools import reduce -from xml.etree import ElementTree - -from nltk.data import ( - FileSystemPathPointer, - PathPointer, - SeekableUnicodeStreamReader, - ZipFilePathPointer, -) -from nltk.internals import slice_bounds -from nltk.tokenize import wordpunct_tokenize -from nltk.util import AbstractLazySequence, LazyConcatenation, LazySubsequence - -###################################################################### -# { Corpus View -###################################################################### - - -class StreamBackedCorpusView(AbstractLazySequence): - """ - A 'view' of a corpus file, which acts like a sequence of tokens: - it can be accessed by index, iterated over, etc. However, the - tokens are only constructed as-needed -- the entire corpus is - never stored in memory at once. 
- - The constructor to ``StreamBackedCorpusView`` takes two arguments: - a corpus fileid (specified as a string or as a ``PathPointer``); - and a block reader. A "block reader" is a function that reads - zero or more tokens from a stream, and returns them as a list. A - very simple example of a block reader is: - - >>> def simple_block_reader(stream): - ... return stream.readline().split() - - This simple block reader reads a single line at a time, and - returns a single token (consisting of a string) for each - whitespace-separated substring on the line. - - When deciding how to define the block reader for a given - corpus, careful consideration should be given to the size of - blocks handled by the block reader. Smaller block sizes will - increase the memory requirements of the corpus view's internal - data structures (by 2 integers per block). On the other hand, - larger block sizes may decrease performance for random access to - the corpus. (But note that larger block sizes will *not* - decrease performance for iteration.) - - Internally, ``CorpusView`` maintains a partial mapping from token - index to file position, with one entry per block. When a token - with a given index *i* is requested, the ``CorpusView`` constructs - it as follows: - - 1. First, it searches the toknum/filepos mapping for the token - index closest to (but less than or equal to) *i*. - - 2. Then, starting at the file position corresponding to that - index, it reads one block at a time using the block reader - until it reaches the requested token. - - The toknum/filepos mapping is created lazily: it is initially - empty, but every time a new block is read, the block's - initial token is added to the mapping. (Thus, the toknum/filepos - map has one entry per block.) - - In order to increase efficiency for random access patterns that - have high degrees of locality, the corpus view may cache one or - more blocks. - - :note: Each ``CorpusView`` object internally maintains an open file - object for its underlying corpus file. This file should be - automatically closed when the ``CorpusView`` is garbage collected, - but if you wish to close it manually, use the ``close()`` - method. If you access a ``CorpusView``'s items after it has been - closed, the file object will be automatically re-opened. - - :warning: If the contents of the file are modified during the - lifetime of the ``CorpusView``, then the ``CorpusView``'s behavior - is undefined. - - :warning: If a unicode encoding is specified when constructing a - ``CorpusView``, then the block reader may only call - ``stream.seek()`` with offsets that have been returned by - ``stream.tell()``; in particular, calling ``stream.seek()`` with - relative offsets, or with offsets based on string lengths, may - lead to incorrect behavior. - - :ivar _block_reader: The function used to read - a single block from the underlying file stream. - :ivar _toknum: A list containing the token index of each block - that has been processed. In particular, ``_toknum[i]`` is the - token index of the first token in block ``i``. Together - with ``_filepos``, this forms a partial mapping between token - indices and file positions. - :ivar _filepos: A list containing the file position of each block - that has been processed. In particular, ``_toknum[i]`` is the - file position of the first character in block ``i``. Together - with ``_toknum``, this forms a partial mapping between token - indices and file positions. - :ivar _stream: The stream used to access the underlying corpus file. 
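A conceptual sketch of the toknum/filepos bookkeeping described above: the view records one (token index, file position) entry per block returned by the block reader, so later lookups can seek straight to the right block. This is a standalone illustration, not the view itself:

```python
import io

def simple_block_reader(stream):
    return stream.readline().split()

stream = io.StringIO("a b c\nd e\nf g h i\n")
toknum, filepos = [0], [stream.tell()]
tokens = []

while True:
    block = simple_block_reader(stream)
    if not block:
        break
    tokens.extend(block)
    toknum.append(len(tokens))      # index of the first token of the *next* block
    filepos.append(stream.tell())   # file position where that next block starts

print(tokens)                        # ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']
print(list(zip(toknum, filepos)))    # [(0, 0), (3, 6), (5, 10), (9, 18)]
```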
- :ivar _len: The total number of tokens in the corpus, if known; - or None, if the number of tokens is not yet known. - :ivar _eofpos: The character position of the last character in the - file. This is calculated when the corpus view is initialized, - and is used to decide when the end of file has been reached. - :ivar _cache: A cache of the most recently read block. It - is encoded as a tuple (start_toknum, end_toknum, tokens), where - start_toknum is the token index of the first token in the block; - end_toknum is the token index of the first token not in the - block; and tokens is a list of the tokens in the block. - """ - - def __init__(self, fileid, block_reader=None, startpos=0, encoding="utf8"): - """ - Create a new corpus view, based on the file ``fileid``, and - read with ``block_reader``. See the class documentation - for more information. - - :param fileid: The path to the file that is read by this - corpus view. ``fileid`` can either be a string or a - ``PathPointer``. - - :param startpos: The file position at which the view will - start reading. This can be used to skip over preface - sections. - - :param encoding: The unicode encoding that should be used to - read the file's contents. If no encoding is specified, - then the file's contents will be read as a non-unicode - string (i.e., a str). - """ - if block_reader: - self.read_block = block_reader - # Initialize our toknum/filepos mapping. - self._toknum = [0] - self._filepos = [startpos] - self._encoding = encoding - # We don't know our length (number of tokens) yet. - self._len = None - - self._fileid = fileid - self._stream = None - - self._current_toknum = None - """This variable is set to the index of the next token that - will be read, immediately before ``self.read_block()`` is - called. This is provided for the benefit of the block - reader, which under rare circumstances may need to know - the current token number.""" - - self._current_blocknum = None - """This variable is set to the index of the next block that - will be read, immediately before ``self.read_block()`` is - called. This is provided for the benefit of the block - reader, which under rare circumstances may need to know - the current block number.""" - - # Find the length of the file. - try: - if isinstance(self._fileid, PathPointer): - self._eofpos = self._fileid.file_size() - else: - self._eofpos = os.stat(self._fileid).st_size - except Exception as exc: - raise ValueError(f"Unable to open or access {fileid!r} -- {exc}") from exc - - # Maintain a cache of the most recently read block, to - # increase efficiency of random access. - self._cache = (-1, -1, None) - - fileid = property( - lambda self: self._fileid, - doc=""" - The fileid of the file that is accessed by this view. - - :type: str or PathPointer""", - ) - - def read_block(self, stream): - """ - Read a block from the input stream. - - :return: a block of tokens from the input stream - :rtype: list(any) - :param stream: an input stream - :type stream: stream - """ - raise NotImplementedError("Abstract Method") - - def _open(self): - """ - Open the file stream associated with this corpus view. This - will be called performed if any value is read from the view - while its file stream is closed. 
- """ - if isinstance(self._fileid, PathPointer): - self._stream = self._fileid.open(self._encoding) - elif self._encoding: - self._stream = SeekableUnicodeStreamReader( - open(self._fileid, "rb"), self._encoding - ) - else: - self._stream = open(self._fileid, "rb") - - def close(self): - """ - Close the file stream associated with this corpus view. This - can be useful if you are worried about running out of file - handles (although the stream should automatically be closed - upon garbage collection of the corpus view). If the corpus - view is accessed after it is closed, it will be automatically - re-opened. - """ - if self._stream is not None: - self._stream.close() - self._stream = None - - def __enter__(self): - return self - - def __exit__(self, type, value, traceback): - self.close() - - def __len__(self): - if self._len is None: - # iterate_from() sets self._len when it reaches the end - # of the file: - for tok in self.iterate_from(self._toknum[-1]): - pass - return self._len - - def __getitem__(self, i): - if isinstance(i, slice): - start, stop = slice_bounds(self, i) - # Check if it's in the cache. - offset = self._cache[0] - if offset <= start and stop <= self._cache[1]: - return self._cache[2][start - offset : stop - offset] - # Construct & return the result. - return LazySubsequence(self, start, stop) - else: - # Handle negative indices - if i < 0: - i += len(self) - if i < 0: - raise IndexError("index out of range") - # Check if it's in the cache. - offset = self._cache[0] - if offset <= i < self._cache[1]: - return self._cache[2][i - offset] - # Use iterate_from to extract it. - try: - return next(self.iterate_from(i)) - except StopIteration as e: - raise IndexError("index out of range") from e - - # If we wanted to be thread-safe, then this method would need to - # do some locking. - def iterate_from(self, start_tok): - # Start by feeding from the cache, if possible. - if self._cache[0] <= start_tok < self._cache[1]: - for tok in self._cache[2][start_tok - self._cache[0] :]: - yield tok - start_tok += 1 - - # Decide where in the file we should start. If `start` is in - # our mapping, then we can jump straight to the correct block; - # otherwise, start at the last block we've processed. - if start_tok < self._toknum[-1]: - block_index = bisect.bisect_right(self._toknum, start_tok) - 1 - toknum = self._toknum[block_index] - filepos = self._filepos[block_index] - else: - block_index = len(self._toknum) - 1 - toknum = self._toknum[-1] - filepos = self._filepos[-1] - - # Open the stream, if it's not open already. - if self._stream is None: - self._open() - - # If the file is empty, the while loop will never run. - # This *seems* to be all the state we need to set: - if self._eofpos == 0: - self._len = 0 - - # Each iteration through this loop, we read a single block - # from the stream. - while filepos < self._eofpos: - # Read the next block. - self._stream.seek(filepos) - self._current_toknum = toknum - self._current_blocknum = block_index - tokens = self.read_block(self._stream) - assert isinstance(tokens, (tuple, list, AbstractLazySequence)), ( - "block reader %s() should return list or tuple." - % self.read_block.__name__ - ) - num_toks = len(tokens) - new_filepos = self._stream.tell() - assert ( - new_filepos > filepos - ), "block reader %s() should consume at least 1 byte (filepos=%d)" % ( - self.read_block.__name__, - filepos, - ) - - # Update our cache. - self._cache = (toknum, toknum + num_toks, list(tokens)) - - # Update our mapping. 
- assert toknum <= self._toknum[-1] - if num_toks > 0: - block_index += 1 - if toknum == self._toknum[-1]: - assert new_filepos > self._filepos[-1] # monotonic! - self._filepos.append(new_filepos) - self._toknum.append(toknum + num_toks) - else: - # Check for consistency: - assert ( - new_filepos == self._filepos[block_index] - ), "inconsistent block reader (num chars read)" - assert ( - toknum + num_toks == self._toknum[block_index] - ), "inconsistent block reader (num tokens returned)" - - # If we reached the end of the file, then update self._len - if new_filepos == self._eofpos: - self._len = toknum + num_toks - # Generate the tokens in this block (but skip any tokens - # before start_tok). Note that between yields, our state - # may be modified. - for tok in tokens[max(0, start_tok - toknum) :]: - yield tok - # If we're at the end of the file, then we're done. - assert new_filepos <= self._eofpos - if new_filepos == self._eofpos: - break - # Update our indices - toknum += num_toks - filepos = new_filepos - - # If we reach this point, then we should know our length. - assert self._len is not None - # Enforce closing of stream once we reached end of file - # We should have reached EOF once we're out of the while loop. - self.close() - - # Use concat for these, so we can use a ConcatenatedCorpusView - # when possible. - def __add__(self, other): - return concat([self, other]) - - def __radd__(self, other): - return concat([other, self]) - - def __mul__(self, count): - return concat([self] * count) - - def __rmul__(self, count): - return concat([self] * count) - - -class ConcatenatedCorpusView(AbstractLazySequence): - """ - A 'view' of a corpus file that joins together one or more - ``StreamBackedCorpusViews``. At most - one file handle is left open at any time. - """ - - def __init__(self, corpus_views): - self._pieces = corpus_views - """A list of the corpus subviews that make up this - concatenation.""" - - self._offsets = [0] - """A list of offsets, indicating the index at which each - subview begins. In particular:: - offsets[i] = sum([len(p) for p in pieces[:i]])""" - - self._open_piece = None - """The most recently accessed corpus subview (or None). - Before a new subview is accessed, this subview will be closed.""" - - def __len__(self): - if len(self._offsets) <= len(self._pieces): - # Iterate to the end of the corpus. - for tok in self.iterate_from(self._offsets[-1]): - pass - - return self._offsets[-1] - - def close(self): - for piece in self._pieces: - piece.close() - - def iterate_from(self, start_tok): - piecenum = bisect.bisect_right(self._offsets, start_tok) - 1 - - while piecenum < len(self._pieces): - offset = self._offsets[piecenum] - piece = self._pieces[piecenum] - - # If we've got another piece open, close it first. - if self._open_piece is not piece: - if self._open_piece is not None: - self._open_piece.close() - self._open_piece = piece - - # Get everything we can from this piece. - yield from piece.iterate_from(max(0, start_tok - offset)) - - # Update the offset table. - if piecenum + 1 == len(self._offsets): - self._offsets.append(self._offsets[-1] + len(piece)) - - # Move on to the next piece. - piecenum += 1 - - -def concat(docs): - """ - Concatenate together the contents of multiple documents from a - single corpus, using an appropriate concatenation function. This - utility function is used by corpus readers when the user requests - more than one document at a time. 
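A sketch of the offset table kept by the deleted `ConcatenatedCorpusView` above, where `offsets[i]` is the global index at which subview `i` begins; the real view fills the table lazily, here it is precomputed for brevity:

```python
import bisect

pieces  = [["a", "b", "c"], ["d", "e"], ["f", "g", "h", "i"]]
offsets = [0]
for piece in pieces:
    offsets.append(offsets[-1] + len(piece))

def item(i):
    piecenum = bisect.bisect_right(offsets, i) - 1
    return pieces[piecenum][i - offsets[piecenum]]

print(offsets)            # [0, 3, 5, 9]
print(item(4), item(8))   # e i
```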
- """ - if len(docs) == 1: - return docs[0] - if len(docs) == 0: - raise ValueError("concat() expects at least one object!") - - types = {d.__class__ for d in docs} - - # If they're all strings, use string concatenation. - if all(isinstance(doc, str) for doc in docs): - return "".join(docs) - - # If they're all corpus views, then use ConcatenatedCorpusView. - for typ in types: - if not issubclass(typ, (StreamBackedCorpusView, ConcatenatedCorpusView)): - break - else: - return ConcatenatedCorpusView(docs) - - # If they're all lazy sequences, use a lazy concatenation - for typ in types: - if not issubclass(typ, AbstractLazySequence): - break - else: - return LazyConcatenation(docs) - - # Otherwise, see what we can do: - if len(types) == 1: - typ = list(types)[0] - - if issubclass(typ, list): - return reduce((lambda a, b: a + b), docs, []) - - if issubclass(typ, tuple): - return reduce((lambda a, b: a + b), docs, ()) - - if ElementTree.iselement(typ): - xmltree = ElementTree.Element("documents") - for doc in docs: - xmltree.append(doc) - return xmltree - - # No method found! - raise ValueError("Don't know how to concatenate types: %r" % types) - - -###################################################################### -# { Corpus View for Pickled Sequences -###################################################################### - - -class PickleCorpusView(StreamBackedCorpusView): - """ - A stream backed corpus view for corpus files that consist of - sequences of serialized Python objects (serialized using - ``pickle.dump``). One use case for this class is to store the - result of running feature detection on a corpus to disk. This can - be useful when performing feature detection is expensive (so we - don't want to repeat it); but the corpus is too large to store in - memory. The following example illustrates this technique: - - >>> from nltk.corpus.reader.util import PickleCorpusView - >>> from nltk.util import LazyMap - >>> feature_corpus = LazyMap(detect_features, corpus) # doctest: +SKIP - >>> PickleCorpusView.write(feature_corpus, some_fileid) # doctest: +SKIP - >>> pcv = PickleCorpusView(some_fileid) # doctest: +SKIP - """ - - BLOCK_SIZE = 100 - PROTOCOL = -1 - - def __init__(self, fileid, delete_on_gc=False): - """ - Create a new corpus view that reads the pickle corpus - ``fileid``. - - :param delete_on_gc: If true, then ``fileid`` will be deleted - whenever this object gets garbage-collected. - """ - self._delete_on_gc = delete_on_gc - StreamBackedCorpusView.__init__(self, fileid) - - def read_block(self, stream): - result = [] - for i in range(self.BLOCK_SIZE): - try: - result.append(pickle.load(stream)) - except EOFError: - break - return result - - def __del__(self): - """ - If ``delete_on_gc`` was set to true when this - ``PickleCorpusView`` was created, then delete the corpus view's - fileid. (This method is called whenever a - ``PickledCorpusView`` is garbage-collected. 
- """ - if getattr(self, "_delete_on_gc"): - if os.path.exists(self._fileid): - try: - os.remove(self._fileid) - except OSError: - pass - self.__dict__.clear() # make the garbage collector's job easier - - @classmethod - def write(cls, sequence, output_file): - if isinstance(output_file, str): - output_file = open(output_file, "wb") - for item in sequence: - pickle.dump(item, output_file, cls.PROTOCOL) - - @classmethod - def cache_to_tempfile(cls, sequence, delete_on_gc=True): - """ - Write the given sequence to a temporary file as a pickle - corpus; and then return a ``PickleCorpusView`` view for that - temporary corpus file. - - :param delete_on_gc: If true, then the temporary file will be - deleted whenever this object gets garbage-collected. - """ - try: - fd, output_file_name = tempfile.mkstemp(".pcv", "nltk-") - output_file = os.fdopen(fd, "wb") - cls.write(sequence, output_file) - output_file.close() - return PickleCorpusView(output_file_name, delete_on_gc) - except OSError as e: - raise ValueError("Error while creating temp file: %s" % e) from e - - -###################################################################### -# { Block Readers -###################################################################### - - -def read_whitespace_block(stream): - toks = [] - for i in range(20): # Read 20 lines at a time. - toks.extend(stream.readline().split()) - return toks - - -def read_wordpunct_block(stream): - toks = [] - for i in range(20): # Read 20 lines at a time. - toks.extend(wordpunct_tokenize(stream.readline())) - return toks - - -def read_line_block(stream): - toks = [] - for i in range(20): - line = stream.readline() - if not line: - return toks - toks.append(line.rstrip("\n")) - return toks - - -def read_blankline_block(stream): - s = "" - while True: - line = stream.readline() - # End of file: - if not line: - if s: - return [s] - else: - return [] - # Blank line: - elif line and not line.strip(): - if s: - return [s] - # Other line: - else: - s += line - - -def read_alignedsent_block(stream): - s = "" - while True: - line = stream.readline() - if line[0] == "=" or line[0] == "\n" or line[:2] == "\r\n": - continue - # End of file: - if not line: - if s: - return [s] - else: - return [] - # Other line: - else: - s += line - if re.match(r"^\d+-\d+", line) is not None: - return [s] - - -def read_regexp_block(stream, start_re, end_re=None): - """ - Read a sequence of tokens from a stream, where tokens begin with - lines that match ``start_re``. If ``end_re`` is specified, then - tokens end with lines that match ``end_re``; otherwise, tokens end - whenever the next line matching ``start_re`` or EOF is found. - """ - # Scan until we find a line matching the start regexp. - while True: - line = stream.readline() - if not line: - return [] # end of file. - if re.match(start_re, line): - break - - # Scan until we find another line matching the regexp, or EOF. - lines = [line] - while True: - oldpos = stream.tell() - line = stream.readline() - # End of file: - if not line: - return ["".join(lines)] - # End of token: - if end_re is not None and re.match(end_re, line): - return ["".join(lines)] - # Start of new token: backup to just before it starts, and - # return the token we've already collected. - if end_re is None and re.match(start_re, line): - stream.seek(oldpos) - return ["".join(lines)] - # Anything else is part of the token. 
- lines.append(line) - - -def read_sexpr_block(stream, block_size=16384, comment_char=None): - """ - Read a sequence of s-expressions from the stream, and leave the - stream's file position at the end the last complete s-expression - read. This function will always return at least one s-expression, - unless there are no more s-expressions in the file. - - If the file ends in in the middle of an s-expression, then that - incomplete s-expression is returned when the end of the file is - reached. - - :param block_size: The default block size for reading. If an - s-expression is longer than one block, then more than one - block will be read. - :param comment_char: A character that marks comments. Any lines - that begin with this character will be stripped out. - (If spaces or tabs precede the comment character, then the - line will not be stripped.) - """ - start = stream.tell() - block = stream.read(block_size) - encoding = getattr(stream, "encoding", None) - assert encoding is not None or isinstance(block, str) - if encoding not in (None, "utf-8"): - import warnings - - warnings.warn( - "Parsing may fail, depending on the properties " - "of the %s encoding!" % encoding - ) - # (e.g., the utf-16 encoding does not work because it insists - # on adding BOMs to the beginning of encoded strings.) - - if comment_char: - COMMENT = re.compile("(?m)^%s.*$" % re.escape(comment_char)) - while True: - try: - # If we're stripping comments, then make sure our block ends - # on a line boundary; and then replace any comments with - # space characters. (We can't just strip them out -- that - # would make our offset wrong.) - if comment_char: - block += stream.readline() - block = re.sub(COMMENT, _sub_space, block) - # Read the block. - tokens, offset = _parse_sexpr_block(block) - # Skip whitespace - offset = re.compile(r"\s*").search(block, offset).end() - - # Move to the end position. - if encoding is None: - stream.seek(start + offset) - else: - stream.seek(start + len(block[:offset].encode(encoding))) - - # Return the list of tokens we processed - return tokens - except ValueError as e: - if e.args[0] == "Block too small": - next_block = stream.read(block_size) - if next_block: - block += next_block - continue - else: - # The file ended mid-sexpr -- return what we got. - return [block.strip()] - else: - raise - - -def _sub_space(m): - """Helper function: given a regexp match, return a string of - spaces that's the same length as the matched string.""" - return " " * (m.end() - m.start()) - - -def _parse_sexpr_block(block): - tokens = [] - start = end = 0 - - while end < len(block): - m = re.compile(r"\S").search(block, end) - if not m: - return tokens, end - - start = m.start() - - # Case 1: sexpr is not parenthesized. - if m.group() != "(": - m2 = re.compile(r"[\s(]").search(block, start) - if m2: - end = m2.start() - else: - if tokens: - return tokens, end - raise ValueError("Block too small") - - # Case 2: parenthesized sexpr. 
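A simplified re-creation of `read_regexp_block()` above with a usage example: a token starts at a line matching `start_re` and ends just before the next `start_re` match (or at an `end_re` match, when one is given). The sample document markers are made up:

```python
import io
import re

def read_regexp_block(stream, start_re, end_re=None):
    while True:                                  # skip ahead to the first token start
        line = stream.readline()
        if not line:
            return []
        if re.match(start_re, line):
            break
    lines = [line]
    while True:
        oldpos = stream.tell()
        line = stream.readline()
        if not line or (end_re and re.match(end_re, line)):
            return ["".join(lines)]
        if end_re is None and re.match(start_re, line):
            stream.seek(oldpos)                  # push the next token's start line back
            return ["".join(lines)]
        lines.append(line)

stream = io.StringIO("== doc 1\nbody a\n== doc 2\nbody b\n")
print(read_regexp_block(stream, r"=="))   # ['== doc 1\nbody a\n']
print(read_regexp_block(stream, r"=="))   # ['== doc 2\nbody b\n']
```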
- else: - nesting = 0 - for m in re.compile(r"[()]").finditer(block, start): - if m.group() == "(": - nesting += 1 - else: - nesting -= 1 - if nesting == 0: - end = m.end() - break - else: - if tokens: - return tokens, end - raise ValueError("Block too small") - - tokens.append(block[start:end]) - - return tokens, end - - -###################################################################### -# { Finding Corpus Items -###################################################################### - - -def find_corpus_fileids(root, regexp): - if not isinstance(root, PathPointer): - raise TypeError("find_corpus_fileids: expected a PathPointer") - regexp += "$" - - # Find fileids in a zipfile: scan the zipfile's namelist. Filter - # out entries that end in '/' -- they're directories. - if isinstance(root, ZipFilePathPointer): - fileids = [ - name[len(root.entry) :] - for name in root.zipfile.namelist() - if not name.endswith("/") - ] - items = [name for name in fileids if re.match(regexp, name)] - return sorted(items) - - # Find fileids in a directory: use os.walk to search all (proper - # or symlinked) subdirectories, and match paths against the regexp. - elif isinstance(root, FileSystemPathPointer): - items = [] - for dirname, subdirs, fileids in os.walk(root.path): - prefix = "".join("%s/" % p for p in _path_from(root.path, dirname)) - items += [ - prefix + fileid - for fileid in fileids - if re.match(regexp, prefix + fileid) - ] - # Don't visit svn directories: - if ".svn" in subdirs: - subdirs.remove(".svn") - return sorted(items) - - else: - raise AssertionError("Don't know how to handle %r" % root) - - -def _path_from(parent, child): - if os.path.split(parent)[1] == "": - parent = os.path.split(parent)[0] - path = [] - while parent != child: - child, dirname = os.path.split(child) - path.insert(0, dirname) - assert os.path.split(child)[0] != child - return path - - -###################################################################### -# { Paragraph structure in Treebank files -###################################################################### - - -def tagged_treebank_para_block_reader(stream): - # Read the next paragraph. - para = "" - while True: - line = stream.readline() - # End of paragraph: - if re.match(r"======+\s*$", line): - if para.strip(): - return [para] - # End of file: - elif line == "": - if para.strip(): - return [para] - else: - return [] - # Content line: - else: - para += line diff --git a/pipeline/nltk/corpus/reader/verbnet.py b/pipeline/nltk/corpus/reader/verbnet.py deleted file mode 100644 index 6056574bb03a0797d47c68b2de333b8337b08a46..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/verbnet.py +++ /dev/null @@ -1,629 +0,0 @@ -# Natural Language Toolkit: Verbnet Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# URL: -# For license information, see LICENSE.TXT - -""" -An NLTK interface to the VerbNet verb lexicon - -For details about VerbNet see: -https://verbs.colorado.edu/~mpalmer/projects/verbnet.html -""" - -import re -import textwrap -from collections import defaultdict - -from nltk.corpus.reader.xmldocs import XMLCorpusReader - - -class VerbnetCorpusReader(XMLCorpusReader): - """ - An NLTK interface to the VerbNet verb lexicon. - - From the VerbNet site: "VerbNet (VN) (Kipper-Schuler 2006) is the largest - on-line verb lexicon currently available for English. 
It is a hierarchical - domain-independent, broad-coverage verb lexicon with mappings to other - lexical resources such as WordNet (Miller, 1990; Fellbaum, 1998), XTAG - (XTAG Research Group, 2001), and FrameNet (Baker et al., 1998)." - - For details about VerbNet see: - https://verbs.colorado.edu/~mpalmer/projects/verbnet.html - """ - - # No unicode encoding param, since the data files are all XML. - def __init__(self, root, fileids, wrap_etree=False): - XMLCorpusReader.__init__(self, root, fileids, wrap_etree) - - self._lemma_to_class = defaultdict(list) - """A dictionary mapping from verb lemma strings to lists of - VerbNet class identifiers.""" - - self._wordnet_to_class = defaultdict(list) - """A dictionary mapping from wordnet identifier strings to - lists of VerbNet class identifiers.""" - - self._class_to_fileid = {} - """A dictionary mapping from class identifiers to - corresponding file identifiers. The keys of this dictionary - provide a complete list of all classes and subclasses.""" - - self._shortid_to_longid = {} - - # Initialize the dictionaries. Use the quick (regexp-based) - # method instead of the slow (xml-based) method, because it - # runs 2-30 times faster. - self._quick_index() - - _LONGID_RE = re.compile(r"([^\-\.]*)-([\d+.\-]+)$") - """Regular expression that matches (and decomposes) longids""" - - _SHORTID_RE = re.compile(r"[\d+.\-]+$") - """Regular expression that matches shortids""" - - _INDEX_RE = re.compile( - r']+>|' r'' - ) - """Regular expression used by ``_index()`` to quickly scan the corpus - for basic information.""" - - def lemmas(self, vnclass=None): - """ - Return a list of all verb lemmas that appear in any class, or - in the ``classid`` if specified. - """ - if vnclass is None: - return sorted(self._lemma_to_class.keys()) - else: - # [xx] should this include subclass members? - if isinstance(vnclass, str): - vnclass = self.vnclass(vnclass) - return [member.get("name") for member in vnclass.findall("MEMBERS/MEMBER")] - - def wordnetids(self, vnclass=None): - """ - Return a list of all wordnet identifiers that appear in any - class, or in ``classid`` if specified. - """ - if vnclass is None: - return sorted(self._wordnet_to_class.keys()) - else: - # [xx] should this include subclass members? - if isinstance(vnclass, str): - vnclass = self.vnclass(vnclass) - return sum( - ( - member.get("wn", "").split() - for member in vnclass.findall("MEMBERS/MEMBER") - ), - [], - ) - - def classids(self, lemma=None, wordnetid=None, fileid=None, classid=None): - """ - Return a list of the VerbNet class identifiers. If a file - identifier is specified, then return only the VerbNet class - identifiers for classes (and subclasses) defined by that file. - If a lemma is specified, then return only VerbNet class - identifiers for classes that contain that lemma as a member. - If a wordnetid is specified, then return only identifiers for - classes that contain that wordnetid as a member. If a classid - is specified, then return only identifiers for subclasses of - the specified VerbNet class. 
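A sketch of the `MEMBERS` extraction done by the deleted `lemmas()`/`wordnetids()` above, using a made-up fragment in the general shape of a VerbNet class file:

```python
from xml.etree import ElementTree

vnclass = ElementTree.fromstring("""
<VNCLASS ID="put-9.1">
  <MEMBERS>
    <MEMBER name="put" wn="put%2:35:00"/>
    <MEMBER name="place" wn="place%2:35:00"/>
  </MEMBERS>
</VNCLASS>
""")

lemmas = [m.get("name") for m in vnclass.findall("MEMBERS/MEMBER")]
wordnetids = sum(
    (m.get("wn", "").split() for m in vnclass.findall("MEMBERS/MEMBER")), []
)
print(lemmas)       # ['put', 'place']
print(wordnetids)   # ['put%2:35:00', 'place%2:35:00']
```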
- If nothing is specified, return all classids within VerbNet - """ - if fileid is not None: - return [c for (c, f) in self._class_to_fileid.items() if f == fileid] - elif lemma is not None: - return self._lemma_to_class[lemma] - elif wordnetid is not None: - return self._wordnet_to_class[wordnetid] - elif classid is not None: - xmltree = self.vnclass(classid) - return [ - subclass.get("ID") - for subclass in xmltree.findall("SUBCLASSES/VNSUBCLASS") - ] - else: - return sorted(self._class_to_fileid.keys()) - - def vnclass(self, fileid_or_classid): - """Returns VerbNet class ElementTree - - Return an ElementTree containing the xml for the specified - VerbNet class. - - :param fileid_or_classid: An identifier specifying which class - should be returned. Can be a file identifier (such as - ``'put-9.1.xml'``), or a VerbNet class identifier (such as - ``'put-9.1'``) or a short VerbNet class identifier (such as - ``'9.1'``). - """ - # File identifier: just return the xml. - if fileid_or_classid in self._fileids: - return self.xml(fileid_or_classid) - - # Class identifier: get the xml, and find the right elt. - classid = self.longid(fileid_or_classid) - if classid in self._class_to_fileid: - fileid = self._class_to_fileid[self.longid(classid)] - tree = self.xml(fileid) - if classid == tree.get("ID"): - return tree - else: - for subclass in tree.findall(".//VNSUBCLASS"): - if classid == subclass.get("ID"): - return subclass - else: - assert False # we saw it during _index()! - - else: - raise ValueError(f"Unknown identifier {fileid_or_classid}") - - def fileids(self, vnclass_ids=None): - """ - Return a list of fileids that make up this corpus. If - ``vnclass_ids`` is specified, then return the fileids that make - up the specified VerbNet class(es). - """ - if vnclass_ids is None: - return self._fileids - elif isinstance(vnclass_ids, str): - return [self._class_to_fileid[self.longid(vnclass_ids)]] - else: - return [ - self._class_to_fileid[self.longid(vnclass_id)] - for vnclass_id in vnclass_ids - ] - - def frames(self, vnclass): - """Given a VerbNet class, this method returns VerbNet frames - - The members returned are: - 1) Example - 2) Description - 3) Syntax - 4) Semantics - - :param vnclass: A VerbNet class identifier; or an ElementTree - containing the xml contents of a VerbNet class. - :return: frames - a list of frame dictionaries - """ - if isinstance(vnclass, str): - vnclass = self.vnclass(vnclass) - frames = [] - vnframes = vnclass.findall("FRAMES/FRAME") - for vnframe in vnframes: - frames.append( - { - "example": self._get_example_within_frame(vnframe), - "description": self._get_description_within_frame(vnframe), - "syntax": self._get_syntactic_list_within_frame(vnframe), - "semantics": self._get_semantics_within_frame(vnframe), - } - ) - return frames - - def subclasses(self, vnclass): - """Returns subclass ids, if any exist - - Given a VerbNet class, this method returns subclass ids (if they exist) - in a list of strings. - - :param vnclass: A VerbNet class identifier; or an ElementTree - containing the xml contents of a VerbNet class. 
- :return: list of subclasses - """ - if isinstance(vnclass, str): - vnclass = self.vnclass(vnclass) - - subclasses = [ - subclass.get("ID") for subclass in vnclass.findall("SUBCLASSES/VNSUBCLASS") - ] - return subclasses - - def themroles(self, vnclass): - """Returns thematic roles participating in a VerbNet class - - Members returned as part of roles are- - 1) Type - 2) Modifiers - - :param vnclass: A VerbNet class identifier; or an ElementTree - containing the xml contents of a VerbNet class. - :return: themroles: A list of thematic roles in the VerbNet class - """ - if isinstance(vnclass, str): - vnclass = self.vnclass(vnclass) - - themroles = [] - for trole in vnclass.findall("THEMROLES/THEMROLE"): - themroles.append( - { - "type": trole.get("type"), - "modifiers": [ - {"value": restr.get("Value"), "type": restr.get("type")} - for restr in trole.findall("SELRESTRS/SELRESTR") - ], - } - ) - return themroles - - ###################################################################### - # { Index Initialization - ###################################################################### - - def _index(self): - """ - Initialize the indexes ``_lemma_to_class``, - ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning - through the corpus fileids. This is fast if ElementTree - uses the C implementation (<0.1 secs), but quite slow (>10 secs) - if only the python implementation is available. - """ - for fileid in self._fileids: - self._index_helper(self.xml(fileid), fileid) - - def _index_helper(self, xmltree, fileid): - """Helper for ``_index()``""" - vnclass = xmltree.get("ID") - self._class_to_fileid[vnclass] = fileid - self._shortid_to_longid[self.shortid(vnclass)] = vnclass - for member in xmltree.findall("MEMBERS/MEMBER"): - self._lemma_to_class[member.get("name")].append(vnclass) - for wn in member.get("wn", "").split(): - self._wordnet_to_class[wn].append(vnclass) - for subclass in xmltree.findall("SUBCLASSES/VNSUBCLASS"): - self._index_helper(subclass, fileid) - - def _quick_index(self): - """ - Initialize the indexes ``_lemma_to_class``, - ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning - through the corpus fileids. This doesn't do proper xml parsing, - but is good enough to find everything in the standard VerbNet - corpus -- and it runs about 30 times faster than xml parsing - (with the python ElementTree; only 2-3 times faster - if ElementTree uses the C implementation). - """ - # nb: if we got rid of wordnet_to_class, this would run 2-3 - # times faster. - for fileid in self._fileids: - vnclass = fileid[:-4] # strip the '.xml' - self._class_to_fileid[vnclass] = fileid - self._shortid_to_longid[self.shortid(vnclass)] = vnclass - with self.open(fileid) as fp: - for m in self._INDEX_RE.finditer(fp.read()): - groups = m.groups() - if groups[0] is not None: - self._lemma_to_class[groups[0]].append(vnclass) - for wn in groups[1].split(): - self._wordnet_to_class[wn].append(vnclass) - elif groups[2] is not None: - self._class_to_fileid[groups[2]] = fileid - vnclass = groups[2] # for elts. - self._shortid_to_longid[self.shortid(vnclass)] = vnclass - else: - assert False, "unexpected match condition" - - ###################################################################### - # { Identifier conversion - ###################################################################### - - def longid(self, shortid): - """Returns longid of a VerbNet class - - Given a short VerbNet class identifier (eg '37.10'), map it - to a long id (eg 'confess-37.10'). 
If ``shortid`` is already a - long id, then return it as-is""" - if self._LONGID_RE.match(shortid): - return shortid # it's already a longid. - elif not self._SHORTID_RE.match(shortid): - raise ValueError("vnclass identifier %r not found" % shortid) - try: - return self._shortid_to_longid[shortid] - except KeyError as e: - raise ValueError("vnclass identifier %r not found" % shortid) from e - - def shortid(self, longid): - """Returns shortid of a VerbNet class - - Given a long VerbNet class identifier (eg 'confess-37.10'), - map it to a short id (eg '37.10'). If ``longid`` is already a - short id, then return it as-is.""" - if self._SHORTID_RE.match(longid): - return longid # it's already a shortid. - m = self._LONGID_RE.match(longid) - if m: - return m.group(2) - else: - raise ValueError("vnclass identifier %r not found" % longid) - - ###################################################################### - # { Frame access utility functions - ###################################################################### - - def _get_semantics_within_frame(self, vnframe): - """Returns semantics within a single frame - - A utility function to retrieve semantics within a frame in VerbNet - Members of the semantics dictionary: - 1) Predicate value - 2) Arguments - - :param vnframe: An ElementTree containing the xml contents of - a VerbNet frame. - :return: semantics: semantics dictionary - """ - semantics_within_single_frame = [] - for pred in vnframe.findall("SEMANTICS/PRED"): - arguments = [ - {"type": arg.get("type"), "value": arg.get("value")} - for arg in pred.findall("ARGS/ARG") - ] - semantics_within_single_frame.append( - { - "predicate_value": pred.get("value"), - "arguments": arguments, - "negated": pred.get("bool") == "!", - } - ) - return semantics_within_single_frame - - def _get_example_within_frame(self, vnframe): - """Returns example within a frame - - A utility function to retrieve an example within a frame in VerbNet. - - :param vnframe: An ElementTree containing the xml contents of - a VerbNet frame. - :return: example_text: The example sentence for this particular frame - """ - example_element = vnframe.find("EXAMPLES/EXAMPLE") - if example_element is not None: - example_text = example_element.text - else: - example_text = "" - return example_text - - def _get_description_within_frame(self, vnframe): - """Returns member description within frame - - A utility function to retrieve a description of participating members - within a frame in VerbNet. - - :param vnframe: An ElementTree containing the xml contents of - a VerbNet frame. - :return: description: a description dictionary with members - primary and secondary - """ - description_element = vnframe.find("DESCRIPTION") - return { - "primary": description_element.attrib["primary"], - "secondary": description_element.get("secondary", ""), - } - - def _get_syntactic_list_within_frame(self, vnframe): - """Returns semantics within a frame - - A utility function to retrieve semantics within a frame in VerbNet. - Members of the syntactic dictionary: - 1) POS Tag - 2) Modifiers - - :param vnframe: An ElementTree containing the xml contents of - a VerbNet frame. 
- :return: syntax_within_single_frame - """ - syntax_within_single_frame = [] - for elt in vnframe.find("SYNTAX"): - pos_tag = elt.tag - modifiers = dict() - modifiers["value"] = elt.get("value") if "value" in elt.attrib else "" - modifiers["selrestrs"] = [ - {"value": restr.get("Value"), "type": restr.get("type")} - for restr in elt.findall("SELRESTRS/SELRESTR") - ] - modifiers["synrestrs"] = [ - {"value": restr.get("Value"), "type": restr.get("type")} - for restr in elt.findall("SYNRESTRS/SYNRESTR") - ] - syntax_within_single_frame.append( - {"pos_tag": pos_tag, "modifiers": modifiers} - ) - return syntax_within_single_frame - - ###################################################################### - # { Pretty Printing - ###################################################################### - - def pprint(self, vnclass): - """Returns pretty printed version of a VerbNet class - - Return a string containing a pretty-printed representation of - the given VerbNet class. - - :param vnclass: A VerbNet class identifier; or an ElementTree - containing the xml contents of a VerbNet class. - """ - if isinstance(vnclass, str): - vnclass = self.vnclass(vnclass) - - s = vnclass.get("ID") + "\n" - s += self.pprint_subclasses(vnclass, indent=" ") + "\n" - s += self.pprint_members(vnclass, indent=" ") + "\n" - s += " Thematic roles:\n" - s += self.pprint_themroles(vnclass, indent=" ") + "\n" - s += " Frames:\n" - s += self.pprint_frames(vnclass, indent=" ") - return s - - def pprint_subclasses(self, vnclass, indent=""): - """Returns pretty printed version of subclasses of VerbNet class - - Return a string containing a pretty-printed representation of - the given VerbNet class's subclasses. - - :param vnclass: A VerbNet class identifier; or an ElementTree - containing the xml contents of a VerbNet class. - """ - if isinstance(vnclass, str): - vnclass = self.vnclass(vnclass) - - subclasses = self.subclasses(vnclass) - if not subclasses: - subclasses = ["(none)"] - s = "Subclasses: " + " ".join(subclasses) - return textwrap.fill( - s, 70, initial_indent=indent, subsequent_indent=indent + " " - ) - - def pprint_members(self, vnclass, indent=""): - """Returns pretty printed version of members in a VerbNet class - - Return a string containing a pretty-printed representation of - the given VerbNet class's member verbs. - - :param vnclass: A VerbNet class identifier; or an ElementTree - containing the xml contents of a VerbNet class. - """ - if isinstance(vnclass, str): - vnclass = self.vnclass(vnclass) - - members = self.lemmas(vnclass) - if not members: - members = ["(none)"] - s = "Members: " + " ".join(members) - return textwrap.fill( - s, 70, initial_indent=indent, subsequent_indent=indent + " " - ) - - def pprint_themroles(self, vnclass, indent=""): - """Returns pretty printed version of thematic roles in a VerbNet class - - Return a string containing a pretty-printed representation of - the given VerbNet class's thematic roles. - - :param vnclass: A VerbNet class identifier; or an ElementTree - containing the xml contents of a VerbNet class. 
- """ - if isinstance(vnclass, str): - vnclass = self.vnclass(vnclass) - - pieces = [] - for themrole in self.themroles(vnclass): - piece = indent + "* " + themrole.get("type") - modifiers = [ - modifier["value"] + modifier["type"] - for modifier in themrole["modifiers"] - ] - if modifiers: - piece += "[{}]".format(" ".join(modifiers)) - pieces.append(piece) - return "\n".join(pieces) - - def pprint_frames(self, vnclass, indent=""): - """Returns pretty version of all frames in a VerbNet class - - Return a string containing a pretty-printed representation of - the list of frames within the VerbNet class. - - :param vnclass: A VerbNet class identifier; or an ElementTree - containing the xml contents of a VerbNet class. - """ - if isinstance(vnclass, str): - vnclass = self.vnclass(vnclass) - pieces = [] - for vnframe in self.frames(vnclass): - pieces.append(self._pprint_single_frame(vnframe, indent)) - return "\n".join(pieces) - - def _pprint_single_frame(self, vnframe, indent=""): - """Returns pretty printed version of a single frame in a VerbNet class - - Returns a string containing a pretty-printed representation of - the given frame. - - :param vnframe: An ElementTree containing the xml contents of - a VerbNet frame. - """ - frame_string = self._pprint_description_within_frame(vnframe, indent) + "\n" - frame_string += self._pprint_example_within_frame(vnframe, indent + " ") + "\n" - frame_string += ( - self._pprint_syntax_within_frame(vnframe, indent + " Syntax: ") + "\n" - ) - frame_string += indent + " Semantics:\n" - frame_string += self._pprint_semantics_within_frame(vnframe, indent + " ") - return frame_string - - def _pprint_example_within_frame(self, vnframe, indent=""): - """Returns pretty printed version of example within frame in a VerbNet class - - Return a string containing a pretty-printed representation of - the given VerbNet frame example. - - :param vnframe: An ElementTree containing the xml contents of - a Verbnet frame. - """ - if vnframe["example"]: - return indent + " Example: " + vnframe["example"] - - def _pprint_description_within_frame(self, vnframe, indent=""): - """Returns pretty printed version of a VerbNet frame description - - Return a string containing a pretty-printed representation of - the given VerbNet frame description. - - :param vnframe: An ElementTree containing the xml contents of - a VerbNet frame. - """ - description = indent + vnframe["description"]["primary"] - if vnframe["description"]["secondary"]: - description += " ({})".format(vnframe["description"]["secondary"]) - return description - - def _pprint_syntax_within_frame(self, vnframe, indent=""): - """Returns pretty printed version of syntax within a frame in a VerbNet class - - Return a string containing a pretty-printed representation of - the given VerbNet frame syntax. - - :param vnframe: An ElementTree containing the xml contents of - a VerbNet frame. 
- """ - pieces = [] - for element in vnframe["syntax"]: - piece = element["pos_tag"] - modifier_list = [] - if "value" in element["modifiers"] and element["modifiers"]["value"]: - modifier_list.append(element["modifiers"]["value"]) - modifier_list += [ - "{}{}".format(restr["value"], restr["type"]) - for restr in ( - element["modifiers"]["selrestrs"] - + element["modifiers"]["synrestrs"] - ) - ] - if modifier_list: - piece += "[{}]".format(" ".join(modifier_list)) - pieces.append(piece) - - return indent + " ".join(pieces) - - def _pprint_semantics_within_frame(self, vnframe, indent=""): - """Returns a pretty printed version of semantics within frame in a VerbNet class - - Return a string containing a pretty-printed representation of - the given VerbNet frame semantics. - - :param vnframe: An ElementTree containing the xml contents of - a VerbNet frame. - """ - pieces = [] - for predicate in vnframe["semantics"]: - arguments = [argument["value"] for argument in predicate["arguments"]] - pieces.append( - f"{'¬' if predicate['negated'] else ''}{predicate['predicate_value']}({', '.join(arguments)})" - ) - return "\n".join(f"{indent}* {piece}" for piece in pieces) diff --git a/pipeline/nltk/corpus/reader/wordlist.py b/pipeline/nltk/corpus/reader/wordlist.py deleted file mode 100644 index aced7e83fc7c48027d4d1eeb6aca46531ab57969..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/wordlist.py +++ /dev/null @@ -1,166 +0,0 @@ -# Natural Language Toolkit: Word List Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# Edward Loper -# URL: -# For license information, see LICENSE.TXT -from nltk.corpus.reader.api import * -from nltk.corpus.reader.util import * -from nltk.tokenize import line_tokenize - - -class WordListCorpusReader(CorpusReader): - """ - List of words, one per line. Blank lines are ignored. - """ - - def words(self, fileids=None, ignore_lines_startswith="\n"): - return [ - line - for line in line_tokenize(self.raw(fileids)) - if not line.startswith(ignore_lines_startswith) - ] - - -class SwadeshCorpusReader(WordListCorpusReader): - def entries(self, fileids=None): - """ - :return: a tuple of words for the specified fileids. - """ - if not fileids: - fileids = self.fileids() - - wordlists = [self.words(f) for f in fileids] - return list(zip(*wordlists)) - - -class NonbreakingPrefixesCorpusReader(WordListCorpusReader): - """ - This is a class to read the nonbreaking prefixes textfiles from the - Moses Machine Translation toolkit. These lists are used in the Python port - of the Moses' word tokenizer. - """ - - available_langs = { - "catalan": "ca", - "czech": "cs", - "german": "de", - "greek": "el", - "english": "en", - "spanish": "es", - "finnish": "fi", - "french": "fr", - "hungarian": "hu", - "icelandic": "is", - "italian": "it", - "latvian": "lv", - "dutch": "nl", - "polish": "pl", - "portuguese": "pt", - "romanian": "ro", - "russian": "ru", - "slovak": "sk", - "slovenian": "sl", - "swedish": "sv", - "tamil": "ta", - } - # Also, add the lang IDs as the keys. - available_langs.update({v: v for v in available_langs.values()}) - - def words(self, lang=None, fileids=None, ignore_lines_startswith="#"): - """ - This module returns a list of nonbreaking prefixes for the specified - language(s). 
- - >>> from nltk.corpus import nonbreaking_prefixes as nbp - >>> nbp.words('en')[:10] == [u'A', u'B', u'C', u'D', u'E', u'F', u'G', u'H', u'I', u'J'] - True - >>> nbp.words('ta')[:5] == [u'\u0b85', u'\u0b86', u'\u0b87', u'\u0b88', u'\u0b89'] - True - - :return: a list words for the specified language(s). - """ - # If *lang* in list of languages available, allocate apt fileid. - # Otherwise, the function returns non-breaking prefixes for - # all languages when fileids==None. - if lang in self.available_langs: - lang = self.available_langs[lang] - fileids = ["nonbreaking_prefix." + lang] - return [ - line - for line in line_tokenize(self.raw(fileids)) - if not line.startswith(ignore_lines_startswith) - ] - - -class UnicharsCorpusReader(WordListCorpusReader): - """ - This class is used to read lists of characters from the Perl Unicode - Properties (see https://perldoc.perl.org/perluniprops.html). - The files in the perluniprop.zip are extracted using the Unicode::Tussle - module from https://search.cpan.org/~bdfoy/Unicode-Tussle-1.11/lib/Unicode/Tussle.pm - """ - - # These are categories similar to the Perl Unicode Properties - available_categories = [ - "Close_Punctuation", - "Currency_Symbol", - "IsAlnum", - "IsAlpha", - "IsLower", - "IsN", - "IsSc", - "IsSo", - "IsUpper", - "Line_Separator", - "Number", - "Open_Punctuation", - "Punctuation", - "Separator", - "Symbol", - ] - - def chars(self, category=None, fileids=None): - """ - This module returns a list of characters from the Perl Unicode Properties. - They are very useful when porting Perl tokenizers to Python. - - >>> from nltk.corpus import perluniprops as pup - >>> pup.chars('Open_Punctuation')[:5] == [u'(', u'[', u'{', u'\u0f3a', u'\u0f3c'] - True - >>> pup.chars('Currency_Symbol')[:5] == [u'$', u'\xa2', u'\xa3', u'\xa4', u'\xa5'] - True - >>> pup.available_categories - ['Close_Punctuation', 'Currency_Symbol', 'IsAlnum', 'IsAlpha', 'IsLower', 'IsN', 'IsSc', 'IsSo', 'IsUpper', 'Line_Separator', 'Number', 'Open_Punctuation', 'Punctuation', 'Separator', 'Symbol'] - - :return: a list of characters given the specific unicode character category - """ - if category in self.available_categories: - fileids = [category + ".txt"] - return list(self.raw(fileids).strip()) - - -class MWAPPDBCorpusReader(WordListCorpusReader): - """ - This class is used to read the list of word pairs from the subset of lexical - pairs of The Paraphrase Database (PPDB) XXXL used in the Monolingual Word - Alignment (MWA) algorithm described in Sultan et al. (2014a, 2014b, 2015): - - - http://acl2014.org/acl2014/Q14/pdf/Q14-1017 - - https://www.aclweb.org/anthology/S14-2039 - - https://www.aclweb.org/anthology/S15-2027 - - The original source of the full PPDB corpus can be found on - https://www.cis.upenn.edu/~ccb/ppdb/ - - :return: a list of tuples of similar lexical terms. - """ - - mwa_ppdb_xxxl_file = "ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs" - - def entries(self, fileids=mwa_ppdb_xxxl_file): - """ - :return: a tuple of synonym word pairs. 
- """ - return [tuple(line.split("\t")) for line in line_tokenize(self.raw(fileids))] diff --git a/pipeline/nltk/corpus/reader/wordnet.py b/pipeline/nltk/corpus/reader/wordnet.py deleted file mode 100644 index f10c3436dde87850528529b4ab1b4cf6413a1bce..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/wordnet.py +++ /dev/null @@ -1,2489 +0,0 @@ -# Natural Language Toolkit: WordNet -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bethard -# Steven Bird -# Edward Loper -# Nitin Madnani -# Nasruddin A’aidil Shari -# Sim Wei Ying Geraldine -# Soe Lynn -# Francis Bond -# Eric Kafe - -# URL: -# For license information, see LICENSE.TXT - -""" -An NLTK interface for WordNet - -WordNet is a lexical database of English. -Using synsets, helps find conceptual relationships between words -such as hypernyms, hyponyms, synonyms, antonyms etc. - -For details about WordNet see: -https://wordnet.princeton.edu/ - -This module also allows you to find lemmas in languages -other than English from the Open Multilingual Wordnet -https://omwn.org/ - -""" - -import math -import os -import re -import warnings -from collections import defaultdict, deque -from functools import total_ordering -from itertools import chain, islice -from operator import itemgetter - -from nltk.corpus.reader import CorpusReader -from nltk.internals import deprecated -from nltk.probability import FreqDist -from nltk.util import binary_search_file as _binary_search_file - -###################################################################### -# Table of Contents -###################################################################### -# - Constants -# - Data Classes -# - WordNetError -# - Lemma -# - Synset -# - WordNet Corpus Reader -# - WordNet Information Content Corpus Reader -# - Similarity Metrics -# - Demo - -###################################################################### -# Constants -###################################################################### - -#: Positive infinity (for similarity functions) -_INF = 1e300 - -# { Part-of-speech constants -ADJ, ADJ_SAT, ADV, NOUN, VERB = "a", "s", "r", "n", "v" -# } - -POS_LIST = [NOUN, VERB, ADJ, ADV] - -# A table of strings that are used to express verb frames. 
-VERB_FRAME_STRINGS = ( - None, - "Something %s", - "Somebody %s", - "It is %sing", - "Something is %sing PP", - "Something %s something Adjective/Noun", - "Something %s Adjective/Noun", - "Somebody %s Adjective", - "Somebody %s something", - "Somebody %s somebody", - "Something %s somebody", - "Something %s something", - "Something %s to somebody", - "Somebody %s on something", - "Somebody %s somebody something", - "Somebody %s something to somebody", - "Somebody %s something from somebody", - "Somebody %s somebody with something", - "Somebody %s somebody of something", - "Somebody %s something on somebody", - "Somebody %s somebody PP", - "Somebody %s something PP", - "Somebody %s PP", - "Somebody's (body part) %s", - "Somebody %s somebody to INFINITIVE", - "Somebody %s somebody INFINITIVE", - "Somebody %s that CLAUSE", - "Somebody %s to somebody", - "Somebody %s to INFINITIVE", - "Somebody %s whether INFINITIVE", - "Somebody %s somebody into V-ing something", - "Somebody %s something with something", - "Somebody %s INFINITIVE", - "Somebody %s VERB-ing", - "It %s that CLAUSE", - "Something %s INFINITIVE", - # OEWN additions: - "Somebody %s at something", - "Somebody %s for something", - "Somebody %s on somebody", - "Somebody %s out of somebody", -) - -SENSENUM_RE = re.compile(r"\.[\d]+\.") - - -###################################################################### -# Data Classes -###################################################################### - - -class WordNetError(Exception): - """An exception class for wordnet-related errors.""" - - -@total_ordering -class _WordNetObject: - """A common base class for lemmas and synsets.""" - - def hypernyms(self): - return self._related("@") - - def _hypernyms(self): - return self._related("@") - - def instance_hypernyms(self): - return self._related("@i") - - def _instance_hypernyms(self): - return self._related("@i") - - def hyponyms(self): - return self._related("~") - - def instance_hyponyms(self): - return self._related("~i") - - def member_holonyms(self): - return self._related("#m") - - def substance_holonyms(self): - return self._related("#s") - - def part_holonyms(self): - return self._related("#p") - - def member_meronyms(self): - return self._related("%m") - - def substance_meronyms(self): - return self._related("%s") - - def part_meronyms(self): - return self._related("%p") - - def topic_domains(self): - return self._related(";c") - - def in_topic_domains(self): - return self._related("-c") - - def region_domains(self): - return self._related(";r") - - def in_region_domains(self): - return self._related("-r") - - def usage_domains(self): - return self._related(";u") - - def in_usage_domains(self): - return self._related("-u") - - def attributes(self): - return self._related("=") - - def entailments(self): - return self._related("*") - - def causes(self): - return self._related(">") - - def also_sees(self): - return self._related("^") - - def verb_groups(self): - return self._related("$") - - def similar_tos(self): - return self._related("&") - - def __hash__(self): - return hash(self._name) - - def __eq__(self, other): - return self._name == other._name - - def __ne__(self, other): - return self._name != other._name - - def __lt__(self, other): - return self._name < other._name - - -class Lemma(_WordNetObject): - """ - The lexical entry for a single morphological form of a - sense-disambiguated word. - - Create a Lemma from a "..." 
string where: - is the morphological stem identifying the synset - is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB - is the sense number, counting from 0. - is the morphological form of interest - - Note that and can be different, e.g. the Synset - 'salt.n.03' has the Lemmas 'salt.n.03.salt', 'salt.n.03.saltiness' and - 'salt.n.03.salinity'. - - Lemma attributes, accessible via methods with the same name: - - - name: The canonical name of this lemma. - - synset: The synset that this lemma belongs to. - - syntactic_marker: For adjectives, the WordNet string identifying the - syntactic position relative modified noun. See: - https://wordnet.princeton.edu/documentation/wninput5wn - For all other parts of speech, this attribute is None. - - count: The frequency of this lemma in wordnet. - - Lemma methods: - - Lemmas have the following methods for retrieving related Lemmas. They - correspond to the names for the pointer symbols defined here: - https://wordnet.princeton.edu/documentation/wninput5wn - These methods all return lists of Lemmas: - - - antonyms - - hypernyms, instance_hypernyms - - hyponyms, instance_hyponyms - - member_holonyms, substance_holonyms, part_holonyms - - member_meronyms, substance_meronyms, part_meronyms - - topic_domains, region_domains, usage_domains - - attributes - - derivationally_related_forms - - entailments - - causes - - also_sees - - verb_groups - - similar_tos - - pertainyms - """ - - __slots__ = [ - "_wordnet_corpus_reader", - "_name", - "_syntactic_marker", - "_synset", - "_frame_strings", - "_frame_ids", - "_lexname_index", - "_lex_id", - "_lang", - "_key", - ] - - def __init__( - self, - wordnet_corpus_reader, - synset, - name, - lexname_index, - lex_id, - syntactic_marker, - ): - self._wordnet_corpus_reader = wordnet_corpus_reader - self._name = name - self._syntactic_marker = syntactic_marker - self._synset = synset - self._frame_strings = [] - self._frame_ids = [] - self._lexname_index = lexname_index - self._lex_id = lex_id - self._lang = "eng" - - self._key = None # gets set later. - - def name(self): - return self._name - - def syntactic_marker(self): - return self._syntactic_marker - - def synset(self): - return self._synset - - def frame_strings(self): - return self._frame_strings - - def frame_ids(self): - return self._frame_ids - - def lang(self): - return self._lang - - def key(self): - return self._key - - def __repr__(self): - tup = type(self).__name__, self._synset._name, self._name - return "%s('%s.%s')" % tup - - def _related(self, relation_symbol): - get_synset = self._wordnet_corpus_reader.synset_from_pos_and_offset - if (self._name, relation_symbol) not in self._synset._lemma_pointers: - return [] - return [ - get_synset(pos, offset)._lemmas[lemma_index] - for pos, offset, lemma_index in self._synset._lemma_pointers[ - self._name, relation_symbol - ] - ] - - def count(self): - """Return the frequency count for this Lemma""" - return self._wordnet_corpus_reader.lemma_count(self) - - def antonyms(self): - return self._related("!") - - def derivationally_related_forms(self): - return self._related("+") - - def pertainyms(self): - return self._related("\\") - - -class Synset(_WordNetObject): - """Create a Synset from a ".." string where: - is the word's morphological stem - is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB - is the sense number, counting from 0. 
- - Synset attributes, accessible via methods with the same name: - - - name: The canonical name of this synset, formed using the first lemma - of this synset. Note that this may be different from the name - passed to the constructor if that string used a different lemma to - identify the synset. - - pos: The synset's part of speech, matching one of the module level - attributes ADJ, ADJ_SAT, ADV, NOUN or VERB. - - lemmas: A list of the Lemma objects for this synset. - - definition: The definition for this synset. - - examples: A list of example strings for this synset. - - offset: The offset in the WordNet dict file of this synset. - - lexname: The name of the lexicographer file containing this synset. - - Synset methods: - - Synsets have the following methods for retrieving related Synsets. - They correspond to the names for the pointer symbols defined here: - https://wordnet.princeton.edu/documentation/wninput5wn - These methods all return lists of Synsets. - - - hypernyms, instance_hypernyms - - hyponyms, instance_hyponyms - - member_holonyms, substance_holonyms, part_holonyms - - member_meronyms, substance_meronyms, part_meronyms - - attributes - - entailments - - causes - - also_sees - - verb_groups - - similar_tos - - Additionally, Synsets support the following methods specific to the - hypernym relation: - - - root_hypernyms - - common_hypernyms - - lowest_common_hypernyms - - Note that Synsets do not support the following relations because - these are defined by WordNet as lexical relations: - - - antonyms - - derivationally_related_forms - - pertainyms - """ - - __slots__ = [ - "_pos", - "_offset", - "_name", - "_frame_ids", - "_lemmas", - "_lemma_names", - "_definition", - "_examples", - "_lexname", - "_pointers", - "_lemma_pointers", - "_max_depth", - "_min_depth", - ] - - def __init__(self, wordnet_corpus_reader): - self._wordnet_corpus_reader = wordnet_corpus_reader - # All of these attributes get initialized by - # WordNetCorpusReader._synset_from_pos_and_line() - - self._pos = None - self._offset = None - self._name = None - self._frame_ids = [] - self._lemmas = [] - self._lemma_names = [] - self._definition = None - self._examples = [] - self._lexname = None # lexicographer name - self._all_hypernyms = None - - self._pointers = defaultdict(set) - self._lemma_pointers = defaultdict(list) - - def pos(self): - return self._pos - - def offset(self): - return self._offset - - def name(self): - return self._name - - def frame_ids(self): - return self._frame_ids - - def _doc(self, doc_type, default, lang="eng"): - """Helper method for Synset.definition and Synset.examples""" - corpus = self._wordnet_corpus_reader - if lang not in corpus.langs(): - return None - elif lang == "eng": - return default - else: - corpus._load_lang_data(lang) - of = corpus.ss2of(self) - i = corpus.lg_attrs.index(doc_type) - if of in corpus._lang_data[lang][i]: - return corpus._lang_data[lang][i][of] - else: - return None - - def definition(self, lang="eng"): - """Return definition in specified language""" - return self._doc("def", self._definition, lang=lang) - - def examples(self, lang="eng"): - """Return examples in specified language""" - return self._doc("exe", self._examples, lang=lang) - - def lexname(self): - return self._lexname - - def _needs_root(self): - if self._pos == NOUN and self._wordnet_corpus_reader.get_version() != "1.6": - return False - else: - return True - - def lemma_names(self, lang="eng"): - """Return all the lemma_names associated with the synset""" - if lang == "eng": - 
return self._lemma_names - else: - reader = self._wordnet_corpus_reader - reader._load_lang_data(lang) - i = reader.ss2of(self) - if i in reader._lang_data[lang][0]: - return reader._lang_data[lang][0][i] - else: - return [] - - def lemmas(self, lang="eng"): - """Return all the lemma objects associated with the synset""" - if lang == "eng": - return self._lemmas - elif self._name: - self._wordnet_corpus_reader._load_lang_data(lang) - lemmark = [] - lemmy = self.lemma_names(lang) - for lem in lemmy: - temp = Lemma( - self._wordnet_corpus_reader, - self, - lem, - self._wordnet_corpus_reader._lexnames.index(self.lexname()), - 0, - None, - ) - temp._lang = lang - lemmark.append(temp) - return lemmark - - def root_hypernyms(self): - """Get the topmost hypernyms of this synset in WordNet.""" - - result = [] - seen = set() - todo = [self] - while todo: - next_synset = todo.pop() - if next_synset not in seen: - seen.add(next_synset) - next_hypernyms = ( - next_synset.hypernyms() + next_synset.instance_hypernyms() - ) - if not next_hypernyms: - result.append(next_synset) - else: - todo.extend(next_hypernyms) - return result - - # Simpler implementation which makes incorrect assumption that - # hypernym hierarchy is acyclic: - # - # if not self.hypernyms(): - # return [self] - # else: - # return list(set(root for h in self.hypernyms() - # for root in h.root_hypernyms())) - def max_depth(self): - """ - :return: The length of the longest hypernym path from this - synset to the root. - """ - - if "_max_depth" not in self.__dict__: - hypernyms = self.hypernyms() + self.instance_hypernyms() - if not hypernyms: - self._max_depth = 0 - else: - self._max_depth = 1 + max(h.max_depth() for h in hypernyms) - return self._max_depth - - def min_depth(self): - """ - :return: The length of the shortest hypernym path from this - synset to the root. - """ - - if "_min_depth" not in self.__dict__: - hypernyms = self.hypernyms() + self.instance_hypernyms() - if not hypernyms: - self._min_depth = 0 - else: - self._min_depth = 1 + min(h.min_depth() for h in hypernyms) - return self._min_depth - - def closure(self, rel, depth=-1): - """ - Return the transitive closure of source under the rel - relationship, breadth-first, discarding cycles: - - >>> from nltk.corpus import wordnet as wn - >>> computer = wn.synset('computer.n.01') - >>> topic = lambda s:s.topic_domains() - >>> print(list(computer.closure(topic))) - [Synset('computer_science.n.01')] - - UserWarning: Discarded redundant search for Synset('computer.n.01') at depth 2 - - - Include redundant paths (but only once), avoiding duplicate searches - (from 'animal.n.01' to 'entity.n.01'): - - >>> dog = wn.synset('dog.n.01') - >>> hyp = lambda s:s.hypernyms() - >>> print(list(dog.closure(hyp))) - [Synset('canine.n.02'), Synset('domestic_animal.n.01'), Synset('carnivore.n.01'),\ - Synset('animal.n.01'), Synset('placental.n.01'), Synset('organism.n.01'),\ - Synset('mammal.n.01'), Synset('living_thing.n.01'), Synset('vertebrate.n.01'),\ - Synset('whole.n.02'), Synset('chordate.n.01'), Synset('object.n.01'),\ - Synset('physical_entity.n.01'), Synset('entity.n.01')] - - UserWarning: Discarded redundant search for Synset('animal.n.01') at depth 7 - """ - - from nltk.util import acyclic_breadth_first - - for synset in acyclic_breadth_first(self, rel, depth): - if synset != self: - yield synset - - from nltk.util import acyclic_depth_first as acyclic_tree - from nltk.util import unweighted_minimum_spanning_tree as mst - - # Also add this shortcut? 
- # from nltk.util import unweighted_minimum_spanning_digraph as umsd - - def tree(self, rel, depth=-1, cut_mark=None): - """ - Return the full relation tree, including self, - discarding cycles: - - >>> from nltk.corpus import wordnet as wn - >>> from pprint import pprint - >>> computer = wn.synset('computer.n.01') - >>> topic = lambda s:s.topic_domains() - >>> pprint(computer.tree(topic)) - [Synset('computer.n.01'), [Synset('computer_science.n.01')]] - - UserWarning: Discarded redundant search for Synset('computer.n.01') at depth -3 - - - But keep duplicate branches (from 'animal.n.01' to 'entity.n.01'): - - >>> dog = wn.synset('dog.n.01') - >>> hyp = lambda s:s.hypernyms() - >>> pprint(dog.tree(hyp)) - [Synset('dog.n.01'), - [Synset('canine.n.02'), - [Synset('carnivore.n.01'), - [Synset('placental.n.01'), - [Synset('mammal.n.01'), - [Synset('vertebrate.n.01'), - [Synset('chordate.n.01'), - [Synset('animal.n.01'), - [Synset('organism.n.01'), - [Synset('living_thing.n.01'), - [Synset('whole.n.02'), - [Synset('object.n.01'), - [Synset('physical_entity.n.01'), - [Synset('entity.n.01')]]]]]]]]]]]]], - [Synset('domestic_animal.n.01'), - [Synset('animal.n.01'), - [Synset('organism.n.01'), - [Synset('living_thing.n.01'), - [Synset('whole.n.02'), - [Synset('object.n.01'), - [Synset('physical_entity.n.01'), [Synset('entity.n.01')]]]]]]]]] - """ - - from nltk.util import acyclic_branches_depth_first - - return acyclic_branches_depth_first(self, rel, depth, cut_mark) - - def hypernym_paths(self): - """ - Get the path(s) from this synset to the root, where each path is a - list of the synset nodes traversed on the way to the root. - - :return: A list of lists, where each list gives the node sequence - connecting the initial ``Synset`` node and a root node. - """ - paths = [] - - hypernyms = self.hypernyms() + self.instance_hypernyms() - if len(hypernyms) == 0: - paths = [[self]] - - for hypernym in hypernyms: - for ancestor_list in hypernym.hypernym_paths(): - ancestor_list.append(self) - paths.append(ancestor_list) - return paths - - def common_hypernyms(self, other): - """ - Find all synsets that are hypernyms of this synset and the - other synset. - - :type other: Synset - :param other: other input synset. - :return: The synsets that are hypernyms of both synsets. - """ - if not self._all_hypernyms: - self._all_hypernyms = { - self_synset - for self_synsets in self._iter_hypernym_lists() - for self_synset in self_synsets - } - if not other._all_hypernyms: - other._all_hypernyms = { - other_synset - for other_synsets in other._iter_hypernym_lists() - for other_synset in other_synsets - } - return list(self._all_hypernyms.intersection(other._all_hypernyms)) - - def lowest_common_hypernyms(self, other, simulate_root=False, use_min_depth=False): - """ - Get a list of lowest synset(s) that both synsets have as a hypernym. - When `use_min_depth == False` this means that the synset which appears - as a hypernym of both `self` and `other` with the lowest maximum depth - is returned or if there are multiple such synsets at the same depth - they are all returned - - However, if `use_min_depth == True` then the synset(s) which has/have - the lowest minimum depth and appear(s) in both paths is/are returned. - - By setting the use_min_depth flag to True, the behavior of NLTK2 can be - preserved. This was changed in NLTK3 to give more accurate results in a - small set of cases, generally with synsets concerning people. (eg: - 'chef.n.01', 'fireman.n.01', etc.) 
- - This method is an implementation of Ted Pedersen's "Lowest Common - Subsumer" method from the Perl Wordnet module. It can return either - "self" or "other" if they are a hypernym of the other. - - :type other: Synset - :param other: other input synset - :type simulate_root: bool - :param simulate_root: The various verb taxonomies do not - share a single root which disallows this metric from working for - synsets that are not connected. This flag (False by default) - creates a fake root that connects all the taxonomies. Set it - to True to enable this behavior. For the noun taxonomy, - there is usually a default root except for WordNet version 1.6. - If you are using wordnet 1.6, a fake root will need to be added - for nouns as well. - :type use_min_depth: bool - :param use_min_depth: This setting mimics older (v2) behavior of NLTK - wordnet If True, will use the min_depth function to calculate the - lowest common hypernyms. This is known to give strange results for - some synset pairs (eg: 'chef.n.01', 'fireman.n.01') but is retained - for backwards compatibility - :return: The synsets that are the lowest common hypernyms of both - synsets - """ - synsets = self.common_hypernyms(other) - if simulate_root: - fake_synset = Synset(None) - fake_synset._name = "*ROOT*" - fake_synset.hypernyms = lambda: [] - fake_synset.instance_hypernyms = lambda: [] - synsets.append(fake_synset) - - try: - if use_min_depth: - max_depth = max(s.min_depth() for s in synsets) - unsorted_lch = [s for s in synsets if s.min_depth() == max_depth] - else: - max_depth = max(s.max_depth() for s in synsets) - unsorted_lch = [s for s in synsets if s.max_depth() == max_depth] - return sorted(unsorted_lch) - except ValueError: - return [] - - def hypernym_distances(self, distance=0, simulate_root=False): - """ - Get the path(s) from this synset to the root, counting the distance - of each node from the initial node on the way. A set of - (synset, distance) tuples is returned. - - :type distance: int - :param distance: the distance (number of edges) from this hypernym to - the original hypernym ``Synset`` on which this method was called. - :return: A set of ``(Synset, int)`` tuples where each ``Synset`` is - a hypernym of the first ``Synset``. - """ - distances = {(self, distance)} - for hypernym in self._hypernyms() + self._instance_hypernyms(): - distances |= hypernym.hypernym_distances(distance + 1, simulate_root=False) - if simulate_root: - fake_synset = Synset(None) - fake_synset._name = "*ROOT*" - fake_synset_distance = max(distances, key=itemgetter(1))[1] - distances.add((fake_synset, fake_synset_distance + 1)) - return distances - - def _shortest_hypernym_paths(self, simulate_root): - if self._name == "*ROOT*": - return {self: 0} - - queue = deque([(self, 0)]) - path = {} - - while queue: - s, depth = queue.popleft() - if s in path: - continue - path[s] = depth - - depth += 1 - queue.extend((hyp, depth) for hyp in s._hypernyms()) - queue.extend((hyp, depth) for hyp in s._instance_hypernyms()) - - if simulate_root: - fake_synset = Synset(None) - fake_synset._name = "*ROOT*" - path[fake_synset] = max(path.values()) + 1 - - return path - - def shortest_path_distance(self, other, simulate_root=False): - """ - Returns the distance of the shortest path linking the two synsets (if - one exists). For each synset, all the ancestor nodes and their - distances are recorded and compared. The ancestor node common to both - synsets that can be reached with the minimum number of traversals is - used. 
If no ancestor nodes are common, None is returned. If a node is - compared with itself 0 is returned. - - :type other: Synset - :param other: The Synset to which the shortest path will be found. - :return: The number of edges in the shortest path connecting the two - nodes, or None if no path exists. - """ - - if self == other: - return 0 - - dist_dict1 = self._shortest_hypernym_paths(simulate_root) - dist_dict2 = other._shortest_hypernym_paths(simulate_root) - - # For each ancestor synset common to both subject synsets, find the - # connecting path length. Return the shortest of these. - - inf = float("inf") - path_distance = inf - for synset, d1 in dist_dict1.items(): - d2 = dist_dict2.get(synset, inf) - path_distance = min(path_distance, d1 + d2) - - return None if math.isinf(path_distance) else path_distance - - # interface to similarity methods - def path_similarity(self, other, verbose=False, simulate_root=True): - """ - Path Distance Similarity: - Return a score denoting how similar two word senses are, based on the - shortest path that connects the senses in the is-a (hypernym/hypnoym) - taxonomy. The score is in the range 0 to 1, except in those cases where - a path cannot be found (will only be true for verbs as there are many - distinct verb taxonomies), in which case None is returned. A score of - 1 represents identity i.e. comparing a sense with itself will return 1. - - :type other: Synset - :param other: The ``Synset`` that this ``Synset`` is being compared to. - :type simulate_root: bool - :param simulate_root: The various verb taxonomies do not - share a single root which disallows this metric from working for - synsets that are not connected. This flag (True by default) - creates a fake root that connects all the taxonomies. Set it - to false to disable this behavior. For the noun taxonomy, - there is usually a default root except for WordNet version 1.6. - If you are using wordnet 1.6, a fake root will be added for nouns - as well. - :return: A score denoting the similarity of the two ``Synset`` objects, - normally between 0 and 1. None is returned if no connecting path - could be found. 1 is returned if a ``Synset`` is compared with - itself. - """ - - distance = self.shortest_path_distance( - other, - simulate_root=simulate_root and (self._needs_root() or other._needs_root()), - ) - if distance is None or distance < 0: - return None - return 1.0 / (distance + 1) - - def lch_similarity(self, other, verbose=False, simulate_root=True): - """ - Leacock Chodorow Similarity: - Return a score denoting how similar two word senses are, based on the - shortest path that connects the senses (as above) and the maximum depth - of the taxonomy in which the senses occur. The relationship is given as - -log(p/2d) where p is the shortest path length and d is the taxonomy - depth. - - :type other: Synset - :param other: The ``Synset`` that this ``Synset`` is being compared to. - :type simulate_root: bool - :param simulate_root: The various verb taxonomies do not - share a single root which disallows this metric from working for - synsets that are not connected. This flag (True by default) - creates a fake root that connects all the taxonomies. Set it - to false to disable this behavior. For the noun taxonomy, - there is usually a default root except for WordNet version 1.6. - If you are using wordnet 1.6, a fake root will be added for nouns - as well. - :return: A score denoting the similarity of the two ``Synset`` objects, - normally greater than 0. 
None is returned if no connecting path - could be found. If a ``Synset`` is compared with itself, the - maximum score is returned, which varies depending on the taxonomy - depth. - """ - - if self._pos != other._pos: - raise WordNetError( - "Computing the lch similarity requires " - "%s and %s to have the same part of speech." % (self, other) - ) - - need_root = self._needs_root() - - if self._pos not in self._wordnet_corpus_reader._max_depth: - self._wordnet_corpus_reader._compute_max_depth(self._pos, need_root) - - depth = self._wordnet_corpus_reader._max_depth[self._pos] - - distance = self.shortest_path_distance( - other, simulate_root=simulate_root and need_root - ) - - if distance is None or distance < 0 or depth == 0: - return None - return -math.log((distance + 1) / (2.0 * depth)) - - def wup_similarity(self, other, verbose=False, simulate_root=True): - """ - Wu-Palmer Similarity: - Return a score denoting how similar two word senses are, based on the - depth of the two senses in the taxonomy and that of their Least Common - Subsumer (most specific ancestor node). Previously, the scores computed - by this implementation did _not_ always agree with those given by - Pedersen's Perl implementation of WordNet Similarity. However, with - the addition of the simulate_root flag (see below), the score for - verbs now almost always agree but not always for nouns. - - The LCS does not necessarily feature in the shortest path connecting - the two senses, as it is by definition the common ancestor deepest in - the taxonomy, not closest to the two senses. Typically, however, it - will so feature. Where multiple candidates for the LCS exist, that - whose shortest path to the root node is the longest will be selected. - Where the LCS has multiple paths to the root, the longer path is used - for the purposes of the calculation. - - :type other: Synset - :param other: The ``Synset`` that this ``Synset`` is being compared to. - :type simulate_root: bool - :param simulate_root: The various verb taxonomies do not - share a single root which disallows this metric from working for - synsets that are not connected. This flag (True by default) - creates a fake root that connects all the taxonomies. Set it - to false to disable this behavior. For the noun taxonomy, - there is usually a default root except for WordNet version 1.6. - If you are using wordnet 1.6, a fake root will be added for nouns - as well. - :return: A float score denoting the similarity of the two ``Synset`` - objects, normally greater than zero. If no connecting path between - the two senses can be found, None is returned. 
- - """ - need_root = self._needs_root() or other._needs_root() - - # Note that to preserve behavior from NLTK2 we set use_min_depth=True - # It is possible that more accurate results could be obtained by - # removing this setting and it should be tested later on - subsumers = self.lowest_common_hypernyms( - other, simulate_root=simulate_root and need_root, use_min_depth=True - ) - - # If no LCS was found return None - if len(subsumers) == 0: - return None - - subsumer = self if self in subsumers else subsumers[0] - - # Get the longest path from the LCS to the root, - # including a correction: - # - add one because the calculations include both the start and end - # nodes - depth = subsumer.max_depth() + 1 - - # Note: No need for an additional add-one correction for non-nouns - # to account for an imaginary root node because that is now - # automatically handled by simulate_root - # if subsumer._pos != NOUN: - # depth += 1 - - # Get the shortest path from the LCS to each of the synsets it is - # subsuming. Add this to the LCS path length to get the path - # length from each synset to the root. - len1 = self.shortest_path_distance( - subsumer, simulate_root=simulate_root and need_root - ) - len2 = other.shortest_path_distance( - subsumer, simulate_root=simulate_root and need_root - ) - if len1 is None or len2 is None: - return None - len1 += depth - len2 += depth - return (2.0 * depth) / (len1 + len2) - - def res_similarity(self, other, ic, verbose=False): - """ - Resnik Similarity: - Return a score denoting how similar two word senses are, based on the - Information Content (IC) of the Least Common Subsumer (most specific - ancestor node). - - :type other: Synset - :param other: The ``Synset`` that this ``Synset`` is being compared to. - :type ic: dict - :param ic: an information content object (as returned by - ``nltk.corpus.wordnet_ic.ic()``). - :return: A float score denoting the similarity of the two ``Synset`` - objects. Synsets whose LCS is the root node of the taxonomy will - have a score of 0 (e.g. N['dog'][0] and N['table'][0]). - """ - - ic1, ic2, lcs_ic = _lcs_ic(self, other, ic) - return lcs_ic - - def jcn_similarity(self, other, ic, verbose=False): - """ - Jiang-Conrath Similarity: - Return a score denoting how similar two word senses are, based on the - Information Content (IC) of the Least Common Subsumer (most specific - ancestor node) and that of the two input Synsets. The relationship is - given by the equation 1 / (IC(s1) + IC(s2) - 2 * IC(lcs)). - - :type other: Synset - :param other: The ``Synset`` that this ``Synset`` is being compared to. - :type ic: dict - :param ic: an information content object (as returned by - ``nltk.corpus.wordnet_ic.ic()``). - :return: A float score denoting the similarity of the two ``Synset`` - objects. - """ - - if self == other: - return _INF - - ic1, ic2, lcs_ic = _lcs_ic(self, other, ic) - - # If either of the input synsets are the root synset, or have a - # frequency of 0 (sparse data problem), return 0. - if ic1 == 0 or ic2 == 0: - return 0 - - ic_difference = ic1 + ic2 - 2 * lcs_ic - - if ic_difference == 0: - return _INF - - return 1 / ic_difference - - def lin_similarity(self, other, ic, verbose=False): - """ - Lin Similarity: - Return a score denoting how similar two word senses are, based on the - Information Content (IC) of the Least Common Subsumer (most specific - ancestor node) and that of the two input Synsets. The relationship is - given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)). 
- - :type other: Synset - :param other: The ``Synset`` that this ``Synset`` is being compared to. - :type ic: dict - :param ic: an information content object (as returned by - ``nltk.corpus.wordnet_ic.ic()``). - :return: A float score denoting the similarity of the two ``Synset`` - objects, in the range 0 to 1. - """ - - ic1, ic2, lcs_ic = _lcs_ic(self, other, ic) - return (2.0 * lcs_ic) / (ic1 + ic2) - - def _iter_hypernym_lists(self): - """ - :return: An iterator over ``Synset`` objects that are either proper - hypernyms or instance of hypernyms of the synset. - """ - todo = [self] - seen = set() - while todo: - for synset in todo: - seen.add(synset) - yield todo - todo = [ - hypernym - for synset in todo - for hypernym in (synset.hypernyms() + synset.instance_hypernyms()) - if hypernym not in seen - ] - - def __repr__(self): - return f"{type(self).__name__}('{self._name}')" - - def _related(self, relation_symbol, sort=True): - get_synset = self._wordnet_corpus_reader.synset_from_pos_and_offset - if relation_symbol not in self._pointers: - return [] - pointer_tuples = self._pointers[relation_symbol] - r = [get_synset(pos, offset) for pos, offset in pointer_tuples] - if sort: - r.sort() - return r - - -###################################################################### -# WordNet Corpus Reader -###################################################################### - - -class WordNetCorpusReader(CorpusReader): - """ - A corpus reader used to access wordnet or its variants. - """ - - _ENCODING = "utf8" - - # { Part-of-speech constants - ADJ, ADJ_SAT, ADV, NOUN, VERB = "a", "s", "r", "n", "v" - # } - - # { Filename constants - _FILEMAP = {ADJ: "adj", ADV: "adv", NOUN: "noun", VERB: "verb"} - # } - - # { Part of speech constants - _pos_numbers = {NOUN: 1, VERB: 2, ADJ: 3, ADV: 4, ADJ_SAT: 5} - _pos_names = dict(tup[::-1] for tup in _pos_numbers.items()) - # } - - #: A list of file identifiers for all the fileids used by this - #: corpus reader. - _FILES = ( - "cntlist.rev", - "lexnames", - "index.sense", - "index.adj", - "index.adv", - "index.noun", - "index.verb", - "data.adj", - "data.adv", - "data.noun", - "data.verb", - "adj.exc", - "adv.exc", - "noun.exc", - "verb.exc", - ) - - def __init__(self, root, omw_reader): - """ - Construct a new wordnet corpus reader, with the given root - directory. - """ - - super().__init__(root, self._FILES, encoding=self._ENCODING) - - # A index that provides the file offset - # Map from lemma -> pos -> synset_index -> offset - self._lemma_pos_offset_map = defaultdict(dict) - - # A cache so we don't have to reconstruct synsets - # Map from pos -> offset -> synset - self._synset_offset_cache = defaultdict(dict) - - # A lookup for the maximum depth of each part of speech. Useful for - # the lch similarity metric. - self._max_depth = defaultdict(dict) - - # Corpus reader containing omw data. - self._omw_reader = omw_reader - - # Corpus reader containing extended_omw data. 
- self._exomw_reader = None - - self.provenances = defaultdict(str) - self.provenances["eng"] = "" - - if self._omw_reader is None: - warnings.warn( - "The multilingual functions are not available with this Wordnet version" - ) - - self.omw_langs = set() - - # A cache to store the wordnet data of multiple languages - self._lang_data = defaultdict(list) - - self._data_file_map = {} - self._exception_map = {} - self._lexnames = [] - self._key_count_file = None - self._key_synset_file = None - - # Load the lexnames - with self.open("lexnames") as fp: - for i, line in enumerate(fp): - index, lexname, _ = line.split() - assert int(index) == i - self._lexnames.append(lexname) - - # Load the indices for lemmas and synset offsets - self._load_lemma_pos_offset_map() - - # load the exception file data into memory - self._load_exception_map() - - self.nomap = [] - self.splits = {} - - # map from WordNet 3.0 for OMW data - self.map30 = self.map_wn30() - - # Language data attributes - self.lg_attrs = ["lemma", "none", "def", "exe"] - - def index_sense(self, version=None): - """Read sense key to synset id mapping from index.sense file in corpus directory""" - fn = "index.sense" - if version: - from nltk.corpus import CorpusReader, LazyCorpusLoader - - ixreader = LazyCorpusLoader(version, CorpusReader, r".*/" + fn) - else: - ixreader = self - with ixreader.open(fn) as fp: - sensekey_map = {} - for line in fp: - fields = line.strip().split() - sensekey = fields[0] - pos = self._pos_names[int(sensekey.split("%")[1].split(":")[0])] - sensekey_map[sensekey] = f"{fields[1]}-{pos}" - return sensekey_map - - def map_to_many(self): - sensekey_map1 = self.index_sense("wordnet") - sensekey_map2 = self.index_sense() - synset_to_many = {} - for synsetid in set(sensekey_map1.values()): - synset_to_many[synsetid] = [] - for sensekey in set(sensekey_map1.keys()).intersection( - set(sensekey_map2.keys()) - ): - source = sensekey_map1[sensekey] - target = sensekey_map2[sensekey] - synset_to_many[source].append(target) - return synset_to_many - - def map_to_one(self): - synset_to_many = self.map_to_many() - synset_to_one = {} - for source in synset_to_many: - candidates_bag = synset_to_many[source] - if candidates_bag: - candidates_set = set(candidates_bag) - if len(candidates_set) == 1: - target = candidates_bag[0] - else: - counts = [] - for candidate in candidates_set: - counts.append((candidates_bag.count(candidate), candidate)) - self.splits[source] = counts - target = max(counts)[1] - synset_to_one[source] = target - if source[-1] == "s": - # Add a mapping from "a" to target for applications like omw, - # where only Lithuanian and Slovak use the "s" ss_type. 
- synset_to_one[f"{source[:-1]}a"] = target - else: - self.nomap.append(source) - return synset_to_one - - def map_wn30(self): - """Mapping from Wordnet 3.0 to currently loaded Wordnet version""" - if self.get_version() == "3.0": - return None - else: - return self.map_to_one() - - # Open Multilingual WordNet functions, contributed by - # Nasruddin A’aidil Shari, Sim Wei Ying Geraldine, and Soe Lynn - - def of2ss(self, of): - """take an id and return the synsets""" - return self.synset_from_pos_and_offset(of[-1], int(of[:8])) - - def ss2of(self, ss): - """return the ID of the synset""" - if ss: - return f"{ss.offset():08d}-{ss.pos()}" - - def _load_lang_data(self, lang): - """load the wordnet data of the requested language from the file to - the cache, _lang_data""" - - if lang in self._lang_data: - return - - if self._omw_reader and not self.omw_langs: - self.add_omw() - - if lang not in self.langs(): - raise WordNetError("Language is not supported.") - - if self._exomw_reader and lang not in self.omw_langs: - reader = self._exomw_reader - else: - reader = self._omw_reader - - prov = self.provenances[lang] - if prov in ["cldr", "wikt"]: - prov2 = prov - else: - prov2 = "data" - - with reader.open(f"{prov}/wn-{prov2}-{lang.split('_')[0]}.tab") as fp: - self.custom_lemmas(fp, lang) - self.disable_custom_lemmas(lang) - - def add_provs(self, reader): - """Add languages from Multilingual Wordnet to the provenance dictionary""" - fileids = reader.fileids() - for fileid in fileids: - prov, langfile = os.path.split(fileid) - file_name, file_extension = os.path.splitext(langfile) - if file_extension == ".tab": - lang = file_name.split("-")[-1] - if lang in self.provenances or prov in ["cldr", "wikt"]: - # We already have another resource for this lang, - # so we need to further specify the lang id: - lang = f"{lang}_{prov}" - self.provenances[lang] = prov - - def add_omw(self): - self.add_provs(self._omw_reader) - self.omw_langs = set(self.provenances.keys()) - - def add_exomw(self): - """ - Add languages from Extended OMW - - >>> import nltk - >>> from nltk.corpus import wordnet as wn - >>> wn.add_exomw() - >>> print(wn.synset('intrinsically.r.01').lemmas(lang="eng_wikt")) - [Lemma('intrinsically.r.01.per_se'), Lemma('intrinsically.r.01.as_such')] - """ - from nltk.corpus import extended_omw - - self.add_omw() - self._exomw_reader = extended_omw - self.add_provs(self._exomw_reader) - - def langs(self): - """return a list of languages supported by Multilingual Wordnet""" - return list(self.provenances.keys()) - - def _load_lemma_pos_offset_map(self): - for suffix in self._FILEMAP.values(): - - # parse each line of the file (ignoring comment lines) - with self.open("index.%s" % suffix) as fp: - for i, line in enumerate(fp): - if line.startswith(" "): - continue - - _iter = iter(line.split()) - - def _next_token(): - return next(_iter) - - try: - - # get the lemma and part-of-speech - lemma = _next_token() - pos = _next_token() - - # get the number of synsets for this lemma - n_synsets = int(_next_token()) - assert n_synsets > 0 - - # get and ignore the pointer symbols for all synsets of - # this lemma - n_pointers = int(_next_token()) - [_next_token() for _ in range(n_pointers)] - - # same as number of synsets - n_senses = int(_next_token()) - assert n_synsets == n_senses - - # get and ignore number of senses ranked according to - # frequency - _next_token() - - # get synset offsets - synset_offsets = [int(_next_token()) for _ in range(n_synsets)] - - # raise more informative error with file name 
and line number - except (AssertionError, ValueError) as e: - tup = ("index.%s" % suffix), (i + 1), e - raise WordNetError("file %s, line %i: %s" % tup) from e - - # map lemmas and parts of speech to synsets - self._lemma_pos_offset_map[lemma][pos] = synset_offsets - if pos == ADJ: - self._lemma_pos_offset_map[lemma][ADJ_SAT] = synset_offsets - - def _load_exception_map(self): - # load the exception file data into memory - for pos, suffix in self._FILEMAP.items(): - self._exception_map[pos] = {} - with self.open("%s.exc" % suffix) as fp: - for line in fp: - terms = line.split() - self._exception_map[pos][terms[0]] = terms[1:] - self._exception_map[ADJ_SAT] = self._exception_map[ADJ] - - def _compute_max_depth(self, pos, simulate_root): - """ - Compute the max depth for the given part of speech. This is - used by the lch similarity metric. - """ - depth = 0 - for ii in self.all_synsets(pos): - try: - depth = max(depth, ii.max_depth()) - except RuntimeError: - print(ii) - if simulate_root: - depth += 1 - self._max_depth[pos] = depth - - def get_version(self): - fh = self._data_file(ADJ) - fh.seek(0) - for line in fh: - match = re.search(r"Word[nN]et (\d+|\d+\.\d+) Copyright", line) - if match is not None: - version = match.group(1) - fh.seek(0) - return version - - ############################################################# - # Loading Lemmas - ############################################################# - - def lemma(self, name, lang="eng"): - """Return lemma object that matches the name""" - # cannot simply split on first '.', - # e.g.: '.45_caliber.a.01..45_caliber' - separator = SENSENUM_RE.search(name).end() - - synset_name, lemma_name = name[: separator - 1], name[separator:] - - synset = self.synset(synset_name) - for lemma in synset.lemmas(lang): - if lemma._name == lemma_name: - return lemma - raise WordNetError(f"No lemma {lemma_name!r} in {synset_name!r}") - - def lemma_from_key(self, key): - # Keys are case sensitive and always lower-case - key = key.lower() - - lemma_name, lex_sense = key.split("%") - pos_number, lexname_index, lex_id, _, _ = lex_sense.split(":") - pos = self._pos_names[int(pos_number)] - - # open the key -> synset file if necessary - if self._key_synset_file is None: - self._key_synset_file = self.open("index.sense") - - # Find the synset for the lemma. 
- synset_line = _binary_search_file(self._key_synset_file, key) - if not synset_line: - raise WordNetError("No synset found for key %r" % key) - offset = int(synset_line.split()[1]) - synset = self.synset_from_pos_and_offset(pos, offset) - # return the corresponding lemma - for lemma in synset._lemmas: - if lemma._key == key: - return lemma - raise WordNetError("No lemma found for for key %r" % key) - - ############################################################# - # Loading Synsets - ############################################################# - def synset(self, name): - # split name into lemma, part of speech and synset number - lemma, pos, synset_index_str = name.lower().rsplit(".", 2) - synset_index = int(synset_index_str) - 1 - - # get the offset for this synset - try: - offset = self._lemma_pos_offset_map[lemma][pos][synset_index] - except KeyError as e: - raise WordNetError(f"No lemma {lemma!r} with part of speech {pos!r}") from e - except IndexError as e: - n_senses = len(self._lemma_pos_offset_map[lemma][pos]) - raise WordNetError( - f"Lemma {lemma!r} with part of speech {pos!r} only " - f"has {n_senses} {'sense' if n_senses == 1 else 'senses'}" - ) from e - - # load synset information from the appropriate file - synset = self.synset_from_pos_and_offset(pos, offset) - - # some basic sanity checks on loaded attributes - if pos == "s" and synset._pos == "a": - message = ( - "Adjective satellite requested but only plain " - "adjective found for lemma %r" - ) - raise WordNetError(message % lemma) - assert synset._pos == pos or (pos == "a" and synset._pos == "s") - - # Return the synset object. - return synset - - def _data_file(self, pos): - """ - Return an open file pointer for the data file for the given - part of speech. - """ - if pos == ADJ_SAT: - pos = ADJ - if self._data_file_map.get(pos) is None: - fileid = "data.%s" % self._FILEMAP[pos] - self._data_file_map[pos] = self.open(fileid) - return self._data_file_map[pos] - - def synset_from_pos_and_offset(self, pos, offset): - """ - - pos: The synset's part of speech, matching one of the module level - attributes ADJ, ADJ_SAT, ADV, NOUN or VERB ('a', 's', 'r', 'n', or 'v'). - - offset: The byte offset of this synset in the WordNet dict file - for this pos. 
- - >>> from nltk.corpus import wordnet as wn - >>> print(wn.synset_from_pos_and_offset('n', 1740)) - Synset('entity.n.01') - """ - # Check to see if the synset is in the cache - if offset in self._synset_offset_cache[pos]: - return self._synset_offset_cache[pos][offset] - - data_file = self._data_file(pos) - data_file.seek(offset) - data_file_line = data_file.readline() - # If valid, the offset equals the 8-digit 0-padded integer found at the start of the line: - line_offset = data_file_line[:8] - if ( - line_offset.isalnum() - and line_offset == f"{'0'*(8-len(str(offset)))}{str(offset)}" - ): - synset = self._synset_from_pos_and_line(pos, data_file_line) - assert synset._offset == offset - self._synset_offset_cache[pos][offset] = synset - else: - synset = None - warnings.warn(f"No WordNet synset found for pos={pos} at offset={offset}.") - data_file.seek(0) - return synset - - @deprecated("Use public method synset_from_pos_and_offset() instead") - def _synset_from_pos_and_offset(self, *args, **kwargs): - """ - Hack to help people like the readers of - https://stackoverflow.com/a/27145655/1709587 - who were using this function before it was officially a public method - """ - return self.synset_from_pos_and_offset(*args, **kwargs) - - def _synset_from_pos_and_line(self, pos, data_file_line): - # Construct a new (empty) synset. - synset = Synset(self) - - # parse the entry for this synset - try: - - # parse out the definitions and examples from the gloss - columns_str, gloss = data_file_line.strip().split("|") - definition = re.sub(r"[\"].*?[\"]", "", gloss).strip() - examples = re.findall(r'"([^"]*)"', gloss) - for example in examples: - synset._examples.append(example) - - synset._definition = definition.strip("; ") - - # split the other info into fields - _iter = iter(columns_str.split()) - - def _next_token(): - return next(_iter) - - # get the offset - synset._offset = int(_next_token()) - - # determine the lexicographer file name - lexname_index = int(_next_token()) - synset._lexname = self._lexnames[lexname_index] - - # get the part of speech - synset._pos = _next_token() - - # create Lemma objects for each lemma - n_lemmas = int(_next_token(), 16) - for _ in range(n_lemmas): - # get the lemma name - lemma_name = _next_token() - # get the lex_id (used for sense_keys) - lex_id = int(_next_token(), 16) - # If the lemma has a syntactic marker, extract it. 
- m = re.match(r"(.*?)(\(.*\))?$", lemma_name) - lemma_name, syn_mark = m.groups() - # create the lemma object - lemma = Lemma(self, synset, lemma_name, lexname_index, lex_id, syn_mark) - synset._lemmas.append(lemma) - synset._lemma_names.append(lemma._name) - - # collect the pointer tuples - n_pointers = int(_next_token()) - for _ in range(n_pointers): - symbol = _next_token() - offset = int(_next_token()) - pos = _next_token() - lemma_ids_str = _next_token() - if lemma_ids_str == "0000": - synset._pointers[symbol].add((pos, offset)) - else: - source_index = int(lemma_ids_str[:2], 16) - 1 - target_index = int(lemma_ids_str[2:], 16) - 1 - source_lemma_name = synset._lemmas[source_index]._name - lemma_pointers = synset._lemma_pointers - tups = lemma_pointers[source_lemma_name, symbol] - tups.append((pos, offset, target_index)) - - # read the verb frames - try: - frame_count = int(_next_token()) - except StopIteration: - pass - else: - for _ in range(frame_count): - # read the plus sign - plus = _next_token() - assert plus == "+" - # read the frame and lemma number - frame_number = int(_next_token()) - frame_string_fmt = VERB_FRAME_STRINGS[frame_number] - lemma_number = int(_next_token(), 16) - # lemma number of 00 means all words in the synset - if lemma_number == 0: - synset._frame_ids.append(frame_number) - for lemma in synset._lemmas: - lemma._frame_ids.append(frame_number) - lemma._frame_strings.append(frame_string_fmt % lemma._name) - # only a specific word in the synset - else: - lemma = synset._lemmas[lemma_number - 1] - lemma._frame_ids.append(frame_number) - lemma._frame_strings.append(frame_string_fmt % lemma._name) - - # raise a more informative error with line text - except ValueError as e: - raise WordNetError(f"line {data_file_line!r}: {e}") from e - - # set sense keys for Lemma objects - note that this has to be - # done afterwards so that the relations are available - for lemma in synset._lemmas: - if synset._pos == ADJ_SAT: - head_lemma = synset.similar_tos()[0]._lemmas[0] - head_name = head_lemma._name - head_id = "%02d" % head_lemma._lex_id - else: - head_name = head_id = "" - tup = ( - lemma._name, - WordNetCorpusReader._pos_numbers[synset._pos], - lemma._lexname_index, - lemma._lex_id, - head_name, - head_id, - ) - lemma._key = ("%s%%%d:%02d:%02d:%s:%s" % tup).lower() - - # the canonical name is based on the first lemma - lemma_name = synset._lemmas[0]._name.lower() - offsets = self._lemma_pos_offset_map[lemma_name][synset._pos] - sense_index = offsets.index(synset._offset) - tup = lemma_name, synset._pos, sense_index + 1 - synset._name = "%s.%s.%02i" % tup - - return synset - - def synset_from_sense_key(self, sense_key): - """ - Retrieves synset based on a given sense_key. Sense keys can be - obtained from lemma.key() - - From https://wordnet.princeton.edu/documentation/senseidx5wn: - A sense_key is represented as:: - - lemma % lex_sense (e.g. 
'dog%1:18:01::') - - where lex_sense is encoded as:: - - ss_type:lex_filenum:lex_id:head_word:head_id - - :lemma: ASCII text of word/collocation, in lower case - :ss_type: synset type for the sense (1 digit int) - The synset type is encoded as follows:: - - 1 NOUN - 2 VERB - 3 ADJECTIVE - 4 ADVERB - 5 ADJECTIVE SATELLITE - :lex_filenum: name of lexicographer file containing the synset for the sense (2 digit int) - :lex_id: when paired with lemma, uniquely identifies a sense in the lexicographer file (2 digit int) - :head_word: lemma of the first word in satellite's head synset - Only used if sense is in an adjective satellite synset - :head_id: uniquely identifies sense in a lexicographer file when paired with head_word - Only used if head_word is present (2 digit int) - - >>> import nltk - >>> from nltk.corpus import wordnet as wn - >>> print(wn.synset_from_sense_key("drive%1:04:03::")) - Synset('drive.n.06') - - >>> print(wn.synset_from_sense_key("driving%1:04:03::")) - Synset('drive.n.06') - """ - return self.lemma_from_key(sense_key).synset() - - ############################################################# - # Retrieve synsets and lemmas. - ############################################################# - - def synsets(self, lemma, pos=None, lang="eng", check_exceptions=True): - """Load all synsets with a given lemma and part of speech tag. - If no pos is specified, all synsets for all parts of speech - will be loaded. - If lang is specified, all the synsets associated with the lemma name - of that language will be returned. - """ - lemma = lemma.lower() - - if lang == "eng": - get_synset = self.synset_from_pos_and_offset - index = self._lemma_pos_offset_map - if pos is None: - pos = POS_LIST - return [ - get_synset(p, offset) - for p in pos - for form in self._morphy(lemma, p, check_exceptions) - for offset in index[form].get(p, []) - ] - - else: - self._load_lang_data(lang) - synset_list = [] - if lemma in self._lang_data[lang][1]: - for l in self._lang_data[lang][1][lemma]: - if pos is not None and l[-1] != pos: - continue - synset_list.append(self.of2ss(l)) - return synset_list - - def lemmas(self, lemma, pos=None, lang="eng"): - """Return all Lemma objects with a name matching the specified lemma - name and part of speech tag. Matches any part of speech tag if none is - specified.""" - - lemma = lemma.lower() - if lang == "eng": - return [ - lemma_obj - for synset in self.synsets(lemma, pos) - for lemma_obj in synset.lemmas() - if lemma_obj.name().lower() == lemma - ] - - else: - self._load_lang_data(lang) - lemmas = [] - syn = self.synsets(lemma, lang=lang) - for s in syn: - if pos is not None and s.pos() != pos: - continue - for lemma_obj in s.lemmas(lang=lang): - if lemma_obj.name().lower() == lemma: - lemmas.append(lemma_obj) - return lemmas - - def all_lemma_names(self, pos=None, lang="eng"): - """Return all lemma names for all synsets for the given - part of speech tag and language or languages. 
If pos is - not specified, all synsets for all parts of speech will - be used.""" - - if lang == "eng": - if pos is None: - return iter(self._lemma_pos_offset_map) - else: - return ( - lemma - for lemma in self._lemma_pos_offset_map - if pos in self._lemma_pos_offset_map[lemma] - ) - else: - self._load_lang_data(lang) - lemma = [] - for i in self._lang_data[lang][0]: - if pos is not None and i[-1] != pos: - continue - lemma.extend(self._lang_data[lang][0][i]) - - lemma = iter(set(lemma)) - return lemma - - def all_omw_synsets(self, pos=None, lang=None): - if lang not in self.langs(): - return None - self._load_lang_data(lang) - for of in self._lang_data[lang][0]: - if not pos or of[-1] == pos: - ss = self.of2ss(of) - if ss: - yield ss - - # else: - # A few OMW offsets don't exist in Wordnet 3.0. - # warnings.warn(f"Language {lang}: no synset found for {of}") - - def all_synsets(self, pos=None, lang="eng"): - """Iterate over all synsets with a given part of speech tag. - If no pos is specified, all synsets for all parts of speech - will be loaded. - """ - if lang == "eng": - return self.all_eng_synsets(pos=pos) - else: - return self.all_omw_synsets(pos=pos, lang=lang) - - def all_eng_synsets(self, pos=None): - if pos is None: - pos_tags = self._FILEMAP.keys() - else: - pos_tags = [pos] - - cache = self._synset_offset_cache - from_pos_and_line = self._synset_from_pos_and_line - - # generate all synsets for each part of speech - for pos_tag in pos_tags: - # Open the file for reading. Note that we can not re-use - # the file pointers from self._data_file_map here, because - # we're defining an iterator, and those file pointers might - # be moved while we're not looking. - if pos_tag == ADJ_SAT: - pos_file = ADJ - else: - pos_file = pos_tag - fileid = "data.%s" % self._FILEMAP[pos_file] - data_file = self.open(fileid) - - try: - # generate synsets for each line in the POS file - offset = data_file.tell() - line = data_file.readline() - while line: - if not line[0].isspace(): - if offset in cache[pos_tag]: - # See if the synset is cached - synset = cache[pos_tag][offset] - else: - # Otherwise, parse the line - synset = from_pos_and_line(pos_tag, line) - cache[pos_tag][offset] = synset - - # adjective satellites are in the same file as - # adjectives so only yield the synset if it's actually - # a satellite - if pos_tag == ADJ_SAT and synset._pos == ADJ_SAT: - yield synset - # for all other POS tags, yield all synsets (this means - # that adjectives also include adjective satellites) - elif pos_tag != ADJ_SAT: - yield synset - offset = data_file.tell() - line = data_file.readline() - - # close the extra file handle we opened - except: - data_file.close() - raise - else: - data_file.close() - - def words(self, lang="eng"): - """return lemmas of the given language as list of words""" - return self.all_lemma_names(lang=lang) - - def synonyms(self, word, lang="eng"): - """return nested list with the synonyms of the different senses of word in the given language""" - return [ - sorted(list(set(ss.lemma_names(lang=lang)) - {word})) - for ss in self.synsets(word, lang=lang) - ] - - def doc(self, file="README", lang="eng"): - """Return the contents of readme, license or citation file - use lang=lang to get the file for an individual language""" - if lang == "eng": - reader = self - else: - reader = self._omw_reader - if lang in self.langs(): - file = f"{os.path.join(self.provenances[lang],file)}" - try: - with reader.open(file) as fp: - return fp.read() - except: - if lang in self._lang_data: - return 
f"Cannot determine {file} for {lang}" - else: - return f"Language {lang} is not supported." - - def license(self, lang="eng"): - """Return the contents of LICENSE (for omw) - use lang=lang to get the license for an individual language""" - return self.doc(file="LICENSE", lang=lang) - - def readme(self, lang="eng"): - """Return the contents of README (for omw) - use lang=lang to get the readme for an individual language""" - return self.doc(file="README", lang=lang) - - def citation(self, lang="eng"): - """Return the contents of citation.bib file (for omw) - use lang=lang to get the citation for an individual language""" - return self.doc(file="citation.bib", lang=lang) - - ############################################################# - # Misc - ############################################################# - def lemma_count(self, lemma): - """Return the frequency count for this Lemma""" - # Currently, count is only work for English - if lemma._lang != "eng": - return 0 - # open the count file if we haven't already - if self._key_count_file is None: - self._key_count_file = self.open("cntlist.rev") - # find the key in the counts file and return the count - line = _binary_search_file(self._key_count_file, lemma._key) - if line: - return int(line.rsplit(" ", 1)[-1]) - else: - return 0 - - def path_similarity(self, synset1, synset2, verbose=False, simulate_root=True): - return synset1.path_similarity(synset2, verbose, simulate_root) - - path_similarity.__doc__ = Synset.path_similarity.__doc__ - - def lch_similarity(self, synset1, synset2, verbose=False, simulate_root=True): - return synset1.lch_similarity(synset2, verbose, simulate_root) - - lch_similarity.__doc__ = Synset.lch_similarity.__doc__ - - def wup_similarity(self, synset1, synset2, verbose=False, simulate_root=True): - return synset1.wup_similarity(synset2, verbose, simulate_root) - - wup_similarity.__doc__ = Synset.wup_similarity.__doc__ - - def res_similarity(self, synset1, synset2, ic, verbose=False): - return synset1.res_similarity(synset2, ic, verbose) - - res_similarity.__doc__ = Synset.res_similarity.__doc__ - - def jcn_similarity(self, synset1, synset2, ic, verbose=False): - return synset1.jcn_similarity(synset2, ic, verbose) - - jcn_similarity.__doc__ = Synset.jcn_similarity.__doc__ - - def lin_similarity(self, synset1, synset2, ic, verbose=False): - return synset1.lin_similarity(synset2, ic, verbose) - - lin_similarity.__doc__ = Synset.lin_similarity.__doc__ - - ############################################################# - # Morphy - ############################################################# - # Morphy, adapted from Oliver Steele's pywordnet - def morphy(self, form, pos=None, check_exceptions=True): - """ - Find a possible base form for the given form, with the given - part of speech, by checking WordNet's list of exceptional - forms, and by recursively stripping affixes for this part of - speech until a form in WordNet is found. 
- - >>> from nltk.corpus import wordnet as wn - >>> print(wn.morphy('dogs')) - dog - >>> print(wn.morphy('churches')) - church - >>> print(wn.morphy('aardwolves')) - aardwolf - >>> print(wn.morphy('abaci')) - abacus - >>> wn.morphy('hardrock', wn.ADV) - >>> print(wn.morphy('book', wn.NOUN)) - book - >>> wn.morphy('book', wn.ADJ) - """ - - if pos is None: - morphy = self._morphy - analyses = chain(a for p in POS_LIST for a in morphy(form, p)) - else: - analyses = self._morphy(form, pos, check_exceptions) - - # get the first one we find - first = list(islice(analyses, 1)) - if len(first) == 1: - return first[0] - else: - return None - - MORPHOLOGICAL_SUBSTITUTIONS = { - NOUN: [ - ("s", ""), - ("ses", "s"), - ("ves", "f"), - ("xes", "x"), - ("zes", "z"), - ("ches", "ch"), - ("shes", "sh"), - ("men", "man"), - ("ies", "y"), - ], - VERB: [ - ("s", ""), - ("ies", "y"), - ("es", "e"), - ("es", ""), - ("ed", "e"), - ("ed", ""), - ("ing", "e"), - ("ing", ""), - ], - ADJ: [("er", ""), ("est", ""), ("er", "e"), ("est", "e")], - ADV: [], - } - - MORPHOLOGICAL_SUBSTITUTIONS[ADJ_SAT] = MORPHOLOGICAL_SUBSTITUTIONS[ADJ] - - def _morphy(self, form, pos, check_exceptions=True): - # from jordanbg: - # Given an original string x - # 1. Apply rules once to the input to get y1, y2, y3, etc. - # 2. Return all that are in the database - # 3. If there are no matches, keep applying rules until you either - # find a match or you can't go any further - - exceptions = self._exception_map[pos] - substitutions = self.MORPHOLOGICAL_SUBSTITUTIONS[pos] - - def apply_rules(forms): - return [ - form[: -len(old)] + new - for form in forms - for old, new in substitutions - if form.endswith(old) - ] - - def filter_forms(forms): - result = [] - seen = set() - for form in forms: - if form in self._lemma_pos_offset_map: - if pos in self._lemma_pos_offset_map[form]: - if form not in seen: - result.append(form) - seen.add(form) - return result - - # 0. Check the exception lists - if check_exceptions: - if form in exceptions: - return filter_forms([form] + exceptions[form]) - - # 1. Apply rules once to the input to get y1, y2, y3, etc. - forms = apply_rules([form]) - - # 2. Return all that are in the database (and check the original too) - results = filter_forms([form] + forms) - if results: - return results - - # 3. If there are no matches, keep applying rules until we find a match - while forms: - forms = apply_rules(forms) - results = filter_forms(forms) - if results: - return results - - # Return an empty list if we can't find anything - return [] - - ############################################################# - # Create information content from corpus - ############################################################# - def ic(self, corpus, weight_senses_equally=False, smoothing=1.0): - """ - Creates an information content lookup dictionary from a corpus. - - :type corpus: CorpusReader - :param corpus: The corpus from which we create an information - content dictionary. - :type weight_senses_equally: bool - :param weight_senses_equally: If this is True, gives all - possible senses equal weight rather than dividing by the - number of possible senses. (If a word has 3 synses, each - sense gets 0.3333 per appearance when this is False, 1.0 when - it is true.) 
- :param smoothing: How much do we smooth synset counts (default is 1.0) - :type smoothing: float - :return: An information content dictionary - """ - counts = FreqDist() - for ww in corpus.words(): - counts[ww] += 1 - - ic = {} - for pp in POS_LIST: - ic[pp] = defaultdict(float) - - # Initialize the counts with the smoothing value - if smoothing > 0.0: - for pp in POS_LIST: - ic[pp][0] = smoothing - for ss in self.all_synsets(): - pos = ss._pos - if pos == ADJ_SAT: - pos = ADJ - ic[pos][ss._offset] = smoothing - - for ww in counts: - possible_synsets = self.synsets(ww) - if len(possible_synsets) == 0: - continue - - # Distribute weight among possible synsets - weight = float(counts[ww]) - if not weight_senses_equally: - weight /= float(len(possible_synsets)) - - for ss in possible_synsets: - pos = ss._pos - if pos == ADJ_SAT: - pos = ADJ - for level in ss._iter_hypernym_lists(): - for hh in level: - ic[pos][hh._offset] += weight - # Add the weight to the root - ic[pos][0] += weight - return ic - - def custom_lemmas(self, tab_file, lang): - """ - Reads a custom tab file containing mappings of lemmas in the given - language to Princeton WordNet 3.0 synset offsets, allowing NLTK's - WordNet functions to then be used with that language. - - See the "Tab files" section at https://omwn.org/omw1.html for - documentation on the Multilingual WordNet tab file format. - - :param tab_file: Tab file as a file or file-like object - :type: lang str - :param: lang ISO 639-3 code of the language of the tab file - """ - lg = lang.split("_")[0] - if len(lg) != 3: - raise ValueError("lang should be a (3 character) ISO 639-3 code") - self._lang_data[lang] = [ - defaultdict(list), - defaultdict(list), - defaultdict(list), - defaultdict(list), - ] - for line in tab_file.readlines(): - if isinstance(line, bytes): - # Support byte-stream files (e.g. 
as returned by Python 2's - # open() function) as well as text-stream ones - line = line.decode("utf-8") - if not line.startswith("#"): - triple = line.strip().split("\t") - if len(triple) < 3: - continue - offset_pos, label = triple[:2] - val = triple[-1] - if self.map30: - if offset_pos in self.map30: - # Map offset_pos to current Wordnet version: - offset_pos = self.map30[offset_pos] - else: - # Some OMW offsets were never in Wordnet: - if ( - offset_pos not in self.nomap - and offset_pos.replace("a", "s") not in self.nomap - ): - warnings.warn( - f"{lang}: invalid offset {offset_pos} in '{line}'" - ) - continue - elif offset_pos[-1] == "a": - wnss = self.of2ss(offset_pos) - if wnss and wnss.pos() == "s": # Wordnet pos is "s" - # Label OMW adjective satellites back to their Wordnet pos ("s") - offset_pos = self.ss2of(wnss) - pair = label.split(":") - attr = pair[-1] - if len(pair) == 1 or pair[0] == lg: - if attr == "lemma": - val = val.strip().replace(" ", "_") - self._lang_data[lang][1][val.lower()].append(offset_pos) - if attr in self.lg_attrs: - self._lang_data[lang][self.lg_attrs.index(attr)][ - offset_pos - ].append(val) - - def disable_custom_lemmas(self, lang): - """prevent synsets from being mistakenly added""" - for n in range(len(self.lg_attrs)): - self._lang_data[lang][n].default_factory = None - - ###################################################################### - # Visualize WordNet relation graphs using Graphviz - ###################################################################### - - def digraph( - self, - inputs, - rel=lambda s: s.hypernyms(), - pos=None, - maxdepth=-1, - shapes=None, - attr=None, - verbose=False, - ): - """ - Produce a graphical representation from 'inputs' (a list of - start nodes, which can be a mix of Synsets, Lemmas and/or words), - and a synset relation, for drawing with the 'dot' graph visualisation - program from the Graphviz package. - - Return a string in the DOT graph file language, which can then be - converted to an image by nltk.parse.dependencygraph.dot2img(dot_string). 
- - Optional Parameters: - :rel: Wordnet synset relation - :pos: for words, restricts Part of Speech to 'n', 'v', 'a' or 'r' - :maxdepth: limit the longest path - :shapes: dictionary of strings that trigger a specified shape - :attr: dictionary with global graph attributes - :verbose: warn about cycles - - >>> from nltk.corpus import wordnet as wn - >>> print(wn.digraph([wn.synset('dog.n.01')])) - digraph G { - "Synset('animal.n.01')" -> "Synset('organism.n.01')"; - "Synset('canine.n.02')" -> "Synset('carnivore.n.01')"; - "Synset('carnivore.n.01')" -> "Synset('placental.n.01')"; - "Synset('chordate.n.01')" -> "Synset('animal.n.01')"; - "Synset('dog.n.01')" -> "Synset('canine.n.02')"; - "Synset('dog.n.01')" -> "Synset('domestic_animal.n.01')"; - "Synset('domestic_animal.n.01')" -> "Synset('animal.n.01')"; - "Synset('living_thing.n.01')" -> "Synset('whole.n.02')"; - "Synset('mammal.n.01')" -> "Synset('vertebrate.n.01')"; - "Synset('object.n.01')" -> "Synset('physical_entity.n.01')"; - "Synset('organism.n.01')" -> "Synset('living_thing.n.01')"; - "Synset('physical_entity.n.01')" -> "Synset('entity.n.01')"; - "Synset('placental.n.01')" -> "Synset('mammal.n.01')"; - "Synset('vertebrate.n.01')" -> "Synset('chordate.n.01')"; - "Synset('whole.n.02')" -> "Synset('object.n.01')"; - } - - """ - from nltk.util import edge_closure, edges2dot - - synsets = set() - edges = set() - if not shapes: - shapes = dict() - if not attr: - attr = dict() - - def add_lemma(lem): - ss = lem.synset() - synsets.add(ss) - edges.add((lem, ss)) - - for node in inputs: - typ = type(node) - if typ == Synset: - synsets.add(node) - elif typ == Lemma: - add_lemma(node) - elif typ == str: - for lemma in self.lemmas(node, pos): - add_lemma(lemma) - - for ss in synsets: - edges = edges.union(edge_closure(ss, rel, maxdepth, verbose)) - dot_string = edges2dot(sorted(list(edges)), shapes=shapes, attr=attr) - return dot_string - - -###################################################################### -# WordNet Information Content Corpus Reader -###################################################################### - - -class WordNetICCorpusReader(CorpusReader): - """ - A corpus reader for the WordNet information content corpus. - """ - - def __init__(self, root, fileids): - CorpusReader.__init__(self, root, fileids, encoding="utf8") - - # this load function would be more efficient if the data was pickled - # Note that we can't use NLTK's frequency distributions because - # synsets are overlapping (each instance of a synset also counts - # as an instance of its hypernyms) - def ic(self, icfile): - """ - Load an information content file from the wordnet_ic corpus - and return a dictionary. This dictionary has just two keys, - NOUN and VERB, whose values are dictionaries that map from - synsets to information content values. - - :type icfile: str - :param icfile: The name of the wordnet_ic file (e.g. "ic-brown.dat") - :return: An information content dictionary - """ - ic = {} - ic[NOUN] = defaultdict(float) - ic[VERB] = defaultdict(float) - with self.open(icfile) as fp: - for num, line in enumerate(fp): - if num == 0: # skip the header - continue - fields = line.split() - offset = int(fields[0][:-1]) - value = float(fields[1]) - pos = _get_pos(fields[0]) - if len(fields) == 3 and fields[2] == "ROOT": - # Store root count. 
- ic[pos][0] += value - if value != 0: - ic[pos][offset] = value - return ic - - -###################################################################### -# Similarity metrics -###################################################################### - -# TODO: Add in the option to manually add a new root node; this will be -# useful for verb similarity as there exist multiple verb taxonomies. - -# More information about the metrics is available at -# http://marimba.d.umn.edu/similarity/measures.html - - -def path_similarity(synset1, synset2, verbose=False, simulate_root=True): - return synset1.path_similarity( - synset2, verbose=verbose, simulate_root=simulate_root - ) - - -def lch_similarity(synset1, synset2, verbose=False, simulate_root=True): - return synset1.lch_similarity(synset2, verbose=verbose, simulate_root=simulate_root) - - -def wup_similarity(synset1, synset2, verbose=False, simulate_root=True): - return synset1.wup_similarity(synset2, verbose=verbose, simulate_root=simulate_root) - - -def res_similarity(synset1, synset2, ic, verbose=False): - return synset1.res_similarity(synset2, ic, verbose=verbose) - - -def jcn_similarity(synset1, synset2, ic, verbose=False): - return synset1.jcn_similarity(synset2, ic, verbose=verbose) - - -def lin_similarity(synset1, synset2, ic, verbose=False): - return synset1.lin_similarity(synset2, ic, verbose=verbose) - - -path_similarity.__doc__ = Synset.path_similarity.__doc__ -lch_similarity.__doc__ = Synset.lch_similarity.__doc__ -wup_similarity.__doc__ = Synset.wup_similarity.__doc__ -res_similarity.__doc__ = Synset.res_similarity.__doc__ -jcn_similarity.__doc__ = Synset.jcn_similarity.__doc__ -lin_similarity.__doc__ = Synset.lin_similarity.__doc__ - - -def _lcs_ic(synset1, synset2, ic, verbose=False): - """ - Get the information content of the least common subsumer that has - the highest information content value. If two nodes have no - explicit common subsumer, assume that they share an artificial - root node that is the hypernym of all explicit roots. - - :type synset1: Synset - :param synset1: First input synset. - :type synset2: Synset - :param synset2: Second input synset. Must be the same part of - speech as the first synset. - :type ic: dict - :param ic: an information content object (as returned by ``load_ic()``). - :return: The information content of the two synsets and their most - informative subsumer - """ - if synset1._pos != synset2._pos: - raise WordNetError( - "Computing the least common subsumer requires " - "%s and %s to have the same part of speech." 
% (synset1, synset2) - ) - - ic1 = information_content(synset1, ic) - ic2 = information_content(synset2, ic) - subsumers = synset1.common_hypernyms(synset2) - if len(subsumers) == 0: - subsumer_ic = 0 - else: - subsumer_ic = max(information_content(s, ic) for s in subsumers) - - if verbose: - print("> LCS Subsumer by content:", subsumer_ic) - - return ic1, ic2, subsumer_ic - - -# Utility functions - - -def information_content(synset, ic): - pos = synset._pos - if pos == ADJ_SAT: - pos = ADJ - try: - icpos = ic[pos] - except KeyError as e: - msg = "Information content file has no entries for part-of-speech: %s" - raise WordNetError(msg % pos) from e - - counts = icpos[synset._offset] - if counts == 0: - return _INF - else: - return -math.log(counts / icpos[0]) - - -# get the part of speech (NOUN or VERB) from the information content record -# (each identifier has a 'n' or 'v' suffix) - - -def _get_pos(field): - if field[-1] == "n": - return NOUN - elif field[-1] == "v": - return VERB - else: - msg = ( - "Unidentified part of speech in WordNet Information Content file " - "for field %s" % field - ) - raise ValueError(msg) diff --git a/pipeline/nltk/corpus/reader/xmldocs.py b/pipeline/nltk/corpus/reader/xmldocs.py deleted file mode 100644 index 1a9b3d001e0e31120ff1a7df266bb4c82b8de360..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/xmldocs.py +++ /dev/null @@ -1,397 +0,0 @@ -# Natural Language Toolkit: XML Corpus Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# URL: -# For license information, see LICENSE.TXT - -""" -Corpus reader for corpora whose documents are xml files. - -(note -- not named 'xml' to avoid conflicting w/ standard xml package) -""" - -import codecs -from xml.etree import ElementTree - -from nltk.corpus.reader.api import CorpusReader -from nltk.corpus.reader.util import * -from nltk.data import SeekableUnicodeStreamReader -from nltk.internals import ElementWrapper -from nltk.tokenize import WordPunctTokenizer - - -class XMLCorpusReader(CorpusReader): - """ - Corpus reader for corpora whose documents are xml files. - - Note that the ``XMLCorpusReader`` constructor does not take an - ``encoding`` argument, because the unicode encoding is specified by - the XML files themselves. See the XML specs for more info. - """ - - def __init__(self, root, fileids, wrap_etree=False): - self._wrap_etree = wrap_etree - CorpusReader.__init__(self, root, fileids) - - def xml(self, fileid=None): - # Make sure we have exactly one file -- no concatenating XML. - if fileid is None and len(self._fileids) == 1: - fileid = self._fileids[0] - if not isinstance(fileid, str): - raise TypeError("Expected a single file identifier string") - # Read the XML in using ElementTree. - with self.abspath(fileid).open() as fp: - elt = ElementTree.parse(fp).getroot() - # If requested, wrap it. - if self._wrap_etree: - elt = ElementWrapper(elt) - # Return the ElementTree element. - return elt - - def words(self, fileid=None): - """ - Returns all of the words and punctuation symbols in the specified file - that were in text nodes -- ie, tags are ignored. Like the xml() method, - fileid can only specify one file. 
- - :return: the given file's text nodes as a list of words and punctuation symbols - :rtype: list(str) - """ - - elt = self.xml(fileid) - encoding = self.encoding(fileid) - word_tokenizer = WordPunctTokenizer() - try: - iterator = elt.getiterator() - except: - iterator = elt.iter() - out = [] - - for node in iterator: - text = node.text - if text is not None: - if isinstance(text, bytes): - text = text.decode(encoding) - toks = word_tokenizer.tokenize(text) - out.extend(toks) - return out - - -class XMLCorpusView(StreamBackedCorpusView): - """ - A corpus view that selects out specified elements from an XML - file, and provides a flat list-like interface for accessing them. - (Note: ``XMLCorpusView`` is not used by ``XMLCorpusReader`` itself, - but may be used by subclasses of ``XMLCorpusReader``.) - - Every XML corpus view has a "tag specification", indicating what - XML elements should be included in the view; and each (non-nested) - element that matches this specification corresponds to one item in - the view. Tag specifications are regular expressions over tag - paths, where a tag path is a list of element tag names, separated - by '/', indicating the ancestry of the element. Some examples: - - - ``'foo'``: A top-level element whose tag is ``foo``. - - ``'foo/bar'``: An element whose tag is ``bar`` and whose parent - is a top-level element whose tag is ``foo``. - - ``'.*/foo'``: An element whose tag is ``foo``, appearing anywhere - in the xml tree. - - ``'.*/(foo|bar)'``: An wlement whose tag is ``foo`` or ``bar``, - appearing anywhere in the xml tree. - - The view items are generated from the selected XML elements via - the method ``handle_elt()``. By default, this method returns the - element as-is (i.e., as an ElementTree object); but it can be - overridden, either via subclassing or via the ``elt_handler`` - constructor parameter. - """ - - #: If true, then display debugging output to stdout when reading - #: blocks. - _DEBUG = False - - #: The number of characters read at a time by this corpus reader. - _BLOCK_SIZE = 1024 - - def __init__(self, fileid, tagspec, elt_handler=None): - """ - Create a new corpus view based on a specified XML file. - - Note that the ``XMLCorpusView`` constructor does not take an - ``encoding`` argument, because the unicode encoding is - specified by the XML files themselves. - - :type tagspec: str - :param tagspec: A tag specification, indicating what XML - elements should be included in the view. Each non-nested - element that matches this specification corresponds to one - item in the view. - - :param elt_handler: A function used to transform each element - to a value for the view. If no handler is specified, then - ``self.handle_elt()`` is called, which returns the element - as an ElementTree object. The signature of elt_handler is:: - - elt_handler(elt, tagspec) -> value - """ - if elt_handler: - self.handle_elt = elt_handler - - self._tagspec = re.compile(tagspec + r"\Z") - """The tag specification for this corpus view.""" - - self._tag_context = {0: ()} - """A dictionary mapping from file positions (as returned by - ``stream.seek()`` to XML contexts. 
An XML context is a - tuple of XML tag names, indicating which tags have not yet - been closed.""" - - encoding = self._detect_encoding(fileid) - StreamBackedCorpusView.__init__(self, fileid, encoding=encoding) - - def _detect_encoding(self, fileid): - if isinstance(fileid, PathPointer): - try: - infile = fileid.open() - s = infile.readline() - finally: - infile.close() - else: - with open(fileid, "rb") as infile: - s = infile.readline() - if s.startswith(codecs.BOM_UTF16_BE): - return "utf-16-be" - if s.startswith(codecs.BOM_UTF16_LE): - return "utf-16-le" - if s.startswith(codecs.BOM_UTF32_BE): - return "utf-32-be" - if s.startswith(codecs.BOM_UTF32_LE): - return "utf-32-le" - if s.startswith(codecs.BOM_UTF8): - return "utf-8" - m = re.match(rb'\s*<\?xml\b.*\bencoding="([^"]+)"', s) - if m: - return m.group(1).decode() - m = re.match(rb"\s*<\?xml\b.*\bencoding='([^']+)'", s) - if m: - return m.group(1).decode() - # No encoding found -- what should the default be? - return "utf-8" - - def handle_elt(self, elt, context): - """ - Convert an element into an appropriate value for inclusion in - the view. Unless overridden by a subclass or by the - ``elt_handler`` constructor argument, this method simply - returns ``elt``. - - :return: The view value corresponding to ``elt``. - - :type elt: ElementTree - :param elt: The element that should be converted. - - :type context: str - :param context: A string composed of element tags separated by - forward slashes, indicating the XML context of the given - element. For example, the string ``'foo/bar/baz'`` - indicates that the element is a ``baz`` element whose - parent is a ``bar`` element and whose grandparent is a - top-level ``foo`` element. - """ - return elt - - #: A regular expression that matches XML fragments that do not - #: contain any un-closed tags. - _VALID_XML_RE = re.compile( - r""" - [^<]* - ( - (() | # comment - () | # doctype decl - (<[^!>][^>]*>)) # tag or PI - [^<]*)* - \Z""", - re.DOTALL | re.VERBOSE, - ) - - #: A regular expression used to extract the tag name from a start tag, - #: end tag, or empty-elt tag string. - _XML_TAG_NAME = re.compile(r"<\s*(?:/\s*)?([^\s>]+)") - - #: A regular expression used to find all start-tags, end-tags, and - #: empty-elt tags in an XML file. This regexp is more lenient than - #: the XML spec -- e.g., it allows spaces in some places where the - #: spec does not. - _XML_PIECE = re.compile( - r""" - # Include these so we can skip them: - (?P )| - (?P )| - (?P <\?.*?\?> )| - (?P ]*(\[[^\]]*])?\s*>)| - # These are the ones we actually care about: - (?P <\s*[^>/\?!\s][^>]*/\s*> )| - (?P <\s*[^>/\?!\s][^>]*> )| - (?P <\s*/[^>/\?!\s][^>]*> )""", - re.DOTALL | re.VERBOSE, - ) - - def _read_xml_fragment(self, stream): - """ - Read a string from the given stream that does not contain any - un-closed tags. In particular, this function first reads a - block from the stream of size ``self._BLOCK_SIZE``. It then - checks if that block contains an un-closed tag. If it does, - then this function either backtracks to the last '<', or reads - another block. - """ - fragment = "" - - if isinstance(stream, SeekableUnicodeStreamReader): - startpos = stream.tell() - while True: - # Read a block and add it to the fragment. - xml_block = stream.read(self._BLOCK_SIZE) - fragment += xml_block - - # Do we have a well-formed xml fragment? - if self._VALID_XML_RE.match(fragment): - return fragment - - # Do we have a fragment that will never be well-formed? 
- if re.search("[<>]", fragment).group(0) == ">": - pos = stream.tell() - ( - len(fragment) - re.search("[<>]", fragment).end() - ) - raise ValueError('Unexpected ">" near char %s' % pos) - - # End of file? - if not xml_block: - raise ValueError("Unexpected end of file: tag not closed") - - # If not, then we must be in the middle of a <..tag..>. - # If appropriate, backtrack to the most recent '<' - # character. - last_open_bracket = fragment.rfind("<") - if last_open_bracket > 0: - if self._VALID_XML_RE.match(fragment[:last_open_bracket]): - if isinstance(stream, SeekableUnicodeStreamReader): - stream.seek(startpos) - stream.char_seek_forward(last_open_bracket) - else: - stream.seek(-(len(fragment) - last_open_bracket), 1) - return fragment[:last_open_bracket] - - # Otherwise, read another block. (i.e., return to the - # top of the loop.) - - def read_block(self, stream, tagspec=None, elt_handler=None): - """ - Read from ``stream`` until we find at least one element that - matches ``tagspec``, and return the result of applying - ``elt_handler`` to each element found. - """ - if tagspec is None: - tagspec = self._tagspec - if elt_handler is None: - elt_handler = self.handle_elt - - # Use a stack of strings to keep track of our context: - context = list(self._tag_context.get(stream.tell())) - assert context is not None # check this -- could it ever happen? - - elts = [] - - elt_start = None # where does the elt start - elt_depth = None # what context depth - elt_text = "" - - while elts == [] or elt_start is not None: - if isinstance(stream, SeekableUnicodeStreamReader): - startpos = stream.tell() - xml_fragment = self._read_xml_fragment(stream) - - # End of file. - if not xml_fragment: - if elt_start is None: - break - else: - raise ValueError("Unexpected end of file") - - # Process each in the xml fragment. - for piece in self._XML_PIECE.finditer(xml_fragment): - if self._DEBUG: - print("{:>25} {}".format("/".join(context)[-20:], piece.group())) - - if piece.group("START_TAG"): - name = self._XML_TAG_NAME.match(piece.group()).group(1) - # Keep context up-to-date. - context.append(name) - # Is this one of the elts we're looking for? - if elt_start is None: - if re.match(tagspec, "/".join(context)): - elt_start = piece.start() - elt_depth = len(context) - - elif piece.group("END_TAG"): - name = self._XML_TAG_NAME.match(piece.group()).group(1) - # sanity checks: - if not context: - raise ValueError("Unmatched tag " % name) - if name != context[-1]: - raise ValueError(f"Unmatched tag <{context[-1]}>...") - # Is this the end of an element? - if elt_start is not None and elt_depth == len(context): - elt_text += xml_fragment[elt_start : piece.end()] - elts.append((elt_text, "/".join(context))) - elt_start = elt_depth = None - elt_text = "" - # Keep context up-to-date - context.pop() - - elif piece.group("EMPTY_ELT_TAG"): - name = self._XML_TAG_NAME.match(piece.group()).group(1) - if elt_start is None: - if re.match(tagspec, "/".join(context) + "/" + name): - elts.append((piece.group(), "/".join(context) + "/" + name)) - - if elt_start is not None: - # If we haven't found any elements yet, then keep - # looping until we do. - if elts == []: - elt_text += xml_fragment[elt_start:] - elt_start = 0 - - # If we've found at least one element, then try - # backtracking to the start of the element that we're - # inside of. - else: - # take back the last start-tag, and return what - # we've gotten so far (elts is non-empty). 
- if self._DEBUG: - print(" " * 36 + "(backtrack)") - if isinstance(stream, SeekableUnicodeStreamReader): - stream.seek(startpos) - stream.char_seek_forward(elt_start) - else: - stream.seek(-(len(xml_fragment) - elt_start), 1) - context = context[: elt_depth - 1] - elt_start = elt_depth = None - elt_text = "" - - # Update the _tag_context dict. - pos = stream.tell() - if pos in self._tag_context: - assert tuple(context) == self._tag_context[pos] - else: - self._tag_context[pos] = tuple(context) - - return [ - elt_handler( - ElementTree.fromstring(elt.encode("ascii", "xmlcharrefreplace")), - context, - ) - for (elt, context) in elts - ] diff --git a/pipeline/nltk/corpus/reader/ycoe.py b/pipeline/nltk/corpus/reader/ycoe.py deleted file mode 100644 index 35bafdfef4f12f934de8e5e4617341fb2ba7b7a8..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/reader/ycoe.py +++ /dev/null @@ -1,256 +0,0 @@ -# Natural Language Toolkit: York-Toronto-Helsinki Parsed Corpus of Old English Prose (YCOE) -# -# Copyright (C) 2001-2015 NLTK Project -# Author: Selina Dennis -# URL: -# For license information, see LICENSE.TXT - -""" -Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old -English Prose (YCOE), a 1.5 million word syntactically-annotated -corpus of Old English prose texts. The corpus is distributed by the -Oxford Text Archive: http://www.ota.ahds.ac.uk/ It is not included -with NLTK. - -The YCOE corpus is divided into 100 files, each representing -an Old English prose text. Tags used within each text complies -to the YCOE standard: https://www-users.york.ac.uk/~lang22/YCOE/YcoeHome.htm -""" - -import os -import re - -from nltk.corpus.reader.api import * -from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader -from nltk.corpus.reader.tagged import TaggedCorpusReader -from nltk.corpus.reader.util import * -from nltk.tokenize import RegexpTokenizer - - -class YCOECorpusReader(CorpusReader): - """ - Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old - English Prose (YCOE), a 1.5 million word syntactically-annotated - corpus of Old English prose texts. - """ - - def __init__(self, root, encoding="utf8"): - CorpusReader.__init__(self, root, [], encoding) - - self._psd_reader = YCOEParseCorpusReader( - self.root.join("psd"), ".*", ".psd", encoding=encoding - ) - self._pos_reader = YCOETaggedCorpusReader(self.root.join("pos"), ".*", ".pos") - - # Make sure we have a consistent set of items: - documents = {f[:-4] for f in self._psd_reader.fileids()} - if {f[:-4] for f in self._pos_reader.fileids()} != documents: - raise ValueError('Items in "psd" and "pos" ' "subdirectories do not match.") - - fileids = sorted( - ["%s.psd" % doc for doc in documents] - + ["%s.pos" % doc for doc in documents] - ) - CorpusReader.__init__(self, root, fileids, encoding) - self._documents = sorted(documents) - - def documents(self, fileids=None): - """ - Return a list of document identifiers for all documents in - this corpus, or for the documents with the given file(s) if - specified. - """ - if fileids is None: - return self._documents - if isinstance(fileids, str): - fileids = [fileids] - for f in fileids: - if f not in self._fileids: - raise KeyError("File id %s not found" % fileids) - # Strip off the '.pos' and '.psd' extensions. - return sorted({f[:-4] for f in fileids}) - - def fileids(self, documents=None): - """ - Return a list of file identifiers for the files that make up - this corpus, or that store the given document(s) if specified. 
- """ - if documents is None: - return self._fileids - elif isinstance(documents, str): - documents = [documents] - return sorted( - set( - ["%s.pos" % doc for doc in documents] - + ["%s.psd" % doc for doc in documents] - ) - ) - - def _getfileids(self, documents, subcorpus): - """ - Helper that selects the appropriate fileids for a given set of - documents from a given subcorpus (pos or psd). - """ - if documents is None: - documents = self._documents - else: - if isinstance(documents, str): - documents = [documents] - for document in documents: - if document not in self._documents: - if document[-4:] in (".pos", ".psd"): - raise ValueError( - "Expected a document identifier, not a file " - "identifier. (Use corpus.documents() to get " - "a list of document identifiers." - ) - else: - raise ValueError("Document identifier %s not found" % document) - return [f"{d}.{subcorpus}" for d in documents] - - # Delegate to one of our two sub-readers: - def words(self, documents=None): - return self._pos_reader.words(self._getfileids(documents, "pos")) - - def sents(self, documents=None): - return self._pos_reader.sents(self._getfileids(documents, "pos")) - - def paras(self, documents=None): - return self._pos_reader.paras(self._getfileids(documents, "pos")) - - def tagged_words(self, documents=None): - return self._pos_reader.tagged_words(self._getfileids(documents, "pos")) - - def tagged_sents(self, documents=None): - return self._pos_reader.tagged_sents(self._getfileids(documents, "pos")) - - def tagged_paras(self, documents=None): - return self._pos_reader.tagged_paras(self._getfileids(documents, "pos")) - - def parsed_sents(self, documents=None): - return self._psd_reader.parsed_sents(self._getfileids(documents, "psd")) - - -class YCOEParseCorpusReader(BracketParseCorpusReader): - """Specialized version of the standard bracket parse corpus reader - that strips out (CODE ...) and (ID ...) nodes.""" - - def _parse(self, t): - t = re.sub(r"(?u)\((CODE|ID)[^\)]*\)", "", t) - if re.match(r"\s*\(\s*\)\s*$", t): - return None - return BracketParseCorpusReader._parse(self, t) - - -class YCOETaggedCorpusReader(TaggedCorpusReader): - def __init__(self, root, items, encoding="utf8"): - gaps_re = r"(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*" - sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True) - TaggedCorpusReader.__init__( - self, root, items, sep="_", sent_tokenizer=sent_tokenizer - ) - - -#: A list of all documents and their titles in ycoe. 
-documents = { - "coadrian.o34": "Adrian and Ritheus", - "coaelhom.o3": "Ælfric, Supplemental Homilies", - "coaelive.o3": "Ælfric's Lives of Saints", - "coalcuin": "Alcuin De virtutibus et vitiis", - "coalex.o23": "Alexander's Letter to Aristotle", - "coapollo.o3": "Apollonius of Tyre", - "coaugust": "Augustine", - "cobede.o2": "Bede's History of the English Church", - "cobenrul.o3": "Benedictine Rule", - "coblick.o23": "Blickling Homilies", - "coboeth.o2": "Boethius' Consolation of Philosophy", - "cobyrhtf.o3": "Byrhtferth's Manual", - "cocanedgD": "Canons of Edgar (D)", - "cocanedgX": "Canons of Edgar (X)", - "cocathom1.o3": "Ælfric's Catholic Homilies I", - "cocathom2.o3": "Ælfric's Catholic Homilies II", - "cochad.o24": "Saint Chad", - "cochdrul": "Chrodegang of Metz, Rule", - "cochristoph": "Saint Christopher", - "cochronA.o23": "Anglo-Saxon Chronicle A", - "cochronC": "Anglo-Saxon Chronicle C", - "cochronD": "Anglo-Saxon Chronicle D", - "cochronE.o34": "Anglo-Saxon Chronicle E", - "cocura.o2": "Cura Pastoralis", - "cocuraC": "Cura Pastoralis (Cotton)", - "codicts.o34": "Dicts of Cato", - "codocu1.o1": "Documents 1 (O1)", - "codocu2.o12": "Documents 2 (O1/O2)", - "codocu2.o2": "Documents 2 (O2)", - "codocu3.o23": "Documents 3 (O2/O3)", - "codocu3.o3": "Documents 3 (O3)", - "codocu4.o24": "Documents 4 (O2/O4)", - "coeluc1": "Honorius of Autun, Elucidarium 1", - "coeluc2": "Honorius of Autun, Elucidarium 1", - "coepigen.o3": "Ælfric's Epilogue to Genesis", - "coeuphr": "Saint Euphrosyne", - "coeust": "Saint Eustace and his companions", - "coexodusP": "Exodus (P)", - "cogenesiC": "Genesis (C)", - "cogregdC.o24": "Gregory's Dialogues (C)", - "cogregdH.o23": "Gregory's Dialogues (H)", - "coherbar": "Pseudo-Apuleius, Herbarium", - "coinspolD.o34": "Wulfstan's Institute of Polity (D)", - "coinspolX": "Wulfstan's Institute of Polity (X)", - "cojames": "Saint James", - "colacnu.o23": "Lacnunga", - "colaece.o2": "Leechdoms", - "colaw1cn.o3": "Laws, Cnut I", - "colaw2cn.o3": "Laws, Cnut II", - "colaw5atr.o3": "Laws, Æthelred V", - "colaw6atr.o3": "Laws, Æthelred VI", - "colawaf.o2": "Laws, Alfred", - "colawafint.o2": "Alfred's Introduction to Laws", - "colawger.o34": "Laws, Gerefa", - "colawine.ox2": "Laws, Ine", - "colawnorthu.o3": "Northumbra Preosta Lagu", - "colawwllad.o4": "Laws, William I, Lad", - "coleofri.o4": "Leofric", - "colsigef.o3": "Ælfric's Letter to Sigefyrth", - "colsigewB": "Ælfric's Letter to Sigeweard (B)", - "colsigewZ.o34": "Ælfric's Letter to Sigeweard (Z)", - "colwgeat": "Ælfric's Letter to Wulfgeat", - "colwsigeT": "Ælfric's Letter to Wulfsige (T)", - "colwsigeXa.o34": "Ælfric's Letter to Wulfsige (Xa)", - "colwstan1.o3": "Ælfric's Letter to Wulfstan I", - "colwstan2.o3": "Ælfric's Letter to Wulfstan II", - "comargaC.o34": "Saint Margaret (C)", - "comargaT": "Saint Margaret (T)", - "comart1": "Martyrology, I", - "comart2": "Martyrology, II", - "comart3.o23": "Martyrology, III", - "comarvel.o23": "Marvels of the East", - "comary": "Mary of Egypt", - "coneot": "Saint Neot", - "conicodA": "Gospel of Nicodemus (A)", - "conicodC": "Gospel of Nicodemus (C)", - "conicodD": "Gospel of Nicodemus (D)", - "conicodE": "Gospel of Nicodemus (E)", - "coorosiu.o2": "Orosius", - "cootest.o3": "Heptateuch", - "coprefcath1.o3": "Ælfric's Preface to Catholic Homilies I", - "coprefcath2.o3": "Ælfric's Preface to Catholic Homilies II", - "coprefcura.o2": "Preface to the Cura Pastoralis", - "coprefgen.o3": "Ælfric's Preface to Genesis", - "copreflives.o3": "Ælfric's Preface to Lives of 
Saints", - "coprefsolilo": "Preface to Augustine's Soliloquies", - "coquadru.o23": "Pseudo-Apuleius, Medicina de quadrupedibus", - "corood": "History of the Holy Rood-Tree", - "cosevensl": "Seven Sleepers", - "cosolilo": "St. Augustine's Soliloquies", - "cosolsat1.o4": "Solomon and Saturn I", - "cosolsat2": "Solomon and Saturn II", - "cotempo.o3": "Ælfric's De Temporibus Anni", - "coverhom": "Vercelli Homilies", - "coverhomE": "Vercelli Homilies (E)", - "coverhomL": "Vercelli Homilies (L)", - "covinceB": "Saint Vincent (Bodley 343)", - "covinsal": "Vindicta Salvatoris", - "cowsgosp.o3": "West-Saxon Gospels", - "cowulf.o34": "Wulfstan's Homilies", -} diff --git a/pipeline/nltk/corpus/util.py b/pipeline/nltk/corpus/util.py deleted file mode 100644 index 29a63574264c4859081ef8e36e26d9382f5b087f..0000000000000000000000000000000000000000 --- a/pipeline/nltk/corpus/util.py +++ /dev/null @@ -1,154 +0,0 @@ -# Natural Language Toolkit: Corpus Reader Utility Functions -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# URL: -# For license information, see LICENSE.TXT - -###################################################################### -# { Lazy Corpus Loader -###################################################################### - -import gc -import re - -import nltk - -TRY_ZIPFILE_FIRST = False - - -class LazyCorpusLoader: - """ - To see the API documentation for this lazily loaded corpus, first - run corpus.ensure_loaded(), and then run help(this_corpus). - - LazyCorpusLoader is a proxy object which is used to stand in for a - corpus object before the corpus is loaded. This allows NLTK to - create an object for each corpus, but defer the costs associated - with loading those corpora until the first time that they're - actually accessed. - - The first time this object is accessed in any way, it will load - the corresponding corpus, and transform itself into that corpus - (by modifying its own ``__class__`` and ``__dict__`` attributes). - - If the corpus can not be found, then accessing this object will - raise an exception, displaying installation instructions for the - NLTK data package. Once they've properly installed the data - package (or modified ``nltk.data.path`` to point to its location), - they can then use the corpus object without restarting python. - - :param name: The name of the corpus - :type name: str - :param reader_cls: The specific CorpusReader class, e.g. PlaintextCorpusReader, WordListCorpusReader - :type reader: nltk.corpus.reader.api.CorpusReader - :param nltk_data_subdir: The subdirectory where the corpus is stored. - :type nltk_data_subdir: str - :param `*args`: Any other non-keywords arguments that `reader_cls` might need. - :param `**kwargs`: Any other keywords arguments that `reader_cls` might need. - """ - - def __init__(self, name, reader_cls, *args, **kwargs): - from nltk.corpus.reader.api import CorpusReader - - assert issubclass(reader_cls, CorpusReader) - self.__name = self.__name__ = name - self.__reader_cls = reader_cls - # If nltk_data_subdir is set explicitly - if "nltk_data_subdir" in kwargs: - # Use the specified subdirectory path - self.subdir = kwargs["nltk_data_subdir"] - # Pops the `nltk_data_subdir` argument, we don't need it anymore. - kwargs.pop("nltk_data_subdir", None) - else: # Otherwise use 'nltk_data/corpora' - self.subdir = "corpora" - self.__args = args - self.__kwargs = kwargs - - def __load(self): - # Find the corpus root directory. 
- zip_name = re.sub(r"(([^/]+)(/.*)?)", r"\2.zip/\1/", self.__name) - if TRY_ZIPFILE_FIRST: - try: - root = nltk.data.find(f"{self.subdir}/{zip_name}") - except LookupError as e: - try: - root = nltk.data.find(f"{self.subdir}/{self.__name}") - except LookupError: - raise e - else: - try: - root = nltk.data.find(f"{self.subdir}/{self.__name}") - except LookupError as e: - try: - root = nltk.data.find(f"{self.subdir}/{zip_name}") - except LookupError: - raise e - - # Load the corpus. - corpus = self.__reader_cls(root, *self.__args, **self.__kwargs) - - # This is where the magic happens! Transform ourselves into - # the corpus by modifying our own __dict__ and __class__ to - # match that of the corpus. - - args, kwargs = self.__args, self.__kwargs - name, reader_cls = self.__name, self.__reader_cls - - self.__dict__ = corpus.__dict__ - self.__class__ = corpus.__class__ - - # _unload support: assign __dict__ and __class__ back, then do GC. - # after reassigning __dict__ there shouldn't be any references to - # corpus data so the memory should be deallocated after gc.collect() - def _unload(self): - lazy_reader = LazyCorpusLoader(name, reader_cls, *args, **kwargs) - self.__dict__ = lazy_reader.__dict__ - self.__class__ = lazy_reader.__class__ - gc.collect() - - self._unload = _make_bound_method(_unload, self) - - def __getattr__(self, attr): - - # Fix for inspect.isclass under Python 2.6 - # (see https://bugs.python.org/issue1225107). - # Without this fix tests may take extra 1.5GB RAM - # because all corpora gets loaded during test collection. - if attr == "__bases__": - raise AttributeError("LazyCorpusLoader object has no attribute '__bases__'") - - self.__load() - # This looks circular, but its not, since __load() changes our - # __class__ to something new: - return getattr(self, attr) - - def __repr__(self): - return "<{} in {!r} (not loaded yet)>".format( - self.__reader_cls.__name__, - ".../corpora/" + self.__name, - ) - - def _unload(self): - # If an exception occurs during corpus loading then - # '_unload' method may be unattached, so __getattr__ can be called; - # we shouldn't trigger corpus loading again in this case. - pass - - -def _make_bound_method(func, self): - """ - Magic for creating bound methods (used for _unload). - """ - - class Foo: - def meth(self): - pass - - f = Foo() - bound_method = type(f.meth) - - try: - return bound_method(func, self, self.__class__) - except TypeError: # python3 - return bound_method(func, self) diff --git a/pipeline/nltk/data.py b/pipeline/nltk/data.py deleted file mode 100644 index fed75d2bfbf2953a2ecc61d1d5a24244f5749be6..0000000000000000000000000000000000000000 --- a/pipeline/nltk/data.py +++ /dev/null @@ -1,1441 +0,0 @@ -# Natural Language Toolkit: Utility functions -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# URL: -# For license information, see LICENSE.TXT - -""" -Functions to find and load NLTK resource files, such as corpora, -grammars, and saved processing objects. Resource files are identified -using URLs, such as ``nltk:corpora/abc/rural.txt`` or -``https://raw.githubusercontent.com/nltk/nltk/develop/nltk/test/toy.cfg``. -The following URL protocols are supported: - - - ``file:path``: Specifies the file whose path is *path*. - Both relative and absolute paths may be used. - - - ``https://host/path``: Specifies the file stored on the web - server *host* at path *path*. - - - ``nltk:path``: Specifies the file stored in the NLTK data - package at *path*. 
NLTK will search for these files in the - directories specified by ``nltk.data.path``. - -If no protocol is specified, then the default protocol ``nltk:`` will -be used. - -This module provides to functions that can be used to access a -resource file, given its URL: ``load()`` loads a given resource, and -adds it to a resource cache; and ``retrieve()`` copies a given resource -to a local file. -""" - -import codecs -import functools -import os -import pickle -import re -import sys -import textwrap -import zipfile -from abc import ABCMeta, abstractmethod -from gzip import WRITE as GZ_WRITE -from gzip import GzipFile -from io import BytesIO, TextIOWrapper -from urllib.request import url2pathname, urlopen - -try: - from zlib import Z_SYNC_FLUSH as FLUSH -except ImportError: - from zlib import Z_FINISH as FLUSH - -from nltk import grammar, sem -from nltk.compat import add_py3_data, py3_data -from nltk.internals import deprecated - -textwrap_indent = functools.partial(textwrap.indent, prefix=" ") - -###################################################################### -# Search Path -###################################################################### - -path = [] -"""A list of directories where the NLTK data package might reside. - These directories will be checked in order when looking for a - resource in the data package. Note that this allows users to - substitute in their own versions of resources, if they have them - (e.g., in their home directory under ~/nltk_data).""" - -# User-specified locations: -_paths_from_env = os.environ.get("NLTK_DATA", "").split(os.pathsep) -path += [d for d in _paths_from_env if d] -if "APPENGINE_RUNTIME" not in os.environ and os.path.expanduser("~/") != "~/": - path.append(os.path.expanduser("~/nltk_data")) - -if sys.platform.startswith("win"): - # Common locations on Windows: - path += [ - os.path.join(sys.prefix, "nltk_data"), - os.path.join(sys.prefix, "share", "nltk_data"), - os.path.join(sys.prefix, "lib", "nltk_data"), - os.path.join(os.environ.get("APPDATA", "C:\\"), "nltk_data"), - r"C:\nltk_data", - r"D:\nltk_data", - r"E:\nltk_data", - ] -else: - # Common locations on UNIX & OS X: - path += [ - os.path.join(sys.prefix, "nltk_data"), - os.path.join(sys.prefix, "share", "nltk_data"), - os.path.join(sys.prefix, "lib", "nltk_data"), - "/usr/share/nltk_data", - "/usr/local/share/nltk_data", - "/usr/lib/nltk_data", - "/usr/local/lib/nltk_data", - ] - - -###################################################################### -# Util Functions -###################################################################### - - -def gzip_open_unicode( - filename, - mode="rb", - compresslevel=9, - encoding="utf-8", - fileobj=None, - errors=None, - newline=None, -): - if fileobj is None: - fileobj = GzipFile(filename, mode, compresslevel, fileobj) - return TextIOWrapper(fileobj, encoding, errors, newline) - - -def split_resource_url(resource_url): - """ - Splits a resource url into ":". 
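# A small sketch of the search-path behaviour documented above, assuming the
# "punkt" tokenizer models are installed; the project directory is hypothetical.
import nltk.data

# Directories from the NLTK_DATA environment variable come first, followed by
# the platform defaults assembled above (~/nltk_data, /usr/share/nltk_data, ...).
nltk.data.path.insert(0, "/srv/myproject/nltk_data")  # hypothetical extra location

# find() checks every entry in order and raises LookupError (with download
# instructions) if the resource is missing everywhere.
print(nltk.data.find("tokenizers/punkt/english.pickle"))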
- - >>> windows = sys.platform.startswith('win') - >>> split_resource_url('nltk:home/nltk') - ('nltk', 'home/nltk') - >>> split_resource_url('nltk:/home/nltk') - ('nltk', '/home/nltk') - >>> split_resource_url('file:/home/nltk') - ('file', '/home/nltk') - >>> split_resource_url('file:///home/nltk') - ('file', '/home/nltk') - >>> split_resource_url('file:///C:/home/nltk') - ('file', '/C:/home/nltk') - """ - protocol, path_ = resource_url.split(":", 1) - if protocol == "nltk": - pass - elif protocol == "file": - if path_.startswith("/"): - path_ = "/" + path_.lstrip("/") - else: - path_ = re.sub(r"^/{0,2}", "", path_) - return protocol, path_ - - -def normalize_resource_url(resource_url): - r""" - Normalizes a resource url - - >>> windows = sys.platform.startswith('win') - >>> os.path.normpath(split_resource_url(normalize_resource_url('file:grammar.fcfg'))[1]) == \ - ... ('\\' if windows else '') + os.path.abspath(os.path.join(os.curdir, 'grammar.fcfg')) - True - >>> not windows or normalize_resource_url('file:C:/dir/file') == 'file:///C:/dir/file' - True - >>> not windows or normalize_resource_url('file:C:\\dir\\file') == 'file:///C:/dir/file' - True - >>> not windows or normalize_resource_url('file:C:\\dir/file') == 'file:///C:/dir/file' - True - >>> not windows or normalize_resource_url('file://C:/dir/file') == 'file:///C:/dir/file' - True - >>> not windows or normalize_resource_url('file:////C:/dir/file') == 'file:///C:/dir/file' - True - >>> not windows or normalize_resource_url('nltk:C:/dir/file') == 'file:///C:/dir/file' - True - >>> not windows or normalize_resource_url('nltk:C:\\dir\\file') == 'file:///C:/dir/file' - True - >>> windows or normalize_resource_url('file:/dir/file/toy.cfg') == 'file:///dir/file/toy.cfg' - True - >>> normalize_resource_url('nltk:home/nltk') - 'nltk:home/nltk' - >>> windows or normalize_resource_url('nltk:/home/nltk') == 'file:///home/nltk' - True - >>> normalize_resource_url('https://example.com/dir/file') - 'https://example.com/dir/file' - >>> normalize_resource_url('dir/file') - 'nltk:dir/file' - """ - try: - protocol, name = split_resource_url(resource_url) - except ValueError: - # the resource url has no protocol, use the nltk protocol by default - protocol = "nltk" - name = resource_url - # use file protocol if the path is an absolute path - if protocol == "nltk" and os.path.isabs(name): - protocol = "file://" - name = normalize_resource_name(name, False, None) - elif protocol == "file": - protocol = "file://" - # name is absolute - name = normalize_resource_name(name, False, None) - elif protocol == "nltk": - protocol = "nltk:" - name = normalize_resource_name(name, True) - else: - # handled by urllib - protocol += "://" - return "".join([protocol, name]) - - -def normalize_resource_name(resource_name, allow_relative=True, relative_path=None): - """ - :type resource_name: str or unicode - :param resource_name: The name of the resource to search for. - Resource names are posix-style relative path names, such as - ``corpora/brown``. Directory names will automatically - be converted to a platform-appropriate path separator. 
- Directory trailing slashes are preserved - - >>> windows = sys.platform.startswith('win') - >>> normalize_resource_name('.', True) - './' - >>> normalize_resource_name('./', True) - './' - >>> windows or normalize_resource_name('dir/file', False, '/') == '/dir/file' - True - >>> not windows or normalize_resource_name('C:/file', False, '/') == '/C:/file' - True - >>> windows or normalize_resource_name('/dir/file', False, '/') == '/dir/file' - True - >>> windows or normalize_resource_name('../dir/file', False, '/') == '/dir/file' - True - >>> not windows or normalize_resource_name('/dir/file', True, '/') == 'dir/file' - True - >>> windows or normalize_resource_name('/dir/file', True, '/') == '/dir/file' - True - """ - is_dir = bool(re.search(r"[\\/.]$", resource_name)) or resource_name.endswith( - os.path.sep - ) - if sys.platform.startswith("win"): - resource_name = resource_name.lstrip("/") - else: - resource_name = re.sub(r"^/+", "/", resource_name) - if allow_relative: - resource_name = os.path.normpath(resource_name) - else: - if relative_path is None: - relative_path = os.curdir - resource_name = os.path.abspath(os.path.join(relative_path, resource_name)) - resource_name = resource_name.replace("\\", "/").replace(os.path.sep, "/") - if sys.platform.startswith("win") and os.path.isabs(resource_name): - resource_name = "/" + resource_name - if is_dir and not resource_name.endswith("/"): - resource_name += "/" - return resource_name - - -###################################################################### -# Path Pointers -###################################################################### - - -class PathPointer(metaclass=ABCMeta): - """ - An abstract base class for 'path pointers,' used by NLTK's data - package to identify specific paths. Two subclasses exist: - ``FileSystemPathPointer`` identifies a file that can be accessed - directly via a given absolute path. ``ZipFilePathPointer`` - identifies a file contained within a zipfile, that can be accessed - by reading that zipfile. - """ - - @abstractmethod - def open(self, encoding=None): - """ - Return a seekable read-only stream that can be used to read - the contents of the file identified by this path pointer. - - :raise IOError: If the path specified by this pointer does - not contain a readable file. - """ - - @abstractmethod - def file_size(self): - """ - Return the size of the file pointed to by this path pointer, - in bytes. - - :raise IOError: If the path specified by this pointer does - not contain a readable file. - """ - - @abstractmethod - def join(self, fileid): - """ - Return a new path pointer formed by starting at the path - identified by this pointer, and then following the relative - path given by ``fileid``. The path components of ``fileid`` - should be separated by forward slashes, regardless of - the underlying file system's path separator character. - """ - - -class FileSystemPathPointer(PathPointer, str): - """ - A path pointer that identifies a file which can be accessed - directly via a given absolute path. - """ - - @py3_data - def __init__(self, _path): - """ - Create a new path pointer for the given absolute path. - - :raise IOError: If the given path does not exist. - """ - - _path = os.path.abspath(_path) - if not os.path.exists(_path): - raise OSError("No such file or directory: %r" % _path) - self._path = _path - - # There's no need to call str.__init__(), since it's a no-op; - # str does all of its setup work in __new__. 
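# A brief sketch of the PathPointer API defined above, using a Unix path that
# is assumed to exist; join() follows a relative fileid with forward slashes.
from nltk.data import FileSystemPathPointer

root = FileSystemPathPointer("/etc")        # raises OSError if the path is missing
hosts = root.join("hosts")                  # -> FileSystemPathPointer('/etc/hosts')
print(hosts.file_size())                    # size in bytes, via os.stat()
stream = hosts.open(encoding="utf-8")       # SeekableUnicodeStreamReader
print(stream.readline())
stream.close()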
- - @property - def path(self): - """The absolute path identified by this path pointer.""" - return self._path - - def open(self, encoding=None): - stream = open(self._path, "rb") - if encoding is not None: - stream = SeekableUnicodeStreamReader(stream, encoding) - return stream - - def file_size(self): - return os.stat(self._path).st_size - - def join(self, fileid): - _path = os.path.join(self._path, fileid) - return FileSystemPathPointer(_path) - - def __repr__(self): - return "FileSystemPathPointer(%r)" % self._path - - def __str__(self): - return self._path - - -@deprecated("Use gzip.GzipFile instead as it also uses a buffer.") -class BufferedGzipFile(GzipFile): - """A ``GzipFile`` subclass for compatibility with older nltk releases. - - Use ``GzipFile`` directly as it also buffers in all supported - Python versions. - """ - - @py3_data - def __init__( - self, filename=None, mode=None, compresslevel=9, fileobj=None, **kwargs - ): - """Return a buffered gzip file object.""" - GzipFile.__init__(self, filename, mode, compresslevel, fileobj) - - def write(self, data): - # This is identical to GzipFile.write but does not return - # the bytes written to retain compatibility. - super().write(data) - - -class GzipFileSystemPathPointer(FileSystemPathPointer): - """ - A subclass of ``FileSystemPathPointer`` that identifies a gzip-compressed - file located at a given absolute path. ``GzipFileSystemPathPointer`` is - appropriate for loading large gzip-compressed pickle objects efficiently. - """ - - def open(self, encoding=None): - stream = GzipFile(self._path, "rb") - if encoding: - stream = SeekableUnicodeStreamReader(stream, encoding) - return stream - - -class ZipFilePathPointer(PathPointer): - """ - A path pointer that identifies a file contained within a zipfile, - which can be accessed by reading that zipfile. - """ - - @py3_data - def __init__(self, zipfile, entry=""): - """ - Create a new path pointer pointing at the specified entry - in the given zipfile. - - :raise IOError: If the given zipfile does not exist, or if it - does not contain the specified entry. - """ - if isinstance(zipfile, str): - zipfile = OpenOnDemandZipFile(os.path.abspath(zipfile)) - - # Check that the entry exists: - if entry: - - # Normalize the entry string, it should be relative: - entry = normalize_resource_name(entry, True, "/").lstrip("/") - - try: - zipfile.getinfo(entry) - except Exception as e: - # Sometimes directories aren't explicitly listed in - # the zip file. So if `entry` is a directory name, - # then check if the zipfile contains any files that - # are under the given directory. - if entry.endswith("/") and [ - n for n in zipfile.namelist() if n.startswith(entry) - ]: - pass # zipfile contains a file in that directory. - else: - # Otherwise, complain. - raise OSError( - f"Zipfile {zipfile.filename!r} does not contain {entry!r}" - ) from e - self._zipfile = zipfile - self._entry = entry - - @property - def zipfile(self): - """ - The zipfile.ZipFile object used to access the zip file - containing the entry identified by this path pointer. - """ - return self._zipfile - - @property - def entry(self): - """ - The name of the file within zipfile that this path - pointer points to. 
- """ - return self._entry - - def open(self, encoding=None): - data = self._zipfile.read(self._entry) - stream = BytesIO(data) - if self._entry.endswith(".gz"): - stream = GzipFile(self._entry, fileobj=stream) - elif encoding is not None: - stream = SeekableUnicodeStreamReader(stream, encoding) - return stream - - def file_size(self): - return self._zipfile.getinfo(self._entry).file_size - - def join(self, fileid): - entry = f"{self._entry}/{fileid}" - return ZipFilePathPointer(self._zipfile, entry) - - def __repr__(self): - return f"ZipFilePathPointer({self._zipfile.filename!r}, {self._entry!r})" - - def __str__(self): - return os.path.normpath(os.path.join(self._zipfile.filename, self._entry)) - - -###################################################################### -# Access Functions -###################################################################### - -# Don't use a weak dictionary, because in the common case this -# causes a lot more reloading that necessary. -_resource_cache = {} -"""A dictionary used to cache resources so that they won't - need to be loaded more than once.""" - - -def find(resource_name, paths=None): - """ - Find the given resource by searching through the directories and - zip files in paths, where a None or empty string specifies an absolute path. - Returns a corresponding path name. If the given resource is not - found, raise a ``LookupError``, whose message gives a pointer to - the installation instructions for the NLTK downloader. - - Zip File Handling: - - - If ``resource_name`` contains a component with a ``.zip`` - extension, then it is assumed to be a zipfile; and the - remaining path components are used to look inside the zipfile. - - - If any element of ``nltk.data.path`` has a ``.zip`` extension, - then it is assumed to be a zipfile. - - - If a given resource name that does not contain any zipfile - component is not found initially, then ``find()`` will make a - second attempt to find that resource, by replacing each - component *p* in the path with *p.zip/p*. For example, this - allows ``find()`` to map the resource name - ``corpora/chat80/cities.pl`` to a zip file path pointer to - ``corpora/chat80.zip/chat80/cities.pl``. - - - When using ``find()`` to locate a directory contained in a - zipfile, the resource name must end with the forward slash - character. Otherwise, ``find()`` will not locate the - directory. - - :type resource_name: str or unicode - :param resource_name: The name of the resource to search for. - Resource names are posix-style relative path names, such as - ``corpora/brown``. Directory names will be - automatically converted to a platform-appropriate path separator. - :rtype: str - """ - resource_name = normalize_resource_name(resource_name, True) - - # Resolve default paths at runtime in-case the user overrides - # nltk.data.path - if paths is None: - paths = path - - # Check if the resource name includes a zipfile name - m = re.match(r"(.*\.zip)/?(.*)$|", resource_name) - zipfile, zipentry = m.groups() - - # Check each item in our path - for path_ in paths: - # Is the path item a zipfile? - if path_ and (os.path.isfile(path_) and path_.endswith(".zip")): - try: - return ZipFilePathPointer(path_, resource_name) - except OSError: - # resource not in zipfile - continue - - # Is the path item a directory or is resource_name an absolute path? 
- elif not path_ or os.path.isdir(path_): - if zipfile is None: - p = os.path.join(path_, url2pathname(resource_name)) - if os.path.exists(p): - if p.endswith(".gz"): - return GzipFileSystemPathPointer(p) - else: - return FileSystemPathPointer(p) - else: - p = os.path.join(path_, url2pathname(zipfile)) - if os.path.exists(p): - try: - return ZipFilePathPointer(p, zipentry) - except OSError: - # resource not in zipfile - continue - - # Fallback: if the path doesn't include a zip file, then try - # again, assuming that one of the path components is inside a - # zipfile of the same name. - if zipfile is None: - pieces = resource_name.split("/") - for i in range(len(pieces)): - modified_name = "/".join(pieces[:i] + [pieces[i] + ".zip"] + pieces[i:]) - try: - return find(modified_name, paths) - except LookupError: - pass - - # Identify the package (i.e. the .zip file) to download. - resource_zipname = resource_name.split("/")[1] - if resource_zipname.endswith(".zip"): - resource_zipname = resource_zipname.rpartition(".")[0] - # Display a friendly error message if the resource wasn't found: - msg = str( - "Resource \33[93m{resource}\033[0m not found.\n" - "Please use the NLTK Downloader to obtain the resource:\n\n" - "\33[31m" # To display red text in terminal. - ">>> import nltk\n" - ">>> nltk.download('{resource}')\n" - "\033[0m" - ).format(resource=resource_zipname) - msg = textwrap_indent(msg) - - msg += "\n For more information see: https://www.nltk.org/data.html\n" - - msg += "\n Attempted to load \33[93m{resource_name}\033[0m\n".format( - resource_name=resource_name - ) - - msg += "\n Searched in:" + "".join("\n - %r" % d for d in paths) - sep = "*" * 70 - resource_not_found = f"\n{sep}\n{msg}\n{sep}\n" - raise LookupError(resource_not_found) - - -def retrieve(resource_url, filename=None, verbose=True): - """ - Copy the given resource to a local file. If no filename is - specified, then use the URL's filename. If there is already a - file named ``filename``, then raise a ``ValueError``. - - :type resource_url: str - :param resource_url: A URL specifying where the resource should be - loaded from. The default protocol is "nltk:", which searches - for the file in the the NLTK data package. - """ - resource_url = normalize_resource_url(resource_url) - if filename is None: - if resource_url.startswith("file:"): - filename = os.path.split(resource_url)[-1] - else: - filename = re.sub(r"(^\w+:)?.*/", "", resource_url) - if os.path.exists(filename): - filename = os.path.abspath(filename) - raise ValueError("File %r already exists!" % filename) - - if verbose: - print(f"Retrieving {resource_url!r}, saving to {filename!r}") - - # Open the input & output streams. - infile = _open(resource_url) - - # Copy infile -> outfile, using 64k blocks. - with open(filename, "wb") as outfile: - while True: - s = infile.read(1024 * 64) # 64k blocks. - outfile.write(s) - if not s: - break - - infile.close() - - -#: A dictionary describing the formats that are supported by NLTK's -#: load() method. Keys are format names, and values are format -#: descriptions. 
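# A minimal sketch of retrieve(), defined above; the destination filename is an
# arbitrary choice, and retrieve() raises ValueError if it already exists.
from nltk.data import retrieve

retrieve(
    "https://raw.githubusercontent.com/nltk/nltk/develop/nltk/test/toy.cfg",
    filename="toy.cfg",          # copied locally in 64k blocks
)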
-FORMATS = { - "pickle": "A serialized python object, stored using the pickle module.", - "json": "A serialized python object, stored using the json module.", - "yaml": "A serialized python object, stored using the yaml module.", - "cfg": "A context free grammar.", - "pcfg": "A probabilistic CFG.", - "fcfg": "A feature CFG.", - "fol": "A list of first order logic expressions, parsed with " - "nltk.sem.logic.Expression.fromstring.", - "logic": "A list of first order logic expressions, parsed with " - "nltk.sem.logic.LogicParser. Requires an additional logic_parser " - "parameter", - "val": "A semantic valuation, parsed by nltk.sem.Valuation.fromstring.", - "raw": "The raw (byte string) contents of a file.", - "text": "The raw (unicode string) contents of a file. ", -} - -#: A dictionary mapping from file extensions to format names, used -#: by load() when format="auto" to decide the format for a -#: given resource url. -AUTO_FORMATS = { - "pickle": "pickle", - "json": "json", - "yaml": "yaml", - "cfg": "cfg", - "pcfg": "pcfg", - "fcfg": "fcfg", - "fol": "fol", - "logic": "logic", - "val": "val", - "txt": "text", - "text": "text", -} - - -def load( - resource_url, - format="auto", - cache=True, - verbose=False, - logic_parser=None, - fstruct_reader=None, - encoding=None, -): - """ - Load a given resource from the NLTK data package. The following - resource formats are currently supported: - - - ``pickle`` - - ``json`` - - ``yaml`` - - ``cfg`` (context free grammars) - - ``pcfg`` (probabilistic CFGs) - - ``fcfg`` (feature-based CFGs) - - ``fol`` (formulas of First Order Logic) - - ``logic`` (Logical formulas to be parsed by the given logic_parser) - - ``val`` (valuation of First Order Logic model) - - ``text`` (the file contents as a unicode string) - - ``raw`` (the raw file contents as a byte string) - - If no format is specified, ``load()`` will attempt to determine a - format based on the resource name's file extension. If that - fails, ``load()`` will raise a ``ValueError`` exception. - - For all text formats (everything except ``pickle``, ``json``, ``yaml`` and ``raw``), - it tries to decode the raw contents using UTF-8, and if that doesn't - work, it tries with ISO-8859-1 (Latin-1), unless the ``encoding`` - is specified. - - :type resource_url: str - :param resource_url: A URL specifying where the resource should be - loaded from. The default protocol is "nltk:", which searches - for the file in the the NLTK data package. - :type cache: bool - :param cache: If true, add this resource to a cache. If load() - finds a resource in its cache, then it will return it from the - cache rather than loading it. - :type verbose: bool - :param verbose: If true, print a message when loading a resource. - Messages are not displayed when a resource is retrieved from - the cache. - :type logic_parser: LogicParser - :param logic_parser: The parser that will be used to parse logical - expressions. - :type fstruct_reader: FeatStructReader - :param fstruct_reader: The parser that will be used to parse the - feature structure of an fcfg. - :type encoding: str - :param encoding: the encoding of the input; only used for text formats. - """ - resource_url = normalize_resource_url(resource_url) - resource_url = add_py3_data(resource_url) - - # Determine the format of the resource. 
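# A short sketch of load() with the format tables above; the resource names
# assume the corresponding data packages (sample grammars, "abc" corpus) exist.
from nltk.data import load

grammar = load("grammars/sample_grammars/toy.cfg")    # ".cfg" -> a parsed CFG
text = load("corpora/abc/rural.txt", format="text")   # decoded unicode string
raw = load("corpora/abc/rural.txt", format="raw")     # undecoded bytes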
- if format == "auto": - resource_url_parts = resource_url.split(".") - ext = resource_url_parts[-1] - if ext == "gz": - ext = resource_url_parts[-2] - format = AUTO_FORMATS.get(ext) - if format is None: - raise ValueError( - "Could not determine format for %s based " - 'on its file\nextension; use the "format" ' - "argument to specify the format explicitly." % resource_url - ) - - if format not in FORMATS: - raise ValueError(f"Unknown format type: {format}!") - - # If we've cached the resource, then just return it. - if cache: - resource_val = _resource_cache.get((resource_url, format)) - if resource_val is not None: - if verbose: - print(f"<>") - return resource_val - - # Let the user know what's going on. - if verbose: - print(f"<>") - - # Load the resource. - opened_resource = _open(resource_url) - - if format == "raw": - resource_val = opened_resource.read() - elif format == "pickle": - resource_val = pickle.load(opened_resource) - elif format == "json": - import json - - from nltk.jsontags import json_tags - - resource_val = json.load(opened_resource) - tag = None - if len(resource_val) != 1: - tag = next(resource_val.keys()) - if tag not in json_tags: - raise ValueError("Unknown json tag.") - elif format == "yaml": - import yaml - - resource_val = yaml.safe_load(opened_resource) - else: - # The resource is a text format. - binary_data = opened_resource.read() - if encoding is not None: - string_data = binary_data.decode(encoding) - else: - try: - string_data = binary_data.decode("utf-8") - except UnicodeDecodeError: - string_data = binary_data.decode("latin-1") - if format == "text": - resource_val = string_data - elif format == "cfg": - resource_val = grammar.CFG.fromstring(string_data, encoding=encoding) - elif format == "pcfg": - resource_val = grammar.PCFG.fromstring(string_data, encoding=encoding) - elif format == "fcfg": - resource_val = grammar.FeatureGrammar.fromstring( - string_data, - logic_parser=logic_parser, - fstruct_reader=fstruct_reader, - encoding=encoding, - ) - elif format == "fol": - resource_val = sem.read_logic( - string_data, - logic_parser=sem.logic.LogicParser(), - encoding=encoding, - ) - elif format == "logic": - resource_val = sem.read_logic( - string_data, logic_parser=logic_parser, encoding=encoding - ) - elif format == "val": - resource_val = sem.read_valuation(string_data, encoding=encoding) - else: - raise AssertionError( - "Internal NLTK error: Format %s isn't " - "handled by nltk.data.load()" % (format,) - ) - - opened_resource.close() - - # If requested, add it to the cache. - if cache: - try: - _resource_cache[(resource_url, format)] = resource_val - # TODO: add this line - # print('<>' % (resource_url,)) - except TypeError: - # We can't create weak references to some object types, like - # strings and tuples. For now, just don't cache them. - pass - - return resource_val - - -def show_cfg(resource_url, escape="##"): - """ - Write out a grammar file, ignoring escaped and empty lines. - - :type resource_url: str - :param resource_url: A URL specifying where the resource should be - loaded from. The default protocol is "nltk:", which searches - for the file in the the NLTK data package. 
- :type escape: str - :param escape: Prepended string that signals lines to be ignored - """ - resource_url = normalize_resource_url(resource_url) - resource_val = load(resource_url, format="text", cache=False) - lines = resource_val.splitlines() - for l in lines: - if l.startswith(escape): - continue - if re.match("^$", l): - continue - print(l) - - -def clear_cache(): - """ - Remove all objects from the resource cache. - :see: load() - """ - _resource_cache.clear() - - -def _open(resource_url): - """ - Helper function that returns an open file object for a resource, - given its resource URL. If the given resource URL uses the "nltk:" - protocol, or uses no protocol, then use ``nltk.data.find`` to find - its path, and open it with the given mode; if the resource URL - uses the 'file' protocol, then open the file with the given mode; - otherwise, delegate to ``urllib2.urlopen``. - - :type resource_url: str - :param resource_url: A URL specifying where the resource should be - loaded from. The default protocol is "nltk:", which searches - for the file in the the NLTK data package. - """ - resource_url = normalize_resource_url(resource_url) - protocol, path_ = split_resource_url(resource_url) - - if protocol is None or protocol.lower() == "nltk": - return find(path_, path + [""]).open() - elif protocol.lower() == "file": - # urllib might not use mode='rb', so handle this one ourselves: - return find(path_, [""]).open() - else: - return urlopen(resource_url) - - -###################################################################### -# Lazy Resource Loader -###################################################################### - - -class LazyLoader: - @py3_data - def __init__(self, _path): - self._path = _path - - def __load(self): - resource = load(self._path) - # This is where the magic happens! Transform ourselves into - # the object by modifying our own __dict__ and __class__ to - # match that of `resource`. - self.__dict__ = resource.__dict__ - self.__class__ = resource.__class__ - - def __getattr__(self, attr): - self.__load() - # This looks circular, but its not, since __load() changes our - # __class__ to something new: - return getattr(self, attr) - - def __repr__(self): - self.__load() - # This looks circular, but its not, since __load() changes our - # __class__ to something new: - return repr(self) - - -###################################################################### -# Open-On-Demand ZipFile -###################################################################### - - -class OpenOnDemandZipFile(zipfile.ZipFile): - """ - A subclass of ``zipfile.ZipFile`` that closes its file pointer - whenever it is not using it; and re-opens it when it needs to read - data from the zipfile. This is useful for reducing the number of - open file handles when many zip files are being accessed at once. - ``OpenOnDemandZipFile`` must be constructed from a filename, not a - file-like object (to allow re-opening). ``OpenOnDemandZipFile`` is - read-only (i.e. ``write()`` and ``writestr()`` are disabled. - """ - - @py3_data - def __init__(self, filename): - if not isinstance(filename, str): - raise TypeError("ReopenableZipFile filename must be a string") - zipfile.ZipFile.__init__(self, filename) - assert self.filename == filename - self.close() - # After closing a ZipFile object, the _fileRefCnt needs to be cleared - # for Python2and3 compatible code. 
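# A tiny sketch of the resource cache used by load() above: repeated loads of
# the same (url, format) pair return the cached object until clear_cache().
# It assumes the "abc" corpus is installed.
from nltk.data import load, clear_cache

first = load("corpora/abc/rural.txt", format="text")
second = load("corpora/abc/rural.txt", format="text")
assert first is second        # served from _resource_cache

clear_cache()                 # drop all cached resources; the next load re-reads disk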
- self._fileRefCnt = 0 - - def read(self, name): - assert self.fp is None - self.fp = open(self.filename, "rb") - value = zipfile.ZipFile.read(self, name) - # Ensure that _fileRefCnt needs to be set for Python2and3 compatible code. - # Since we only opened one file here, we add 1. - self._fileRefCnt += 1 - self.close() - return value - - def write(self, *args, **kwargs): - """:raise NotImplementedError: OpenOnDemandZipfile is read-only""" - raise NotImplementedError("OpenOnDemandZipfile is read-only") - - def writestr(self, *args, **kwargs): - """:raise NotImplementedError: OpenOnDemandZipfile is read-only""" - raise NotImplementedError("OpenOnDemandZipfile is read-only") - - def __repr__(self): - return repr("OpenOnDemandZipFile(%r)" % self.filename) - - -###################################################################### -# Seekable Unicode Stream Reader -###################################################################### - - -class SeekableUnicodeStreamReader: - """ - A stream reader that automatically encodes the source byte stream - into unicode (like ``codecs.StreamReader``); but still supports the - ``seek()`` and ``tell()`` operations correctly. This is in contrast - to ``codecs.StreamReader``, which provide *broken* ``seek()`` and - ``tell()`` methods. - - This class was motivated by ``StreamBackedCorpusView``, which - makes extensive use of ``seek()`` and ``tell()``, and needs to be - able to handle unicode-encoded files. - - Note: this class requires stateless decoders. To my knowledge, - this shouldn't cause a problem with any of python's builtin - unicode encodings. - """ - - DEBUG = True # : If true, then perform extra sanity checks. - - @py3_data - def __init__(self, stream, encoding, errors="strict"): - # Rewind the stream to its beginning. - stream.seek(0) - - self.stream = stream - """The underlying stream.""" - - self.encoding = encoding - """The name of the encoding that should be used to encode the - underlying stream.""" - - self.errors = errors - """The error mode that should be used when decoding data from - the underlying stream. Can be 'strict', 'ignore', or - 'replace'.""" - - self.decode = codecs.getdecoder(encoding) - """The function that is used to decode byte strings into - unicode strings.""" - - self.bytebuffer = b"" - """A buffer to use bytes that have been read but have not yet - been decoded. This is only used when the final bytes from - a read do not form a complete encoding for a character.""" - - self.linebuffer = None - """A buffer used by ``readline()`` to hold characters that have - been read, but have not yet been returned by ``read()`` or - ``readline()``. This buffer consists of a list of unicode - strings, where each string corresponds to a single line. - The final element of the list may or may not be a complete - line. Note that the existence of a linebuffer makes the - ``tell()`` operation more complex, because it must backtrack - to the beginning of the buffer to determine the correct - file position in the underlying byte stream.""" - - self._rewind_checkpoint = 0 - """The file position at which the most recent read on the - underlying stream began. This is used, together with - ``_rewind_numchars``, to backtrack to the beginning of - ``linebuffer`` (which is required by ``tell()``).""" - - self._rewind_numchars = None - """The number of characters that have been returned since the - read that started at ``_rewind_checkpoint``. 
This is used, - together with ``_rewind_checkpoint``, to backtrack to the - beginning of ``linebuffer`` (which is required by ``tell()``).""" - - self._bom = self._check_bom() - """The length of the byte order marker at the beginning of - the stream (or None for no byte order marker).""" - - # ///////////////////////////////////////////////////////////////// - # Read methods - # ///////////////////////////////////////////////////////////////// - - def read(self, size=None): - """ - Read up to ``size`` bytes, decode them using this reader's - encoding, and return the resulting unicode string. - - :param size: The maximum number of bytes to read. If not - specified, then read as many bytes as possible. - :type size: int - :rtype: unicode - """ - chars = self._read(size) - - # If linebuffer is not empty, then include it in the result - if self.linebuffer: - chars = "".join(self.linebuffer) + chars - self.linebuffer = None - self._rewind_numchars = None - - return chars - - def discard_line(self): - if self.linebuffer and len(self.linebuffer) > 1: - line = self.linebuffer.pop(0) - self._rewind_numchars += len(line) - else: - self.stream.readline() - - def readline(self, size=None): - """ - Read a line of text, decode it using this reader's encoding, - and return the resulting unicode string. - - :param size: The maximum number of bytes to read. If no - newline is encountered before ``size`` bytes have been read, - then the returned value may not be a complete line of text. - :type size: int - """ - # If we have a non-empty linebuffer, then return the first - # line from it. (Note that the last element of linebuffer may - # not be a complete line; so let _read() deal with it.) - if self.linebuffer and len(self.linebuffer) > 1: - line = self.linebuffer.pop(0) - self._rewind_numchars += len(line) - return line - - readsize = size or 72 - chars = "" - - # If there's a remaining incomplete line in the buffer, add it. - if self.linebuffer: - chars += self.linebuffer.pop() - self.linebuffer = None - - while True: - startpos = self.stream.tell() - len(self.bytebuffer) - new_chars = self._read(readsize) - - # If we're at a '\r', then read one extra character, since - # it might be a '\n', to get the proper line ending. - if new_chars and new_chars.endswith("\r"): - new_chars += self._read(1) - - chars += new_chars - lines = chars.splitlines(True) - if len(lines) > 1: - line = lines[0] - self.linebuffer = lines[1:] - self._rewind_numchars = len(new_chars) - (len(chars) - len(line)) - self._rewind_checkpoint = startpos - break - elif len(lines) == 1: - line0withend = lines[0] - line0withoutend = lines[0].splitlines(False)[0] - if line0withend != line0withoutend: # complete line - line = line0withend - break - - if not new_chars or size is not None: - line = chars - break - - # Read successively larger blocks of text. - if readsize < 8000: - readsize *= 2 - - return line - - def readlines(self, sizehint=None, keepends=True): - """ - Read this file's contents, decode them using this reader's - encoding, and return it as a list of unicode lines. - - :rtype: list(unicode) - :param sizehint: Ignored. - :param keepends: If false, then strip newlines. 
- """ - return self.read().splitlines(keepends) - - def next(self): - """Return the next decoded line from the underlying stream.""" - line = self.readline() - if line: - return line - else: - raise StopIteration - - def __next__(self): - return self.next() - - def __iter__(self): - """Return self""" - return self - - def __del__(self): - # let garbage collector deal with still opened streams - if not self.closed: - self.close() - - def __enter__(self): - return self - - def __exit__(self, type, value, traceback): - self.close() - - def xreadlines(self): - """Return self""" - return self - - # ///////////////////////////////////////////////////////////////// - # Pass-through methods & properties - # ///////////////////////////////////////////////////////////////// - - @property - def closed(self): - """True if the underlying stream is closed.""" - return self.stream.closed - - @property - def name(self): - """The name of the underlying stream.""" - return self.stream.name - - @property - def mode(self): - """The mode of the underlying stream.""" - return self.stream.mode - - def close(self): - """ - Close the underlying stream. - """ - self.stream.close() - - # ///////////////////////////////////////////////////////////////// - # Seek and tell - # ///////////////////////////////////////////////////////////////// - - def seek(self, offset, whence=0): - """ - Move the stream to a new file position. If the reader is - maintaining any buffers, then they will be cleared. - - :param offset: A byte count offset. - :param whence: If 0, then the offset is from the start of the file - (offset should be positive), if 1, then the offset is from the - current position (offset may be positive or negative); and if 2, - then the offset is from the end of the file (offset should - typically be negative). - """ - if whence == 1: - raise ValueError( - "Relative seek is not supported for " - "SeekableUnicodeStreamReader -- consider " - "using char_seek_forward() instead." - ) - self.stream.seek(offset, whence) - self.linebuffer = None - self.bytebuffer = b"" - self._rewind_numchars = None - self._rewind_checkpoint = self.stream.tell() - - def char_seek_forward(self, offset): - """ - Move the read pointer forward by ``offset`` characters. - """ - if offset < 0: - raise ValueError("Negative offsets are not supported") - # Clear all buffers. - self.seek(self.tell()) - # Perform the seek operation. - self._char_seek_forward(offset) - - def _char_seek_forward(self, offset, est_bytes=None): - """ - Move the file position forward by ``offset`` characters, - ignoring all buffers. - - :param est_bytes: A hint, giving an estimate of the number of - bytes that will be needed to move forward by ``offset`` chars. - Defaults to ``offset``. - """ - if est_bytes is None: - est_bytes = offset - bytes = b"" - - while True: - # Read in a block of bytes. - newbytes = self.stream.read(est_bytes - len(bytes)) - bytes += newbytes - - # Decode the bytes to characters. - chars, bytes_decoded = self._incr_decode(bytes) - - # If we got the right number of characters, then seek - # backwards over any truncated characters, and return. - if len(chars) == offset: - self.stream.seek(-len(bytes) + bytes_decoded, 1) - return - - # If we went too far, then we can back-up until we get it - # right, using the bytes we've already read. - if len(chars) > offset: - while len(chars) > offset: - # Assume at least one byte/char. 
- est_bytes += offset - len(chars) - chars, bytes_decoded = self._incr_decode(bytes[:est_bytes]) - self.stream.seek(-len(bytes) + bytes_decoded, 1) - return - - # Otherwise, we haven't read enough bytes yet; loop again. - est_bytes += offset - len(chars) - - def tell(self): - """ - Return the current file position on the underlying byte - stream. If this reader is maintaining any buffers, then the - returned file position will be the position of the beginning - of those buffers. - """ - # If nothing's buffered, then just return our current filepos: - if self.linebuffer is None: - return self.stream.tell() - len(self.bytebuffer) - - # Otherwise, we'll need to backtrack the filepos until we - # reach the beginning of the buffer. - - # Store our original file position, so we can return here. - orig_filepos = self.stream.tell() - - # Calculate an estimate of where we think the newline is. - bytes_read = (orig_filepos - len(self.bytebuffer)) - self._rewind_checkpoint - buf_size = sum(len(line) for line in self.linebuffer) - est_bytes = int( - bytes_read * self._rewind_numchars / (self._rewind_numchars + buf_size) - ) - - self.stream.seek(self._rewind_checkpoint) - self._char_seek_forward(self._rewind_numchars, est_bytes) - filepos = self.stream.tell() - - # Sanity check - if self.DEBUG: - self.stream.seek(filepos) - check1 = self._incr_decode(self.stream.read(50))[0] - check2 = "".join(self.linebuffer) - assert check1.startswith(check2) or check2.startswith(check1) - - # Return to our original filepos (so we don't have to throw - # out our buffer.) - self.stream.seek(orig_filepos) - - # Return the calculated filepos - return filepos - - # ///////////////////////////////////////////////////////////////// - # Helper methods - # ///////////////////////////////////////////////////////////////// - - def _read(self, size=None): - """ - Read up to ``size`` bytes from the underlying stream, decode - them using this reader's encoding, and return the resulting - unicode string. ``linebuffer`` is not included in the result. - """ - if size == 0: - return "" - - # Skip past the byte order marker, if present. - if self._bom and self.stream.tell() == 0: - self.stream.read(self._bom) - - # Read the requested number of bytes. - if size is None: - new_bytes = self.stream.read() - else: - new_bytes = self.stream.read(size) - bytes = self.bytebuffer + new_bytes - - # Decode the bytes into unicode characters - chars, bytes_decoded = self._incr_decode(bytes) - - # If we got bytes but couldn't decode any, then read further. - if (size is not None) and (not chars) and (len(new_bytes) > 0): - while not chars: - new_bytes = self.stream.read(1) - if not new_bytes: - break # end of file. - bytes += new_bytes - chars, bytes_decoded = self._incr_decode(bytes) - - # Record any bytes we didn't consume. - self.bytebuffer = bytes[bytes_decoded:] - - # Return the result - return chars - - def _incr_decode(self, bytes): - """ - Decode the given byte string into a unicode string, using this - reader's encoding. If an exception is encountered that - appears to be caused by a truncation error, then just decode - the byte string without the bytes that cause the trunctaion - error. - - Return a tuple ``(chars, num_consumed)``, where ``chars`` is - the decoded unicode string, and ``num_consumed`` is the - number of bytes that were consumed. 
- """ - while True: - try: - return self.decode(bytes, "strict") - except UnicodeDecodeError as exc: - # If the exception occurs at the end of the string, - # then assume that it's a truncation error. - if exc.end == len(bytes): - return self.decode(bytes[: exc.start], self.errors) - - # Otherwise, if we're being strict, then raise it. - elif self.errors == "strict": - raise - - # If we're not strict, then re-process it with our - # errors setting. This *may* raise an exception. - else: - return self.decode(bytes, self.errors) - - _BOM_TABLE = { - "utf8": [(codecs.BOM_UTF8, None)], - "utf16": [(codecs.BOM_UTF16_LE, "utf16-le"), (codecs.BOM_UTF16_BE, "utf16-be")], - "utf16le": [(codecs.BOM_UTF16_LE, None)], - "utf16be": [(codecs.BOM_UTF16_BE, None)], - "utf32": [(codecs.BOM_UTF32_LE, "utf32-le"), (codecs.BOM_UTF32_BE, "utf32-be")], - "utf32le": [(codecs.BOM_UTF32_LE, None)], - "utf32be": [(codecs.BOM_UTF32_BE, None)], - } - - def _check_bom(self): - # Normalize our encoding name - enc = re.sub("[ -]", "", self.encoding.lower()) - - # Look up our encoding in the BOM table. - bom_info = self._BOM_TABLE.get(enc) - - if bom_info: - # Read a prefix, to check against the BOM(s) - bytes = self.stream.read(16) - self.stream.seek(0) - - # Check for each possible BOM. - for (bom, new_encoding) in bom_info: - if bytes.startswith(bom): - if new_encoding: - self.encoding = new_encoding - return len(bom) - - return None - - -__all__ = [ - "path", - "PathPointer", - "FileSystemPathPointer", - "BufferedGzipFile", - "GzipFileSystemPathPointer", - "GzipFileSystemPathPointer", - "find", - "retrieve", - "FORMATS", - "AUTO_FORMATS", - "load", - "show_cfg", - "clear_cache", - "LazyLoader", - "OpenOnDemandZipFile", - "GzipFileSystemPathPointer", - "SeekableUnicodeStreamReader", -] diff --git a/pipeline/nltk/decorators.py b/pipeline/nltk/decorators.py deleted file mode 100644 index 3a0fae1852afd47a2290b41ce94843aca36aa05f..0000000000000000000000000000000000000000 --- a/pipeline/nltk/decorators.py +++ /dev/null @@ -1,251 +0,0 @@ -""" -Decorator module by Michele Simionato -Copyright Michele Simionato, distributed under the terms of the BSD License (see below). -http://www.phyast.pitt.edu/~micheles/python/documentation.html - -Included in NLTK for its support of a nice memoization decorator. -""" - -__docformat__ = "restructuredtext en" - -## The basic trick is to generate the source code for the decorated function -## with the right signature and to evaluate it. -## Uncomment the statement 'print >> sys.stderr, func_src' in _decorator -## to understand what is going on. - -__all__ = ["decorator", "new_wrapper", "getinfo"] - -import sys - -# Hack to keep NLTK's "tokenize" module from colliding with the "tokenize" in -# the Python standard library. -OLD_SYS_PATH = sys.path[:] -sys.path = [p for p in sys.path if p and "nltk" not in str(p)] -import inspect - -sys.path = OLD_SYS_PATH - - -def __legacysignature(signature): - """ - For retrocompatibility reasons, we don't use a standard Signature. - Instead, we use the string generated by this method. - Basically, from a Signature we create a string and remove the default values. 
- """ - listsignature = str(signature)[1:-1].split(",") - for counter, param in enumerate(listsignature): - if param.count("=") > 0: - listsignature[counter] = param[0 : param.index("=")].strip() - else: - listsignature[counter] = param.strip() - return ", ".join(listsignature) - - -def getinfo(func): - """ - Returns an info dictionary containing: - - name (the name of the function : str) - - argnames (the names of the arguments : list) - - defaults (the values of the default arguments : tuple) - - signature (the signature : str) - - fullsignature (the full signature : Signature) - - doc (the docstring : str) - - module (the module name : str) - - dict (the function __dict__ : str) - - >>> def f(self, x=1, y=2, *args, **kw): pass - - >>> info = getinfo(f) - - >>> info["name"] - 'f' - >>> info["argnames"] - ['self', 'x', 'y', 'args', 'kw'] - - >>> info["defaults"] - (1, 2) - - >>> info["signature"] - 'self, x, y, *args, **kw' - - >>> info["fullsignature"] - - """ - assert inspect.ismethod(func) or inspect.isfunction(func) - argspec = inspect.getfullargspec(func) - regargs, varargs, varkwargs = argspec[:3] - argnames = list(regargs) - if varargs: - argnames.append(varargs) - if varkwargs: - argnames.append(varkwargs) - fullsignature = inspect.signature(func) - # Convert Signature to str - signature = __legacysignature(fullsignature) - - # pypy compatibility - if hasattr(func, "__closure__"): - _closure = func.__closure__ - _globals = func.__globals__ - else: - _closure = func.func_closure - _globals = func.func_globals - - return dict( - name=func.__name__, - argnames=argnames, - signature=signature, - fullsignature=fullsignature, - defaults=func.__defaults__, - doc=func.__doc__, - module=func.__module__, - dict=func.__dict__, - globals=_globals, - closure=_closure, - ) - - -def update_wrapper(wrapper, model, infodict=None): - "akin to functools.update_wrapper" - infodict = infodict or getinfo(model) - wrapper.__name__ = infodict["name"] - wrapper.__doc__ = infodict["doc"] - wrapper.__module__ = infodict["module"] - wrapper.__dict__.update(infodict["dict"]) - wrapper.__defaults__ = infodict["defaults"] - wrapper.undecorated = model - return wrapper - - -def new_wrapper(wrapper, model): - """ - An improvement over functools.update_wrapper. The wrapper is a generic - callable object. It works by generating a copy of the wrapper with the - right signature and by updating the copy, not the original. - Moreovoer, 'model' can be a dictionary with keys 'name', 'doc', 'module', - 'dict', 'defaults'. - """ - if isinstance(model, dict): - infodict = model - else: # assume model is a function - infodict = getinfo(model) - assert ( - not "_wrapper_" in infodict["argnames"] - ), '"_wrapper_" is a reserved argument name!' - src = "lambda %(signature)s: _wrapper_(%(signature)s)" % infodict - funcopy = eval(src, dict(_wrapper_=wrapper)) - return update_wrapper(funcopy, model, infodict) - - -# helper used in decorator_factory -def __call__(self, func): - return new_wrapper(lambda *a, **k: self.call(func, *a, **k), func) - - -def decorator_factory(cls): - """ - Take a class with a ``.caller`` method and return a callable decorator - object. It works by adding a suitable __call__ method to the class; - it raises a TypeError if the class already has a nontrivial __call__ - method. 
- """ - attrs = set(dir(cls)) - if "__call__" in attrs: - raise TypeError( - "You cannot decorate a class with a nontrivial " "__call__ method" - ) - if "call" not in attrs: - raise TypeError("You cannot decorate a class without a " ".call method") - cls.__call__ = __call__ - return cls - - -def decorator(caller): - """ - General purpose decorator factory: takes a caller function as - input and returns a decorator with the same attributes. - A caller function is any function like this:: - - def caller(func, *args, **kw): - # do something - return func(*args, **kw) - - Here is an example of usage: - - >>> @decorator - ... def chatty(f, *args, **kw): - ... print("Calling %r" % f.__name__) - ... return f(*args, **kw) - - >>> chatty.__name__ - 'chatty' - - >>> @chatty - ... def f(): pass - ... - >>> f() - Calling 'f' - - decorator can also take in input a class with a .caller method; in this - case it converts the class into a factory of callable decorator objects. - See the documentation for an example. - """ - if inspect.isclass(caller): - return decorator_factory(caller) - - def _decorator(func): # the real meat is here - infodict = getinfo(func) - argnames = infodict["argnames"] - assert not ( - "_call_" in argnames or "_func_" in argnames - ), "You cannot use _call_ or _func_ as argument names!" - src = "lambda %(signature)s: _call_(_func_, %(signature)s)" % infodict - # import sys; print >> sys.stderr, src # for debugging purposes - dec_func = eval(src, dict(_func_=func, _call_=caller)) - return update_wrapper(dec_func, func, infodict) - - return update_wrapper(_decorator, caller) - - -def getattr_(obj, name, default_thunk): - "Similar to .setdefault in dictionaries." - try: - return getattr(obj, name) - except AttributeError: - default = default_thunk() - setattr(obj, name, default) - return default - - -@decorator -def memoize(func, *args): - dic = getattr_(func, "memoize_dic", dict) - # memoize_dic is created at the first call - if args in dic: - return dic[args] - result = func(*args) - dic[args] = result - return result - - -########################## LEGALESE ############################### - -## Redistributions of source code must retain the above copyright -## notice, this list of conditions and the following disclaimer. -## Redistributions in bytecode form must reproduce the above copyright -## notice, this list of conditions and the following disclaimer in -## the documentation and/or other materials provided with the -## distribution. - -## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -## "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -## LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -## A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -## HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -## INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -## BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -## OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -## ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR -## TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -## USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH -## DAMAGE. 
diff --git a/pipeline/nltk/downloader.py b/pipeline/nltk/downloader.py deleted file mode 100644 index 71519238755062c698a1d82ffa0984b3ccb5ba92..0000000000000000000000000000000000000000 --- a/pipeline/nltk/downloader.py +++ /dev/null @@ -1,2559 +0,0 @@ -# Natural Language Toolkit: Corpus & Model Downloader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# URL: -# For license information, see LICENSE.TXT - -""" -The NLTK corpus and module downloader. This module defines several -interfaces which can be used to download corpora, models, and other -data packages that can be used with NLTK. - -Downloading Packages -==================== -If called with no arguments, ``download()`` will display an interactive -interface which can be used to download and install new packages. -If Tkinter is available, then a graphical interface will be shown, -otherwise a simple text interface will be provided. - -Individual packages can be downloaded by calling the ``download()`` -function with a single argument, giving the package identifier for the -package that should be downloaded: - - >>> download('treebank') # doctest: +SKIP - [nltk_data] Downloading package 'treebank'... - [nltk_data] Unzipping corpora/treebank.zip. - -NLTK also provides a number of \"package collections\", consisting of -a group of related packages. To download all packages in a -colleciton, simply call ``download()`` with the collection's -identifier: - - >>> download('all-corpora') # doctest: +SKIP - [nltk_data] Downloading package 'abc'... - [nltk_data] Unzipping corpora/abc.zip. - [nltk_data] Downloading package 'alpino'... - [nltk_data] Unzipping corpora/alpino.zip. - ... - [nltk_data] Downloading package 'words'... - [nltk_data] Unzipping corpora/words.zip. - -Download Directory -================== -By default, packages are installed in either a system-wide directory -(if Python has sufficient access to write to it); or in the current -user's home directory. However, the ``download_dir`` argument may be -used to specify a different installation target, if desired. - -See ``Downloader.default_download_dir()`` for more a detailed -description of how the default download directory is chosen. - -NLTK Download Server -==================== -Before downloading any packages, the corpus and module downloader -contacts the NLTK download server, to retrieve an index file -describing the available packages. By default, this index file is -loaded from ``https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml``. -If necessary, it is possible to create a new ``Downloader`` object, -specifying a different URL for the package index file. - -Usage:: - - python nltk/downloader.py [-d DATADIR] [-q] [-f] [-k] PACKAGE_IDS - -or:: - - python -m nltk.downloader [-d DATADIR] [-q] [-f] [-k] PACKAGE_IDS -""" -# ---------------------------------------------------------------------- - -""" - - 0 1 2 3 -[label][----][label][----] -[column ][column ] - -Notes -===== -Handling data files.. Some questions: - -* Should the data files be kept zipped or unzipped? I say zipped. - -* Should the data files be kept in svn at all? Advantages: history; - automatic version numbers; 'svn up' could be used rather than the - downloader to update the corpora. Disadvantages: they're big, - which makes working from svn a bit of a pain. And we're planning - to potentially make them much bigger. I don't think we want - people to have to download 400MB corpora just to use nltk from svn. 
- -* Compromise: keep the data files in trunk/data rather than in - trunk/nltk. That way you can check them out in svn if you want - to; but you don't need to, and you can use the downloader instead. - -* Also: keep models in mind. When we change the code, we'd - potentially like the models to get updated. This could require a - little thought. - -* So.. let's assume we have a trunk/data directory, containing a bunch - of packages. The packages should be kept as zip files, because we - really shouldn't be editing them much (well -- we may edit models - more, but they tend to be binary-ish files anyway, where diffs - aren't that helpful). So we'll have trunk/data, with a bunch of - files like abc.zip and treebank.zip and propbank.zip. For each - package we could also have eg treebank.xml and propbank.xml, - describing the contents of the package (name, copyright, license, - etc). Collections would also have .xml files. Finally, we would - pull all these together to form a single index.xml file. Some - directory structure wouldn't hurt. So how about:: - - /trunk/data/ ....................... root of data svn - index.xml ........................ main index file - src/ ............................. python scripts - packages/ ........................ dir for packages - corpora/ ....................... zip & xml files for corpora - grammars/ ...................... zip & xml files for grammars - taggers/ ....................... zip & xml files for taggers - tokenizers/ .................... zip & xml files for tokenizers - etc. - collections/ ..................... xml files for collections - - Where the root (/trunk/data) would contain a makefile; and src/ - would contain a script to update the info.xml file. It could also - contain scripts to rebuild some of the various model files. The - script that builds index.xml should probably check that each zip - file expands entirely into a single subdir, whose name matches the - package's uid. - -Changes I need to make: - - in index: change "size" to "filesize" or "compressed-size" - - in index: add "unzipped-size" - - when checking status: check both compressed & uncompressed size. - uncompressed size is important to make sure we detect a problem - if something got partially unzipped. define new status values - to differentiate stale vs corrupt vs corruptly-uncompressed?? - (we shouldn't need to re-download the file if the zip file is ok - but it didn't get uncompressed fully.) - - add other fields to the index: author, license, copyright, contact, - etc. - -the current grammars/ package would become a single new package (eg -toy-grammars or book-grammars). - -xml file should have: - - authorship info - - license info - - copyright info - - contact info - - info about what type of data/annotation it contains? - - recommended corpus reader? - -collections can contain other collections. they can also contain -multiple package types (corpora & models). Have a single 'basics' -package that includes everything we talk about in the book? - -n.b.: there will have to be a fallback to the punkt tokenizer, in case -they didn't download that model. - -default: unzip or not? 
- -""" -import functools -import itertools -import os -import shutil -import subprocess -import sys -import textwrap -import threading -import time -import warnings -import zipfile -from hashlib import md5 -from xml.etree import ElementTree - -try: - TKINTER = True - from tkinter import Button, Canvas, Entry, Frame, IntVar, Label, Menu, TclError, Tk - from tkinter.messagebox import showerror - - from nltk.draw.table import Table - from nltk.draw.util import ShowText -except ImportError: - TKINTER = False - TclError = ValueError - -from urllib.error import HTTPError, URLError -from urllib.request import urlopen - -import nltk - -# urllib2 = nltk.internals.import_from_stdlib('urllib2') - - -###################################################################### -# Directory entry objects (from the data server's index file) -###################################################################### - - -class Package: - """ - A directory entry for a downloadable package. These entries are - extracted from the XML index file that is downloaded by - ``Downloader``. Each package consists of a single file; but if - that file is a zip file, then it can be automatically decompressed - when the package is installed. - """ - - def __init__( - self, - id, - url, - name=None, - subdir="", - size=None, - unzipped_size=None, - checksum=None, - svn_revision=None, - copyright="Unknown", - contact="Unknown", - license="Unknown", - author="Unknown", - unzip=True, - **kw, - ): - self.id = id - """A unique identifier for this package.""" - - self.name = name or id - """A string name for this package.""" - - self.subdir = subdir - """The subdirectory where this package should be installed. - E.g., ``'corpora'`` or ``'taggers'``.""" - - self.url = url - """A URL that can be used to download this package's file.""" - - self.size = int(size) - """The filesize (in bytes) of the package file.""" - - self.unzipped_size = int(unzipped_size) - """The total filesize of the files contained in the package's - zipfile.""" - - self.checksum = checksum - """The MD-5 checksum of the package file.""" - - self.svn_revision = svn_revision - """A subversion revision number for this package.""" - - self.copyright = copyright - """Copyright holder for this package.""" - - self.contact = contact - """Name & email of the person who should be contacted with - questions about this package.""" - - self.license = license - """License information for this package.""" - - self.author = author - """Author of this package.""" - - ext = os.path.splitext(url.split("/")[-1])[1] - self.filename = os.path.join(subdir, id + ext) - """The filename that should be used for this package's file. It - is formed by joining ``self.subdir`` with ``self.id``, and - using the same extension as ``url``.""" - - self.unzip = bool(int(unzip)) # '0' or '1' - """A flag indicating whether this corpus should be unzipped by - default.""" - - # Include any other attributes provided by the XML file. - self.__dict__.update(kw) - - @staticmethod - def fromxml(xml): - if isinstance(xml, str): - xml = ElementTree.parse(xml) - for key in xml.attrib: - xml.attrib[key] = str(xml.attrib[key]) - return Package(**xml.attrib) - - def __lt__(self, other): - return self.id < other.id - - def __repr__(self): - return "" % self.id - - -class Collection: - """ - A directory entry for a collection of downloadable packages. - These entries are extracted from the XML index file that is - downloaded by ``Downloader``. 
- """ - - def __init__(self, id, children, name=None, **kw): - self.id = id - """A unique identifier for this collection.""" - - self.name = name or id - """A string name for this collection.""" - - self.children = children - """A list of the ``Collections`` or ``Packages`` directly - contained by this collection.""" - - self.packages = None - """A list of ``Packages`` contained by this collection or any - collections it recursively contains.""" - - # Include any other attributes provided by the XML file. - self.__dict__.update(kw) - - @staticmethod - def fromxml(xml): - if isinstance(xml, str): - xml = ElementTree.parse(xml) - for key in xml.attrib: - xml.attrib[key] = str(xml.attrib[key]) - children = [child.get("ref") for child in xml.findall("item")] - return Collection(children=children, **xml.attrib) - - def __lt__(self, other): - return self.id < other.id - - def __repr__(self): - return "" % self.id - - -###################################################################### -# Message Passing Objects -###################################################################### - - -class DownloaderMessage: - """A status message object, used by ``incr_download`` to - communicate its progress.""" - - -class StartCollectionMessage(DownloaderMessage): - """Data server has started working on a collection of packages.""" - - def __init__(self, collection): - self.collection = collection - - -class FinishCollectionMessage(DownloaderMessage): - """Data server has finished working on a collection of packages.""" - - def __init__(self, collection): - self.collection = collection - - -class StartPackageMessage(DownloaderMessage): - """Data server has started working on a package.""" - - def __init__(self, package): - self.package = package - - -class FinishPackageMessage(DownloaderMessage): - """Data server has finished working on a package.""" - - def __init__(self, package): - self.package = package - - -class StartDownloadMessage(DownloaderMessage): - """Data server has started downloading a package.""" - - def __init__(self, package): - self.package = package - - -class FinishDownloadMessage(DownloaderMessage): - """Data server has finished downloading a package.""" - - def __init__(self, package): - self.package = package - - -class StartUnzipMessage(DownloaderMessage): - """Data server has started unzipping a package.""" - - def __init__(self, package): - self.package = package - - -class FinishUnzipMessage(DownloaderMessage): - """Data server has finished unzipping a package.""" - - def __init__(self, package): - self.package = package - - -class UpToDateMessage(DownloaderMessage): - """The package download file is already up-to-date""" - - def __init__(self, package): - self.package = package - - -class StaleMessage(DownloaderMessage): - """The package download file is out-of-date or corrupt""" - - def __init__(self, package): - self.package = package - - -class ErrorMessage(DownloaderMessage): - """Data server encountered an error""" - - def __init__(self, package, message): - self.package = package - if isinstance(message, Exception): - self.message = str(message) - else: - self.message = message - - -class ProgressMessage(DownloaderMessage): - """Indicates how much progress the data server has made""" - - def __init__(self, progress): - self.progress = progress - - -class SelectDownloadDirMessage(DownloaderMessage): - """Indicates what download directory the data server is using""" - - def __init__(self, download_dir): - self.download_dir = download_dir - - 
-###################################################################### -# NLTK Data Server -###################################################################### - - -class Downloader: - """ - A class used to access the NLTK data server, which can be used to - download corpora and other data packages. - """ - - # ///////////////////////////////////////////////////////////////// - # Configuration - # ///////////////////////////////////////////////////////////////// - - INDEX_TIMEOUT = 60 * 60 # 1 hour - """The amount of time after which the cached copy of the data - server index will be considered 'stale,' and will be - re-downloaded.""" - - DEFAULT_URL = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml" - """The default URL for the NLTK data server's index. An - alternative URL can be specified when creating a new - ``Downloader`` object.""" - - # ///////////////////////////////////////////////////////////////// - # Status Constants - # ///////////////////////////////////////////////////////////////// - - INSTALLED = "installed" - """A status string indicating that a package or collection is - installed and up-to-date.""" - NOT_INSTALLED = "not installed" - """A status string indicating that a package or collection is - not installed.""" - STALE = "out of date" - """A status string indicating that a package or collection is - corrupt or out-of-date.""" - PARTIAL = "partial" - """A status string indicating that a collection is partially - installed (i.e., only some of its packages are installed.)""" - - # ///////////////////////////////////////////////////////////////// - # Constructor - # ///////////////////////////////////////////////////////////////// - - def __init__(self, server_index_url=None, download_dir=None): - self._url = server_index_url or self.DEFAULT_URL - """The URL for the data server's index file.""" - - self._collections = {} - """Dictionary from collection identifier to ``Collection``""" - - self._packages = {} - """Dictionary from package identifier to ``Package``""" - - self._download_dir = download_dir - """The default directory to which packages will be downloaded.""" - - self._index = None - """The XML index file downloaded from the data server""" - - self._index_timestamp = None - """Time at which ``self._index`` was downloaded. If it is more - than ``INDEX_TIMEOUT`` seconds old, it will be re-downloaded.""" - - self._status_cache = {} - """Dictionary from package/collection identifier to status - string (``INSTALLED``, ``NOT_INSTALLED``, ``STALE``, or - ``PARTIAL``). Cache is used for packages only, not - collections.""" - - self._errors = None - """Flag for telling if all packages got successfully downloaded or not.""" - - # decide where we're going to save things to. 
- if self._download_dir is None: - self._download_dir = self.default_download_dir() - - # ///////////////////////////////////////////////////////////////// - # Information - # ///////////////////////////////////////////////////////////////// - - def list( - self, - download_dir=None, - show_packages=True, - show_collections=True, - header=True, - more_prompt=False, - skip_installed=False, - ): - lines = 0 # for more_prompt - if download_dir is None: - download_dir = self._download_dir - print("Using default data directory (%s)" % download_dir) - if header: - print("=" * (26 + len(self._url))) - print(" Data server index for <%s>" % self._url) - print("=" * (26 + len(self._url))) - lines += 3 # for more_prompt - stale = partial = False - - categories = [] - if show_packages: - categories.append("packages") - if show_collections: - categories.append("collections") - for category in categories: - print("%s:" % category.capitalize()) - lines += 1 # for more_prompt - for info in sorted(getattr(self, category)(), key=str): - status = self.status(info, download_dir) - if status == self.INSTALLED and skip_installed: - continue - if status == self.STALE: - stale = True - if status == self.PARTIAL: - partial = True - prefix = { - self.INSTALLED: "*", - self.STALE: "-", - self.PARTIAL: "P", - self.NOT_INSTALLED: " ", - }[status] - name = textwrap.fill( - "-" * 27 + (info.name or info.id), 75, subsequent_indent=27 * " " - )[27:] - print(" [{}] {} {}".format(prefix, info.id.ljust(20, "."), name)) - lines += len(name.split("\n")) # for more_prompt - if more_prompt and lines > 20: - user_input = input("Hit Enter to continue: ") - if user_input.lower() in ("x", "q"): - return - lines = 0 - print() - msg = "([*] marks installed packages" - if stale: - msg += "; [-] marks out-of-date or corrupt packages" - if partial: - msg += "; [P] marks partially installed collections" - print(textwrap.fill(msg + ")", subsequent_indent=" ", width=76)) - - def packages(self): - self._update_index() - return self._packages.values() - - def corpora(self): - self._update_index() - return [pkg for (id, pkg) in self._packages.items() if pkg.subdir == "corpora"] - - def models(self): - self._update_index() - return [pkg for (id, pkg) in self._packages.items() if pkg.subdir != "corpora"] - - def collections(self): - self._update_index() - return self._collections.values() - - # ///////////////////////////////////////////////////////////////// - # Downloading - # ///////////////////////////////////////////////////////////////// - - def _info_or_id(self, info_or_id): - if isinstance(info_or_id, str): - return self.info(info_or_id) - else: - return info_or_id - - # [xx] When during downloading is it 'safe' to abort? Only unsafe - # time is *during* an unzip -- we don't want to leave a - # partially-unzipped corpus in place because we wouldn't notice - # it. But if we had the exact total size of the unzipped corpus, - # then that would be fine. Then we could abort anytime we want! - # So this is really what we should do. That way the threaded - # downloader in the gui can just kill the download thread anytime - # it wants. - - def incr_download(self, info_or_id, download_dir=None, force=False): - # If they didn't specify a download_dir, then use the default one. - if download_dir is None: - download_dir = self._download_dir - yield SelectDownloadDirMessage(download_dir) - - # If they gave us a list of ids, then download each one. 
- if isinstance(info_or_id, (list, tuple)): - yield from self._download_list(info_or_id, download_dir, force) - return - - # Look up the requested collection or package. - try: - info = self._info_or_id(info_or_id) - except (OSError, ValueError) as e: - yield ErrorMessage(None, f"Error loading {info_or_id}: {e}") - return - - # Handle collections. - if isinstance(info, Collection): - yield StartCollectionMessage(info) - yield from self.incr_download(info.children, download_dir, force) - yield FinishCollectionMessage(info) - - # Handle Packages (delegate to a helper function). - else: - yield from self._download_package(info, download_dir, force) - - def _num_packages(self, item): - if isinstance(item, Package): - return 1 - else: - return len(item.packages) - - def _download_list(self, items, download_dir, force): - # Look up the requested items. - for i in range(len(items)): - try: - items[i] = self._info_or_id(items[i]) - except (OSError, ValueError) as e: - yield ErrorMessage(items[i], e) - return - - # Download each item, re-scaling their progress. - num_packages = sum(self._num_packages(item) for item in items) - progress = 0 - for i, item in enumerate(items): - if isinstance(item, Package): - delta = 1.0 / num_packages - else: - delta = len(item.packages) / num_packages - for msg in self.incr_download(item, download_dir, force): - if isinstance(msg, ProgressMessage): - yield ProgressMessage(progress + msg.progress * delta) - else: - yield msg - - progress += 100 * delta - - def _download_package(self, info, download_dir, force): - yield StartPackageMessage(info) - yield ProgressMessage(0) - - # Do we already have the current version? - status = self.status(info, download_dir) - if not force and status == self.INSTALLED: - yield UpToDateMessage(info) - yield ProgressMessage(100) - yield FinishPackageMessage(info) - return - - # Remove the package from our status cache - self._status_cache.pop(info.id, None) - - # Check for (and remove) any old/stale version. - filepath = os.path.join(download_dir, info.filename) - if os.path.exists(filepath): - if status == self.STALE: - yield StaleMessage(info) - os.remove(filepath) - - # Ensure the download_dir exists - if not os.path.exists(download_dir): - os.makedirs(download_dir) - if not os.path.exists(os.path.join(download_dir, info.subdir)): - os.makedirs(os.path.join(download_dir, info.subdir)) - - # Download the file. This will raise an IOError if the url - # is not found. - yield StartDownloadMessage(info) - yield ProgressMessage(5) - try: - infile = urlopen(info.url) - with open(filepath, "wb") as outfile: - num_blocks = max(1, info.size / (1024 * 16)) - for block in itertools.count(): - s = infile.read(1024 * 16) # 16k blocks. - outfile.write(s) - if not s: - break - if block % 2 == 0: # how often? - yield ProgressMessage(min(80, 5 + 75 * (block / num_blocks))) - infile.close() - except OSError as e: - yield ErrorMessage( - info, - "Error downloading %r from <%s>:" "\n %s" % (info.id, info.url, e), - ) - return - yield FinishDownloadMessage(info) - yield ProgressMessage(80) - - # If it's a zipfile, uncompress it. - if info.filename.endswith(".zip"): - zipdir = os.path.join(download_dir, info.subdir) - # Unzip if we're unzipping by default; *or* if it's already - # been unzipped (presumably a previous version). 
- if info.unzip or os.path.exists(os.path.join(zipdir, info.id)): - yield StartUnzipMessage(info) - for msg in _unzip_iter(filepath, zipdir, verbose=False): - # Somewhat of a hack, but we need a proper package reference - msg.package = info - yield msg - yield FinishUnzipMessage(info) - - yield FinishPackageMessage(info) - - def download( - self, - info_or_id=None, - download_dir=None, - quiet=False, - force=False, - prefix="[nltk_data] ", - halt_on_error=True, - raise_on_error=False, - print_error_to=sys.stderr, - ): - - print_to = functools.partial(print, file=print_error_to) - # If no info or id is given, then use the interactive shell. - if info_or_id is None: - # [xx] hmm -- changing self._download_dir here seems like - # the wrong thing to do. Maybe the _interactive_download - # function should make a new copy of self to use? - if download_dir is not None: - self._download_dir = download_dir - self._interactive_download() - return True - - else: - # Define a helper function for displaying output: - def show(s, prefix2=""): - print_to( - textwrap.fill( - s, - initial_indent=prefix + prefix2, - subsequent_indent=prefix + prefix2 + " " * 4, - ) - ) - - for msg in self.incr_download(info_or_id, download_dir, force): - # Error messages - if isinstance(msg, ErrorMessage): - show(msg.message) - if raise_on_error: - raise ValueError(msg.message) - if halt_on_error: - return False - self._errors = True - if not quiet: - print_to("Error installing package. Retry? [n/y/e]") - choice = input().strip() - if choice in ["y", "Y"]: - if not self.download( - msg.package.id, - download_dir, - quiet, - force, - prefix, - halt_on_error, - raise_on_error, - ): - return False - elif choice in ["e", "E"]: - return False - - # All other messages - if not quiet: - # Collection downloading messages: - if isinstance(msg, StartCollectionMessage): - show("Downloading collection %r" % msg.collection.id) - prefix += " | " - print_to(prefix) - elif isinstance(msg, FinishCollectionMessage): - print_to(prefix) - prefix = prefix[:-4] - if self._errors: - show( - "Downloaded collection %r with errors" - % msg.collection.id - ) - else: - show("Done downloading collection %s" % msg.collection.id) - - # Package downloading messages: - elif isinstance(msg, StartPackageMessage): - show( - "Downloading package %s to %s..." - % (msg.package.id, download_dir) - ) - elif isinstance(msg, UpToDateMessage): - show("Package %s is already up-to-date!" % msg.package.id, " ") - # elif isinstance(msg, StaleMessage): - # show('Package %s is out-of-date or corrupt' % - # msg.package.id, ' ') - elif isinstance(msg, StartUnzipMessage): - show("Unzipping %s." % msg.package.filename, " ") - - # Data directory message: - elif isinstance(msg, SelectDownloadDirMessage): - download_dir = msg.download_dir - return True - - def is_stale(self, info_or_id, download_dir=None): - return self.status(info_or_id, download_dir) == self.STALE - - def is_installed(self, info_or_id, download_dir=None): - return self.status(info_or_id, download_dir) == self.INSTALLED - - def clear_status_cache(self, id=None): - if id is None: - self._status_cache.clear() - else: - self._status_cache.pop(id, None) - - def status(self, info_or_id, download_dir=None): - """ - Return a constant describing the status of the given package - or collection. Status can be one of ``INSTALLED``, - ``NOT_INSTALLED``, ``STALE``, or ``PARTIAL``. 
- """ - if download_dir is None: - download_dir = self._download_dir - info = self._info_or_id(info_or_id) - - # Handle collections: - if isinstance(info, Collection): - pkg_status = [self.status(pkg.id) for pkg in info.packages] - if self.STALE in pkg_status: - return self.STALE - elif self.PARTIAL in pkg_status: - return self.PARTIAL - elif self.INSTALLED in pkg_status and self.NOT_INSTALLED in pkg_status: - return self.PARTIAL - elif self.NOT_INSTALLED in pkg_status: - return self.NOT_INSTALLED - else: - return self.INSTALLED - - # Handle packages: - else: - filepath = os.path.join(download_dir, info.filename) - if download_dir != self._download_dir: - return self._pkg_status(info, filepath) - else: - if info.id not in self._status_cache: - self._status_cache[info.id] = self._pkg_status(info, filepath) - return self._status_cache[info.id] - - def _pkg_status(self, info, filepath): - if not os.path.exists(filepath): - return self.NOT_INSTALLED - - # Check if the file has the correct size. - try: - filestat = os.stat(filepath) - except OSError: - return self.NOT_INSTALLED - if filestat.st_size != int(info.size): - return self.STALE - - # Check if the file's checksum matches - if md5_hexdigest(filepath) != info.checksum: - return self.STALE - - # If it's a zipfile, and it's been at least partially - # unzipped, then check if it's been fully unzipped. - if filepath.endswith(".zip"): - unzipdir = filepath[:-4] - if not os.path.exists(unzipdir): - return self.INSTALLED # but not unzipped -- ok! - if not os.path.isdir(unzipdir): - return self.STALE - - unzipped_size = sum( - os.stat(os.path.join(d, f)).st_size - for d, _, files in os.walk(unzipdir) - for f in files - ) - if unzipped_size != info.unzipped_size: - return self.STALE - - # Otherwise, everything looks good. - return self.INSTALLED - - def update(self, quiet=False, prefix="[nltk_data] "): - """ - Re-download any packages whose status is STALE. - """ - self.clear_status_cache() - for pkg in self.packages(): - if self.status(pkg) == self.STALE: - self.download(pkg, quiet=quiet, prefix=prefix) - - # ///////////////////////////////////////////////////////////////// - # Index - # ///////////////////////////////////////////////////////////////// - - def _update_index(self, url=None): - """A helper function that ensures that self._index is - up-to-date. If the index is older than self.INDEX_TIMEOUT, - then download it again.""" - # Check if the index is already up-to-date. If so, do nothing. - if not ( - self._index is None - or url is not None - or time.time() - self._index_timestamp > self.INDEX_TIMEOUT - ): - return - - # If a URL was specified, then update our URL. - self._url = url or self._url - - # Download the index file. - self._index = nltk.internals.ElementWrapper( - ElementTree.parse(urlopen(self._url)).getroot() - ) - self._index_timestamp = time.time() - - # Build a dictionary of packages. - packages = [Package.fromxml(p) for p in self._index.findall("packages/package")] - self._packages = {p.id: p for p in packages} - - # Build a dictionary of collections. - collections = [ - Collection.fromxml(c) for c in self._index.findall("collections/collection") - ] - self._collections = {c.id: c for c in collections} - - # Replace identifiers with actual children in collection.children. 
- for collection in self._collections.values(): - for i, child_id in enumerate(collection.children): - if child_id in self._packages: - collection.children[i] = self._packages[child_id] - elif child_id in self._collections: - collection.children[i] = self._collections[child_id] - else: - print( - "removing collection member with no package: {}".format( - child_id - ) - ) - del collection.children[i] - - # Fill in collection.packages for each collection. - for collection in self._collections.values(): - packages = {} - queue = [collection] - for child in queue: - if isinstance(child, Collection): - queue.extend(child.children) - elif isinstance(child, Package): - packages[child.id] = child - else: - pass - collection.packages = packages.values() - - # Flush the status cache - self._status_cache.clear() - - def index(self): - """ - Return the XML index describing the packages available from - the data server. If necessary, this index will be downloaded - from the data server. - """ - self._update_index() - return self._index - - def info(self, id): - """Return the ``Package`` or ``Collection`` record for the - given item.""" - self._update_index() - if id in self._packages: - return self._packages[id] - if id in self._collections: - return self._collections[id] - raise ValueError("Package %r not found in index" % id) - - def xmlinfo(self, id): - """Return the XML info record for the given item""" - self._update_index() - for package in self._index.findall("packages/package"): - if package.get("id") == id: - return package - for collection in self._index.findall("collections/collection"): - if collection.get("id") == id: - return collection - raise ValueError("Package %r not found in index" % id) - - # ///////////////////////////////////////////////////////////////// - # URL & Data Directory - # ///////////////////////////////////////////////////////////////// - - def _get_url(self): - """The URL for the data server's index file.""" - return self._url - - def _set_url(self, url): - """ - Set a new URL for the data server. If we're unable to contact - the given url, then the original url is kept. - """ - original_url = self._url - try: - self._update_index(url) - except: - self._url = original_url - raise - - url = property(_get_url, _set_url) - - def default_download_dir(self): - """ - Return the directory to which packages will be downloaded by - default. This value can be overridden using the constructor, - or on a case-by-case basis using the ``download_dir`` argument when - calling ``download()``. - - On Windows, the default download directory is - ``PYTHONHOME/lib/nltk``, where *PYTHONHOME* is the - directory containing Python, e.g. ``C:\\Python25``. - - On all other platforms, the default directory is the first of - the following which exists or which can be created with write - permission: ``/usr/share/nltk_data``, ``/usr/local/share/nltk_data``, - ``/usr/lib/nltk_data``, ``/usr/local/lib/nltk_data``, ``~/nltk_data``. - """ - # Check if we are on GAE where we cannot write into filesystem. - if "APPENGINE_RUNTIME" in os.environ: - return - - # Check if we have sufficient permissions to install in a - # variety of system-wide locations. - for nltkdir in nltk.data.path: - if os.path.exists(nltkdir) and nltk.internals.is_writable(nltkdir): - return nltkdir - - # On Windows, use %APPDATA% - if sys.platform == "win32" and "APPDATA" in os.environ: - homedir = os.environ["APPDATA"] - - # Otherwise, install in the user's home directory. 
- else: - homedir = os.path.expanduser("~/") - if homedir == "~/": - raise ValueError("Could not find a default download directory") - - # append "nltk_data" to the home directory - return os.path.join(homedir, "nltk_data") - - def _get_download_dir(self): - """ - The default directory to which packages will be downloaded. - This defaults to the value returned by ``default_download_dir()``. - To override this default on a case-by-case basis, use the - ``download_dir`` argument when calling ``download()``. - """ - return self._download_dir - - def _set_download_dir(self, download_dir): - self._download_dir = download_dir - # Clear the status cache. - self._status_cache.clear() - - download_dir = property(_get_download_dir, _set_download_dir) - - # ///////////////////////////////////////////////////////////////// - # Interactive Shell - # ///////////////////////////////////////////////////////////////// - - def _interactive_download(self): - # Try the GUI first; if that doesn't work, try the simple - # interactive shell. - if TKINTER: - try: - DownloaderGUI(self).mainloop() - except TclError: - DownloaderShell(self).run() - else: - DownloaderShell(self).run() - - -class DownloaderShell: - def __init__(self, dataserver): - self._ds = dataserver - - def _simple_interactive_menu(self, *options): - print("-" * 75) - spc = (68 - sum(len(o) for o in options)) // (len(options) - 1) * " " - print(" " + spc.join(options)) - print("-" * 75) - - def run(self): - print("NLTK Downloader") - while True: - self._simple_interactive_menu( - "d) Download", - "l) List", - " u) Update", - "c) Config", - "h) Help", - "q) Quit", - ) - user_input = input("Downloader> ").strip() - if not user_input: - print() - continue - command = user_input.lower().split()[0] - args = user_input.split()[1:] - try: - if command == "l": - print() - self._ds.list(self._ds.download_dir, header=False, more_prompt=True) - elif command == "h": - self._simple_interactive_help() - elif command == "c": - self._simple_interactive_config() - elif command in ("q", "x"): - return - elif command == "d": - self._simple_interactive_download(args) - elif command == "u": - self._simple_interactive_update() - else: - print("Command %r unrecognized" % user_input) - except HTTPError as e: - print("Error reading from server: %s" % e) - except URLError as e: - print("Error connecting to server: %s" % e.reason) - # try checking if user_input is a package name, & - # downloading it? 
- print() - - def _simple_interactive_download(self, args): - if args: - for arg in args: - try: - self._ds.download(arg, prefix=" ") - except (OSError, ValueError) as e: - print(e) - else: - while True: - print() - print("Download which package (l=list; x=cancel)?") - user_input = input(" Identifier> ") - if user_input.lower() == "l": - self._ds.list( - self._ds.download_dir, - header=False, - more_prompt=True, - skip_installed=True, - ) - continue - elif user_input.lower() in ("x", "q", ""): - return - elif user_input: - for id in user_input.split(): - try: - self._ds.download(id, prefix=" ") - except (OSError, ValueError) as e: - print(e) - break - - def _simple_interactive_update(self): - while True: - stale_packages = [] - stale = partial = False - for info in sorted(getattr(self._ds, "packages")(), key=str): - if self._ds.status(info) == self._ds.STALE: - stale_packages.append((info.id, info.name)) - - print() - if stale_packages: - print("Will update following packages (o=ok; x=cancel)") - for pid, pname in stale_packages: - name = textwrap.fill( - "-" * 27 + (pname), 75, subsequent_indent=27 * " " - )[27:] - print(" [ ] {} {}".format(pid.ljust(20, "."), name)) - print() - - user_input = input(" Identifier> ") - if user_input.lower() == "o": - for pid, pname in stale_packages: - try: - self._ds.download(pid, prefix=" ") - except (OSError, ValueError) as e: - print(e) - break - elif user_input.lower() in ("x", "q", ""): - return - else: - print("Nothing to update.") - return - - def _simple_interactive_help(self): - print() - print("Commands:") - print( - " d) Download a package or collection u) Update out of date packages" - ) - print(" l) List packages & collections h) Help") - print(" c) View & Modify Configuration q) Quit") - - def _show_config(self): - print() - print("Data Server:") - print(" - URL: <%s>" % self._ds.url) - print(" - %d Package Collections Available" % len(self._ds.collections())) - print(" - %d Individual Packages Available" % len(self._ds.packages())) - print() - print("Local Machine:") - print(" - Data directory: %s" % self._ds.download_dir) - - def _simple_interactive_config(self): - self._show_config() - while True: - print() - self._simple_interactive_menu( - "s) Show Config", "u) Set Server URL", "d) Set Data Dir", "m) Main Menu" - ) - user_input = input("Config> ").strip().lower() - if user_input == "s": - self._show_config() - elif user_input == "d": - new_dl_dir = input(" New Directory> ").strip() - if new_dl_dir in ("", "x", "q", "X", "Q"): - print(" Cancelled!") - elif os.path.isdir(new_dl_dir): - self._ds.download_dir = new_dl_dir - else: - print("Directory %r not found! Create it first." % new_dl_dir) - elif user_input == "u": - new_url = input(" New URL> ").strip() - if new_url in ("", "x", "q", "X", "Q"): - print(" Cancelled!") - else: - if not new_url.startswith(("http://", "https://")): - new_url = "http://" + new_url - try: - self._ds.url = new_url - except Exception as e: - print(f"Error reading <{new_url!r}>:\n {e}") - elif user_input == "m": - break - - -class DownloaderGUI: - """ - Graphical interface for downloading packages from the NLTK data - server. - """ - - # ///////////////////////////////////////////////////////////////// - # Column Configuration - # ///////////////////////////////////////////////////////////////// - - COLUMNS = [ - "", - "Identifier", - "Name", - "Size", - "Status", - "Unzipped Size", - "Copyright", - "Contact", - "License", - "Author", - "Subdir", - "Checksum", - ] - """A list of the names of columns. 
This controls the order in - which the columns will appear. If this is edited, then - ``_package_to_columns()`` may need to be edited to match.""" - - COLUMN_WEIGHTS = {"": 0, "Name": 5, "Size": 0, "Status": 0} - """A dictionary specifying how columns should be resized when the - table is resized. Columns with weight 0 will not be resized at - all; and columns with high weight will be resized more. - Default weight (for columns not explicitly listed) is 1.""" - - COLUMN_WIDTHS = { - "": 1, - "Identifier": 20, - "Name": 45, - "Size": 10, - "Unzipped Size": 10, - "Status": 12, - } - """A dictionary specifying how wide each column should be, in - characters. The default width (for columns not explicitly - listed) is specified by ``DEFAULT_COLUMN_WIDTH``.""" - - DEFAULT_COLUMN_WIDTH = 30 - """The default width for columns that are not explicitly listed - in ``COLUMN_WIDTHS``.""" - - INITIAL_COLUMNS = ["", "Identifier", "Name", "Size", "Status"] - """The set of columns that should be displayed by default.""" - - # Perform a few import-time sanity checks to make sure that the - # column configuration variables are defined consistently: - for c in COLUMN_WEIGHTS: - assert c in COLUMNS - for c in COLUMN_WIDTHS: - assert c in COLUMNS - for c in INITIAL_COLUMNS: - assert c in COLUMNS - - # ///////////////////////////////////////////////////////////////// - # Color Configuration - # ///////////////////////////////////////////////////////////////// - - _BACKDROP_COLOR = ("#000", "#ccc") - - _ROW_COLOR = { - Downloader.INSTALLED: ("#afa", "#080"), - Downloader.PARTIAL: ("#ffa", "#880"), - Downloader.STALE: ("#faa", "#800"), - Downloader.NOT_INSTALLED: ("#fff", "#888"), - } - - _MARK_COLOR = ("#000", "#ccc") - - # _FRONT_TAB_COLOR = ('#ccf', '#008') - # _BACK_TAB_COLOR = ('#88a', '#448') - _FRONT_TAB_COLOR = ("#fff", "#45c") - _BACK_TAB_COLOR = ("#aaa", "#67a") - - _PROGRESS_COLOR = ("#f00", "#aaa") - - _TAB_FONT = "helvetica -16 bold" - - # ///////////////////////////////////////////////////////////////// - # Constructor - # ///////////////////////////////////////////////////////////////// - - def __init__(self, dataserver, use_threads=True): - self._ds = dataserver - self._use_threads = use_threads - - # For the threaded downloader: - self._download_lock = threading.Lock() - self._download_msg_queue = [] - self._download_abort_queue = [] - self._downloading = False - - # For tkinter after callbacks: - self._afterid = {} - - # A message log. - self._log_messages = [] - self._log_indent = 0 - self._log("NLTK Downloader Started!") - - # Create the main window. - top = self.top = Tk() - top.geometry("+50+50") - top.title("NLTK Downloader") - top.configure(background=self._BACKDROP_COLOR[1]) - - # Set up some bindings now, in case anything goes wrong. - top.bind("", self.destroy) - top.bind("", self.destroy) - self._destroyed = False - - self._column_vars = {} - - # Initialize the GUI. - self._init_widgets() - self._init_menu() - try: - self._fill_table() - except HTTPError as e: - showerror("Error reading from server", e) - except URLError as e: - showerror("Error connecting to server", e.reason) - - self._show_info() - self._select_columns() - self._table.select(0) - - # Make sure we get notified when we're destroyed, so we can - # cancel any download in progress. 
- self._table.bind("", self._destroy) - - def _log(self, msg): - self._log_messages.append( - "{} {}{}".format(time.ctime(), " | " * self._log_indent, msg) - ) - - # ///////////////////////////////////////////////////////////////// - # Internals - # ///////////////////////////////////////////////////////////////// - - def _init_widgets(self): - # Create the top-level frame structures - f1 = Frame(self.top, relief="raised", border=2, padx=8, pady=0) - f1.pack(sid="top", expand=True, fill="both") - f1.grid_rowconfigure(2, weight=1) - f1.grid_columnconfigure(0, weight=1) - Frame(f1, height=8).grid(column=0, row=0) # spacer - tabframe = Frame(f1) - tabframe.grid(column=0, row=1, sticky="news") - tableframe = Frame(f1) - tableframe.grid(column=0, row=2, sticky="news") - buttonframe = Frame(f1) - buttonframe.grid(column=0, row=3, sticky="news") - Frame(f1, height=8).grid(column=0, row=4) # spacer - infoframe = Frame(f1) - infoframe.grid(column=0, row=5, sticky="news") - Frame(f1, height=8).grid(column=0, row=6) # spacer - progressframe = Frame( - self.top, padx=3, pady=3, background=self._BACKDROP_COLOR[1] - ) - progressframe.pack(side="bottom", fill="x") - self.top["border"] = 0 - self.top["highlightthickness"] = 0 - - # Create the tabs - self._tab_names = ["Collections", "Corpora", "Models", "All Packages"] - self._tabs = {} - for i, tab in enumerate(self._tab_names): - label = Label(tabframe, text=tab, font=self._TAB_FONT) - label.pack(side="left", padx=((i + 1) % 2) * 10) - label.bind("", self._select_tab) - self._tabs[tab.lower()] = label - - # Create the table. - column_weights = [self.COLUMN_WEIGHTS.get(column, 1) for column in self.COLUMNS] - self._table = Table( - tableframe, - self.COLUMNS, - column_weights=column_weights, - highlightthickness=0, - listbox_height=16, - reprfunc=self._table_reprfunc, - ) - self._table.columnconfig(0, foreground=self._MARK_COLOR[0]) # marked - for i, column in enumerate(self.COLUMNS): - width = self.COLUMN_WIDTHS.get(column, self.DEFAULT_COLUMN_WIDTH) - self._table.columnconfig(i, width=width) - self._table.pack(expand=True, fill="both") - self._table.focus() - self._table.bind_to_listboxes("", self._download) - self._table.bind("", self._table_mark) - self._table.bind("", self._download) - self._table.bind("", self._prev_tab) - self._table.bind("", self._next_tab) - self._table.bind("", self._mark_all) - - # Create entry boxes for URL & download_dir - infoframe.grid_columnconfigure(1, weight=1) - - info = [ - ("url", "Server Index:", self._set_url), - ("download_dir", "Download Directory:", self._set_download_dir), - ] - self._info = {} - for (i, (key, label, callback)) in enumerate(info): - Label(infoframe, text=label).grid(column=0, row=i, sticky="e") - entry = Entry( - infoframe, - font="courier", - relief="groove", - disabledforeground="#007aff", - foreground="#007aff", - ) - self._info[key] = (entry, callback) - entry.bind("", self._info_save) - entry.bind("", lambda e, key=key: self._info_edit(key)) - entry.grid(column=1, row=i, sticky="ew") - - # If the user edits url or download_dir, and then clicks outside - # the entry box, then save their results. - self.top.bind("", self._info_save) - - # Create Download & Refresh buttons. 
- self._download_button = Button( - buttonframe, text="Download", command=self._download, width=8 - ) - self._download_button.pack(side="left") - self._refresh_button = Button( - buttonframe, text="Refresh", command=self._refresh, width=8 - ) - self._refresh_button.pack(side="right") - - # Create Progress bar - self._progresslabel = Label( - progressframe, - text="", - foreground=self._BACKDROP_COLOR[0], - background=self._BACKDROP_COLOR[1], - ) - self._progressbar = Canvas( - progressframe, - width=200, - height=16, - background=self._PROGRESS_COLOR[1], - relief="sunken", - border=1, - ) - self._init_progressbar() - self._progressbar.pack(side="right") - self._progresslabel.pack(side="left") - - def _init_menu(self): - menubar = Menu(self.top) - - filemenu = Menu(menubar, tearoff=0) - filemenu.add_command( - label="Download", underline=0, command=self._download, accelerator="Return" - ) - filemenu.add_separator() - filemenu.add_command( - label="Change Server Index", - underline=7, - command=lambda: self._info_edit("url"), - ) - filemenu.add_command( - label="Change Download Directory", - underline=0, - command=lambda: self._info_edit("download_dir"), - ) - filemenu.add_separator() - filemenu.add_command(label="Show Log", underline=5, command=self._show_log) - filemenu.add_separator() - filemenu.add_command( - label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x" - ) - menubar.add_cascade(label="File", underline=0, menu=filemenu) - - # Create a menu to control which columns of the table are - # shown. n.b.: we never hide the first two columns (mark and - # identifier). - viewmenu = Menu(menubar, tearoff=0) - for column in self._table.column_names[2:]: - var = IntVar(self.top) - assert column not in self._column_vars - self._column_vars[column] = var - if column in self.INITIAL_COLUMNS: - var.set(1) - viewmenu.add_checkbutton( - label=column, underline=0, variable=var, command=self._select_columns - ) - menubar.add_cascade(label="View", underline=0, menu=viewmenu) - - # Create a sort menu - # [xx] this should be selectbuttons; and it should include - # reversed sorts as options. - sortmenu = Menu(menubar, tearoff=0) - for column in self._table.column_names[1:]: - sortmenu.add_command( - label="Sort by %s" % column, - command=(lambda c=column: self._table.sort_by(c, "ascending")), - ) - sortmenu.add_separator() - # sortmenu.add_command(label='Descending Sort:') - for column in self._table.column_names[1:]: - sortmenu.add_command( - label="Reverse sort by %s" % column, - command=(lambda c=column: self._table.sort_by(c, "descending")), - ) - menubar.add_cascade(label="Sort", underline=0, menu=sortmenu) - - helpmenu = Menu(menubar, tearoff=0) - helpmenu.add_command(label="About", underline=0, command=self.about) - helpmenu.add_command( - label="Instructions", underline=0, command=self.help, accelerator="F1" - ) - menubar.add_cascade(label="Help", underline=0, menu=helpmenu) - self.top.bind("", self.help) - - self.top.config(menu=menubar) - - def _select_columns(self): - for (column, var) in self._column_vars.items(): - if var.get(): - self._table.show_column(column) - else: - self._table.hide_column(column) - - def _refresh(self): - self._ds.clear_status_cache() - try: - self._fill_table() - except HTTPError as e: - showerror("Error reading from server", e) - except URLError as e: - showerror("Error connecting to server", e.reason) - self._table.select(0) - - def _info_edit(self, info_key): - self._info_save() # just in case. 
- (entry, callback) = self._info[info_key] - entry["state"] = "normal" - entry["relief"] = "sunken" - entry.focus() - - def _info_save(self, e=None): - focus = self._table - for entry, callback in self._info.values(): - if entry["state"] == "disabled": - continue - if e is not None and e.widget is entry and e.keysym != "Return": - focus = entry - else: - entry["state"] = "disabled" - entry["relief"] = "groove" - callback(entry.get()) - focus.focus() - - def _table_reprfunc(self, row, col, val): - if self._table.column_names[col].endswith("Size"): - if isinstance(val, str): - return " %s" % val - elif val < 1024**2: - return " %.1f KB" % (val / 1024.0**1) - elif val < 1024**3: - return " %.1f MB" % (val / 1024.0**2) - else: - return " %.1f GB" % (val / 1024.0**3) - - if col in (0, ""): - return str(val) - else: - return " %s" % val - - def _set_url(self, url): - if url == self._ds.url: - return - try: - self._ds.url = url - self._fill_table() - except OSError as e: - showerror("Error Setting Server Index", str(e)) - self._show_info() - - def _set_download_dir(self, download_dir): - if self._ds.download_dir == download_dir: - return - # check if the dir exists, and if not, ask if we should create it? - - # Clear our status cache, & re-check what's installed - self._ds.download_dir = download_dir - try: - self._fill_table() - except HTTPError as e: - showerror("Error reading from server", e) - except URLError as e: - showerror("Error connecting to server", e.reason) - self._show_info() - - def _show_info(self): - print("showing info", self._ds.url) - for entry, cb in self._info.values(): - entry["state"] = "normal" - entry.delete(0, "end") - self._info["url"][0].insert(0, self._ds.url) - self._info["download_dir"][0].insert(0, self._ds.download_dir) - for entry, cb in self._info.values(): - entry["state"] = "disabled" - - def _prev_tab(self, *e): - for i, tab in enumerate(self._tab_names): - if tab.lower() == self._tab and i > 0: - self._tab = self._tab_names[i - 1].lower() - try: - return self._fill_table() - except HTTPError as e: - showerror("Error reading from server", e) - except URLError as e: - showerror("Error connecting to server", e.reason) - - def _next_tab(self, *e): - for i, tab in enumerate(self._tab_names): - if tab.lower() == self._tab and i < (len(self._tabs) - 1): - self._tab = self._tab_names[i + 1].lower() - try: - return self._fill_table() - except HTTPError as e: - showerror("Error reading from server", e) - except URLError as e: - showerror("Error connecting to server", e.reason) - - def _select_tab(self, event): - self._tab = event.widget["text"].lower() - try: - self._fill_table() - except HTTPError as e: - showerror("Error reading from server", e) - except URLError as e: - showerror("Error connecting to server", e.reason) - - _tab = "collections" - # _tab = 'corpora' - _rows = None - - def _fill_table(self): - selected_row = self._table.selected_row() - self._table.clear() - if self._tab == "all packages": - items = self._ds.packages() - elif self._tab == "corpora": - items = self._ds.corpora() - elif self._tab == "models": - items = self._ds.models() - elif self._tab == "collections": - items = self._ds.collections() - else: - assert 0, "bad tab value %r" % self._tab - rows = [self._package_to_columns(item) for item in items] - self._table.extend(rows) - - # Highlight the active tab. 
- for tab, label in self._tabs.items(): - if tab == self._tab: - label.configure( - foreground=self._FRONT_TAB_COLOR[0], - background=self._FRONT_TAB_COLOR[1], - ) - else: - label.configure( - foreground=self._BACK_TAB_COLOR[0], - background=self._BACK_TAB_COLOR[1], - ) - - self._table.sort_by("Identifier", order="ascending") - self._color_table() - self._table.select(selected_row) - - # This is a hack, because the scrollbar isn't updating its - # position right -- I'm not sure what the underlying cause is - # though. (This is on OS X w/ python 2.5) The length of - # delay that's necessary seems to depend on how fast the - # comptuer is. :-/ - self.top.after(150, self._table._scrollbar.set, *self._table._mlb.yview()) - self.top.after(300, self._table._scrollbar.set, *self._table._mlb.yview()) - - def _update_table_status(self): - for row_num in range(len(self._table)): - status = self._ds.status(self._table[row_num, "Identifier"]) - self._table[row_num, "Status"] = status - self._color_table() - - def _download(self, *e): - # If we're using threads, then delegate to the threaded - # downloader instead. - if self._use_threads: - return self._download_threaded(*e) - - marked = [ - self._table[row, "Identifier"] - for row in range(len(self._table)) - if self._table[row, 0] != "" - ] - selection = self._table.selected_row() - if not marked and selection is not None: - marked = [self._table[selection, "Identifier"]] - - download_iter = self._ds.incr_download(marked, self._ds.download_dir) - self._log_indent = 0 - self._download_cb(download_iter, marked) - - _DL_DELAY = 10 - - def _download_cb(self, download_iter, ids): - try: - msg = next(download_iter) - except StopIteration: - # self._fill_table(sort=False) - self._update_table_status() - afterid = self.top.after(10, self._show_progress, 0) - self._afterid["_download_cb"] = afterid - return - - def show(s): - self._progresslabel["text"] = s - self._log(s) - - if isinstance(msg, ProgressMessage): - self._show_progress(msg.progress) - elif isinstance(msg, ErrorMessage): - show(msg.message) - if msg.package is not None: - self._select(msg.package.id) - self._show_progress(None) - return # halt progress. - elif isinstance(msg, StartCollectionMessage): - show("Downloading collection %s" % msg.collection.id) - self._log_indent += 1 - elif isinstance(msg, StartPackageMessage): - show("Downloading package %s" % msg.package.id) - elif isinstance(msg, UpToDateMessage): - show("Package %s is up-to-date!" % msg.package.id) - # elif isinstance(msg, StaleMessage): - # show('Package %s is out-of-date or corrupt' % msg.package.id) - elif isinstance(msg, FinishDownloadMessage): - show("Finished downloading %r." % msg.package.id) - elif isinstance(msg, StartUnzipMessage): - show("Unzipping %s" % msg.package.filename) - elif isinstance(msg, FinishCollectionMessage): - self._log_indent -= 1 - show("Finished downloading collection %r." % msg.collection.id) - self._clear_mark(msg.collection.id) - elif isinstance(msg, FinishPackageMessage): - self._clear_mark(msg.package.id) - afterid = self.top.after(self._DL_DELAY, self._download_cb, download_iter, ids) - self._afterid["_download_cb"] = afterid - - def _select(self, id): - for row in range(len(self._table)): - if self._table[row, "Identifier"] == id: - self._table.select(row) - return - - def _color_table(self): - # Color rows according to status. 
- for row in range(len(self._table)): - bg, sbg = self._ROW_COLOR[self._table[row, "Status"]] - fg, sfg = ("black", "white") - self._table.rowconfig( - row, - foreground=fg, - selectforeground=sfg, - background=bg, - selectbackground=sbg, - ) - # Color the marked column - self._table.itemconfigure( - row, 0, foreground=self._MARK_COLOR[0], background=self._MARK_COLOR[1] - ) - - def _clear_mark(self, id): - for row in range(len(self._table)): - if self._table[row, "Identifier"] == id: - self._table[row, 0] = "" - - def _mark_all(self, *e): - for row in range(len(self._table)): - self._table[row, 0] = "X" - - def _table_mark(self, *e): - selection = self._table.selected_row() - if selection >= 0: - if self._table[selection][0] != "": - self._table[selection, 0] = "" - else: - self._table[selection, 0] = "X" - self._table.select(delta=1) - - def _show_log(self): - text = "\n".join(self._log_messages) - ShowText(self.top, "NLTK Downloader Log", text) - - def _package_to_columns(self, pkg): - """ - Given a package, return a list of values describing that - package, one for each column in ``self.COLUMNS``. - """ - row = [] - for column_index, column_name in enumerate(self.COLUMNS): - if column_index == 0: # Mark: - row.append("") - elif column_name == "Identifier": - row.append(pkg.id) - elif column_name == "Status": - row.append(self._ds.status(pkg)) - else: - attr = column_name.lower().replace(" ", "_") - row.append(getattr(pkg, attr, "n/a")) - return row - - # ///////////////////////////////////////////////////////////////// - # External Interface - # ///////////////////////////////////////////////////////////////// - - def destroy(self, *e): - if self._destroyed: - return - self.top.destroy() - self._destroyed = True - - def _destroy(self, *e): - if self.top is not None: - for afterid in self._afterid.values(): - self.top.after_cancel(afterid) - - # Abort any download in progress. - if self._downloading and self._use_threads: - self._abort_download() - - # Make sure the garbage collector destroys these now; - # otherwise, they may get destroyed when we're not in the main - # thread, which would make Tkinter unhappy. - self._column_vars.clear() - - def mainloop(self, *args, **kwargs): - self.top.mainloop(*args, **kwargs) - - # ///////////////////////////////////////////////////////////////// - # HELP - # ///////////////////////////////////////////////////////////////// - - HELP = textwrap.dedent( - """\ - This tool can be used to download a variety of corpora and models - that can be used with NLTK. Each corpus or model is distributed - in a single zip file, known as a \"package file.\" You can - download packages individually, or you can download pre-defined - collections of packages. - - When you download a package, it will be saved to the \"download - directory.\" A default download directory is chosen when you run - - the downloader; but you may also select a different download - directory. On Windows, the default download directory is - - - \"package.\" - - The NLTK downloader can be used to download a variety of corpora, - models, and other data packages. - - Keyboard shortcuts:: - [return]\t Download - [up]\t Select previous package - [down]\t Select next package - [left]\t Select previous tab - [right]\t Select next tab - """ - ) - - def help(self, *e): - # The default font's not very legible; try using 'fixed' instead. 
- try: - ShowText( - self.top, - "Help: NLTK Downloader", - self.HELP.strip(), - width=75, - font="fixed", - ) - except: - ShowText(self.top, "Help: NLTK Downloader", self.HELP.strip(), width=75) - - def about(self, *e): - ABOUT = "NLTK Downloader\n" + "Written by Edward Loper" - TITLE = "About: NLTK Downloader" - try: - from tkinter.messagebox import Message - - Message(message=ABOUT, title=TITLE).show() - except ImportError: - ShowText(self.top, TITLE, ABOUT) - - # ///////////////////////////////////////////////////////////////// - # Progress Bar - # ///////////////////////////////////////////////////////////////// - - _gradient_width = 5 - - def _init_progressbar(self): - c = self._progressbar - width, height = int(c["width"]), int(c["height"]) - for i in range(0, (int(c["width"]) * 2) // self._gradient_width): - c.create_line( - i * self._gradient_width + 20, - -20, - i * self._gradient_width - height - 20, - height + 20, - width=self._gradient_width, - fill="#%02x0000" % (80 + abs(i % 6 - 3) * 12), - ) - c.addtag_all("gradient") - c.itemconfig("gradient", state="hidden") - - # This is used to display progress - c.addtag_withtag( - "redbox", c.create_rectangle(0, 0, 0, 0, fill=self._PROGRESS_COLOR[0]) - ) - - def _show_progress(self, percent): - c = self._progressbar - if percent is None: - c.coords("redbox", 0, 0, 0, 0) - c.itemconfig("gradient", state="hidden") - else: - width, height = int(c["width"]), int(c["height"]) - x = percent * int(width) // 100 + 1 - c.coords("redbox", 0, 0, x, height + 1) - - def _progress_alive(self): - c = self._progressbar - if not self._downloading: - c.itemconfig("gradient", state="hidden") - else: - c.itemconfig("gradient", state="normal") - x1, y1, x2, y2 = c.bbox("gradient") - if x1 <= -100: - c.move("gradient", (self._gradient_width * 6) - 4, 0) - else: - c.move("gradient", -4, 0) - afterid = self.top.after(200, self._progress_alive) - self._afterid["_progress_alive"] = afterid - - # ///////////////////////////////////////////////////////////////// - # Threaded downloader - # ///////////////////////////////////////////////////////////////// - - def _download_threaded(self, *e): - # If the user tries to start a new download while we're already - # downloading something, then abort the current download instead. - if self._downloading: - self._abort_download() - return - - # Change the 'download' button to an 'abort' button. - self._download_button["text"] = "Cancel" - - marked = [ - self._table[row, "Identifier"] - for row in range(len(self._table)) - if self._table[row, 0] != "" - ] - selection = self._table.selected_row() - if not marked and selection is not None: - marked = [self._table[selection, "Identifier"]] - - # Create a new data server object for the download operation, - # just in case the user modifies our data server during the - # download (e.g., clicking 'refresh' or editing the index url). - ds = Downloader(self._ds.url, self._ds.download_dir) - - # Start downloading in a separate thread. - assert self._download_msg_queue == [] - assert self._download_abort_queue == [] - self._DownloadThread( - ds, - marked, - self._download_lock, - self._download_msg_queue, - self._download_abort_queue, - ).start() - - # Monitor the download message queue & display its progress. - self._log_indent = 0 - self._downloading = True - self._monitor_message_queue() - - # Display an indication that we're still alive and well by - # cycling the progress bar. 
- self._progress_alive() - - def _abort_download(self): - if self._downloading: - self._download_lock.acquire() - self._download_abort_queue.append("abort") - self._download_lock.release() - - class _DownloadThread(threading.Thread): - def __init__(self, data_server, items, lock, message_queue, abort): - self.data_server = data_server - self.items = items - self.lock = lock - self.message_queue = message_queue - self.abort = abort - threading.Thread.__init__(self) - - def run(self): - for msg in self.data_server.incr_download(self.items): - self.lock.acquire() - self.message_queue.append(msg) - # Check if we've been told to kill ourselves: - if self.abort: - self.message_queue.append("aborted") - self.lock.release() - return - self.lock.release() - self.lock.acquire() - self.message_queue.append("finished") - self.lock.release() - - _MONITOR_QUEUE_DELAY = 100 - - def _monitor_message_queue(self): - def show(s): - self._progresslabel["text"] = s - self._log(s) - - # Try to acquire the lock; if it's busy, then just try again later. - if not self._download_lock.acquire(): - return - for msg in self._download_msg_queue: - - # Done downloading? - if msg == "finished" or msg == "aborted": - # self._fill_table(sort=False) - self._update_table_status() - self._downloading = False - self._download_button["text"] = "Download" - del self._download_msg_queue[:] - del self._download_abort_queue[:] - self._download_lock.release() - if msg == "aborted": - show("Download aborted!") - self._show_progress(None) - else: - afterid = self.top.after(100, self._show_progress, None) - self._afterid["_monitor_message_queue"] = afterid - return - - # All other messages - elif isinstance(msg, ProgressMessage): - self._show_progress(msg.progress) - elif isinstance(msg, ErrorMessage): - show(msg.message) - if msg.package is not None: - self._select(msg.package.id) - self._show_progress(None) - self._downloading = False - return # halt progress. - elif isinstance(msg, StartCollectionMessage): - show("Downloading collection %r" % msg.collection.id) - self._log_indent += 1 - elif isinstance(msg, StartPackageMessage): - self._ds.clear_status_cache(msg.package.id) - show("Downloading package %r" % msg.package.id) - elif isinstance(msg, UpToDateMessage): - show("Package %s is up-to-date!" % msg.package.id) - # elif isinstance(msg, StaleMessage): - # show('Package %s is out-of-date or corrupt; updating it' % - # msg.package.id) - elif isinstance(msg, FinishDownloadMessage): - show("Finished downloading %r." % msg.package.id) - elif isinstance(msg, StartUnzipMessage): - show("Unzipping %s" % msg.package.filename) - elif isinstance(msg, FinishUnzipMessage): - show("Finished installing %s" % msg.package.id) - elif isinstance(msg, FinishCollectionMessage): - self._log_indent -= 1 - show("Finished downloading collection %r." % msg.collection.id) - self._clear_mark(msg.collection.id) - elif isinstance(msg, FinishPackageMessage): - self._update_table_status() - self._clear_mark(msg.package.id) - - # Let the user know when we're aborting a download (but - # waiting for a good point to abort it, so we don't end up - # with a partially unzipped package or anything like that). - if self._download_abort_queue: - self._progresslabel["text"] = "Aborting download..." - - # Clear the message queue and then release the lock - del self._download_msg_queue[:] - self._download_lock.release() - - # Check the queue again after MONITOR_QUEUE_DELAY msec. 
- afterid = self.top.after(self._MONITOR_QUEUE_DELAY, self._monitor_message_queue) - self._afterid["_monitor_message_queue"] = afterid - - -###################################################################### -# Helper Functions -###################################################################### -# [xx] It may make sense to move these to nltk.internals. - - -def md5_hexdigest(file): - """ - Calculate and return the MD5 checksum for a given file. - ``file`` may either be a filename or an open stream. - """ - if isinstance(file, str): - with open(file, "rb") as infile: - return _md5_hexdigest(infile) - return _md5_hexdigest(file) - - -def _md5_hexdigest(fp): - md5_digest = md5() - while True: - block = fp.read(1024 * 16) # 16k blocks - if not block: - break - md5_digest.update(block) - return md5_digest.hexdigest() - - -# change this to periodically yield progress messages? -# [xx] get rid of topdir parameter -- we should be checking -# this when we build the index, anyway. -def unzip(filename, root, verbose=True): - """ - Extract the contents of the zip file ``filename`` into the - directory ``root``. - """ - for message in _unzip_iter(filename, root, verbose): - if isinstance(message, ErrorMessage): - raise Exception(message) - - -def _unzip_iter(filename, root, verbose=True): - if verbose: - sys.stdout.write("Unzipping %s" % os.path.split(filename)[1]) - sys.stdout.flush() - - try: - zf = zipfile.ZipFile(filename) - except zipfile.error as e: - yield ErrorMessage(filename, "Error with downloaded zip file") - return - except Exception as e: - yield ErrorMessage(filename, e) - return - - zf.extractall(root) - - if verbose: - print() - - -###################################################################### -# Index Builder -###################################################################### -# This may move to a different file sometime. - - -def build_index(root, base_url): - """ - Create a new data.xml index file, by combining the xml description - files for various packages and collections. ``root`` should be the - path to a directory containing the package xml and zip files; and - the collection xml files. The ``root`` directory is expected to - have the following subdirectories:: - - root/ - packages/ .................. subdirectory for packages - corpora/ ................. zip & xml files for corpora - grammars/ ................ zip & xml files for grammars - taggers/ ................. zip & xml files for taggers - tokenizers/ .............. zip & xml files for tokenizers - etc. - collections/ ............... xml files for collections - - For each package, there should be two files: ``package.zip`` - (where *package* is the package name) - which contains the package itself as a compressed zip file; and - ``package.xml``, which is an xml description of the package. The - zipfile ``package.zip`` should expand to a single subdirectory - named ``package/``. The base filename ``package`` must match - the identifier given in the package's xml file. - - For each collection, there should be a single file ``collection.zip`` - describing the collection, where *collection* is the name of the collection. - - All identifiers (for both packages and collections) must be unique. - """ - # Find all packages. 
- packages = [] - for pkg_xml, zf, subdir in _find_packages(os.path.join(root, "packages")): - zipstat = os.stat(zf.filename) - url = f"{base_url}/{subdir}/{os.path.split(zf.filename)[1]}" - unzipped_size = sum(zf_info.file_size for zf_info in zf.infolist()) - - # Fill in several fields of the package xml with calculated values. - pkg_xml.set("unzipped_size", "%s" % unzipped_size) - pkg_xml.set("size", "%s" % zipstat.st_size) - pkg_xml.set("checksum", "%s" % md5_hexdigest(zf.filename)) - pkg_xml.set("subdir", subdir) - # pkg_xml.set('svn_revision', _svn_revision(zf.filename)) - if not pkg_xml.get("url"): - pkg_xml.set("url", url) - - # Record the package. - packages.append(pkg_xml) - - # Find all collections - collections = list(_find_collections(os.path.join(root, "collections"))) - - # Check that all UIDs are unique - uids = set() - for item in packages + collections: - if item.get("id") in uids: - raise ValueError("Duplicate UID: %s" % item.get("id")) - uids.add(item.get("id")) - - # Put it all together - top_elt = ElementTree.Element("nltk_data") - top_elt.append(ElementTree.Element("packages")) - top_elt[0].extend(sorted(packages, key=lambda package: package.get("id"))) - top_elt.append(ElementTree.Element("collections")) - top_elt[1].extend(sorted(collections, key=lambda collection: collection.get("id"))) - - _indent_xml(top_elt) - return top_elt - - -def _indent_xml(xml, prefix=""): - """ - Helper for ``build_index()``: Given an XML ``ElementTree``, modify it - (and its descendents) ``text`` and ``tail`` attributes to generate - an indented tree, where each nested element is indented by 2 - spaces with respect to its parent. - """ - if len(xml) > 0: - xml.text = (xml.text or "").strip() + "\n" + prefix + " " - for child in xml: - _indent_xml(child, prefix + " ") - for child in xml[:-1]: - child.tail = (child.tail or "").strip() + "\n" + prefix + " " - xml[-1].tail = (xml[-1].tail or "").strip() + "\n" + prefix - - -def _check_package(pkg_xml, zipfilename, zf): - """ - Helper for ``build_index()``: Perform some checks to make sure that - the given package is consistent. - """ - # The filename must patch the id given in the XML file. - uid = os.path.splitext(os.path.split(zipfilename)[1])[0] - if pkg_xml.get("id") != uid: - raise ValueError( - "package identifier mismatch ({} vs {})".format(pkg_xml.get("id"), uid) - ) - - # Zip file must expand to a subdir whose name matches uid. - if sum((name != uid and not name.startswith(uid + "/")) for name in zf.namelist()): - raise ValueError( - "Zipfile %s.zip does not expand to a single " - "subdirectory %s/" % (uid, uid) - ) - - -# update for git? -def _svn_revision(filename): - """ - Helper for ``build_index()``: Calculate the subversion revision - number for a given file (by using ``subprocess`` to run ``svn``). - """ - p = subprocess.Popen( - ["svn", "status", "-v", filename], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - (stdout, stderr) = p.communicate() - if p.returncode != 0 or stderr or not stdout: - raise ValueError( - "Error determining svn_revision for %s: %s" - % (os.path.split(filename)[1], textwrap.fill(stderr)) - ) - return stdout.split()[2] - - -def _find_collections(root): - """ - Helper for ``build_index()``: Yield a list of ElementTree.Element - objects, each holding the xml for a single package collection. 
- """ - for dirname, _subdirs, files in os.walk(root): - for filename in files: - if filename.endswith(".xml"): - xmlfile = os.path.join(dirname, filename) - yield ElementTree.parse(xmlfile).getroot() - - -def _find_packages(root): - """ - Helper for ``build_index()``: Yield a list of tuples - ``(pkg_xml, zf, subdir)``, where: - - ``pkg_xml`` is an ``ElementTree.Element`` holding the xml for a - package - - ``zf`` is a ``zipfile.ZipFile`` for the package's contents. - - ``subdir`` is the subdirectory (relative to ``root``) where - the package was found (e.g. 'corpora' or 'grammars'). - """ - from nltk.corpus.reader.util import _path_from - - # Find all packages. - packages = [] - for dirname, subdirs, files in os.walk(root): - relpath = "/".join(_path_from(root, dirname)) - for filename in files: - if filename.endswith(".xml"): - xmlfilename = os.path.join(dirname, filename) - zipfilename = xmlfilename[:-4] + ".zip" - try: - zf = zipfile.ZipFile(zipfilename) - except Exception as e: - raise ValueError(f"Error reading file {zipfilename!r}!\n{e}") from e - try: - pkg_xml = ElementTree.parse(xmlfilename).getroot() - except Exception as e: - raise ValueError(f"Error reading file {xmlfilename!r}!\n{e}") from e - - # Check that the UID matches the filename - uid = os.path.split(xmlfilename[:-4])[1] - if pkg_xml.get("id") != uid: - raise ValueError( - "package identifier mismatch (%s " - "vs %s)" % (pkg_xml.get("id"), uid) - ) - - # Check that the zipfile expands to a subdir whose - # name matches the uid. - if sum( - (name != uid and not name.startswith(uid + "/")) - for name in zf.namelist() - ): - raise ValueError( - "Zipfile %s.zip does not expand to a " - "single subdirectory %s/" % (uid, uid) - ) - - yield pkg_xml, zf, relpath - - elif filename.endswith(".zip"): - # Warn user in case a .xml does not exist for a .zip - resourcename = os.path.splitext(filename)[0] - xmlfilename = os.path.join(dirname, resourcename + ".xml") - if not os.path.exists(xmlfilename): - warnings.warn( - f"{filename} exists, but {resourcename + '.xml'} cannot be found! 
" - f"This could mean that {resourcename} can not be downloaded.", - stacklevel=2, - ) - - # Don't recurse into svn subdirectories: - try: - subdirs.remove(".svn") - except ValueError: - pass - - -###################################################################### -# Main: -###################################################################### - -# There should be a command-line interface - -# Aliases -_downloader = Downloader() -download = _downloader.download - - -def download_shell(): - DownloaderShell(_downloader).run() - - -def download_gui(): - DownloaderGUI(_downloader).mainloop() - - -def update(): - _downloader.update() - - -if __name__ == "__main__": - from optparse import OptionParser - - parser = OptionParser() - parser.add_option( - "-d", - "--dir", - dest="dir", - help="download package to directory DIR", - metavar="DIR", - ) - parser.add_option( - "-q", - "--quiet", - dest="quiet", - action="store_true", - default=False, - help="work quietly", - ) - parser.add_option( - "-f", - "--force", - dest="force", - action="store_true", - default=False, - help="download even if already installed", - ) - parser.add_option( - "-e", - "--exit-on-error", - dest="halt_on_error", - action="store_true", - default=False, - help="exit if an error occurs", - ) - parser.add_option( - "-u", - "--url", - dest="server_index_url", - default=os.environ.get("NLTK_DOWNLOAD_URL"), - help="download server index url", - ) - - (options, args) = parser.parse_args() - - downloader = Downloader(server_index_url=options.server_index_url) - - if args: - for pkg_id in args: - rv = downloader.download( - info_or_id=pkg_id, - download_dir=options.dir, - quiet=options.quiet, - force=options.force, - halt_on_error=options.halt_on_error, - ) - if rv == False and options.halt_on_error: - break - else: - downloader.download( - download_dir=options.dir, - quiet=options.quiet, - force=options.force, - halt_on_error=options.halt_on_error, - ) diff --git a/pipeline/nltk/draw/__init__.py b/pipeline/nltk/draw/__init__.py deleted file mode 100644 index 4e3d4308b5e4be658f94a175631eadc62c84008b..0000000000000000000000000000000000000000 --- a/pipeline/nltk/draw/__init__.py +++ /dev/null @@ -1,27 +0,0 @@ -# Natural Language Toolkit: graphical representations package -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird -# URL: -# For license information, see LICENSE.TXT - -# Import Tkinter-based modules if Tkinter is installed -try: - import tkinter -except ImportError: - import warnings - - warnings.warn("nltk.draw package not loaded (please install Tkinter library).") -else: - from nltk.draw.cfg import ProductionList, CFGEditor, CFGDemo - from nltk.draw.tree import ( - TreeSegmentWidget, - tree_to_treesegment, - TreeWidget, - TreeView, - draw_trees, - ) - from nltk.draw.table import Table - -from nltk.draw.dispersion import dispersion_plot diff --git a/pipeline/nltk/draw/__pycache__/__init__.cpython-39.pyc b/pipeline/nltk/draw/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index df1a4e05eb7bcfb1c9a5cc2e09463062124ab904..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/draw/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/draw/__pycache__/cfg.cpython-39.pyc b/pipeline/nltk/draw/__pycache__/cfg.cpython-39.pyc deleted file mode 100644 index 5f3a5c582aa36fe9b2705808b2278185668a56c2..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/draw/__pycache__/cfg.cpython-39.pyc and /dev/null differ diff --git 
a/pipeline/nltk/draw/__pycache__/dispersion.cpython-39.pyc b/pipeline/nltk/draw/__pycache__/dispersion.cpython-39.pyc deleted file mode 100644 index 21ca3d201ce34c6e35e9aa288d85f1ba9cb68189..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/draw/__pycache__/dispersion.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/draw/__pycache__/table.cpython-39.pyc b/pipeline/nltk/draw/__pycache__/table.cpython-39.pyc deleted file mode 100644 index c5c7984daa16cd5d39e7cfa0088028b441aec30b..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/draw/__pycache__/table.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/draw/__pycache__/tree.cpython-39.pyc b/pipeline/nltk/draw/__pycache__/tree.cpython-39.pyc deleted file mode 100644 index 8fb6a39bedb82d834774ec052887e5a5f92c1264..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/draw/__pycache__/tree.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/draw/__pycache__/util.cpython-39.pyc b/pipeline/nltk/draw/__pycache__/util.cpython-39.pyc deleted file mode 100644 index eb50ebd043d5160cc3ccf65ba2e374797e0a6473..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/draw/__pycache__/util.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/draw/cfg.py b/pipeline/nltk/draw/cfg.py deleted file mode 100644 index 650162abf095d439cf7ca2ba3f0f36c81f0ed041..0000000000000000000000000000000000000000 --- a/pipeline/nltk/draw/cfg.py +++ /dev/null @@ -1,859 +0,0 @@ -# Natural Language Toolkit: CFG visualization -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# URL: -# For license information, see LICENSE.TXT - -""" -Visualization tools for CFGs. -""" - -# Idea for a nice demo: -# - 3 panes: grammar, treelet, working area -# - grammar is a list of productions -# - when you select a production, the treelet that it licenses appears -# in the treelet area -# - the working area has the text on the bottom, and S at top. When -# you select a production, it shows (ghosted) the locations where -# that production's treelet could be attached to either the text -# or the tree rooted at S. -# - the user can drag the treelet onto one of those (or click on them?) -# - the user can delete pieces of the tree from the working area -# (right click?) -# - connecting top to bottom? drag one NP onto another? -# -# +-------------------------------------------------------------+ -# | S -> NP VP | S | -# |[NP -> Det N ]| / \ | -# | ... | NP VP | -# | N -> 'dog' | | -# | N -> 'cat' | | -# | ... 
| | -# +--------------+ | -# | NP | Det N | -# | / \ | | | | -# | Det N | the cat saw the dog | -# | | | -# +--------------+----------------------------------------------+ -# -# Operations: -# - connect a new treelet -- drag or click shadow -# - delete a treelet -- right click -# - if only connected to top, delete everything below -# - if only connected to bottom, delete everything above -# - connect top & bottom -- drag a leaf to a root or a root to a leaf -# - disconnect top & bottom -- right click -# - if connected to top & bottom, then disconnect - -import re -from tkinter import ( - Button, - Canvas, - Entry, - Frame, - IntVar, - Label, - Scrollbar, - Text, - Tk, - Toplevel, -) - -from nltk.draw.tree import TreeSegmentWidget, tree_to_treesegment -from nltk.draw.util import ( - CanvasFrame, - ColorizedList, - ShowText, - SymbolWidget, - TextWidget, -) -from nltk.grammar import CFG, Nonterminal, _read_cfg_production, nonterminals -from nltk.tree import Tree - -###################################################################### -# Production List -###################################################################### - - -class ProductionList(ColorizedList): - ARROW = SymbolWidget.SYMBOLS["rightarrow"] - - def _init_colortags(self, textwidget, options): - textwidget.tag_config("terminal", foreground="#006000") - textwidget.tag_config("arrow", font="symbol", underline="0") - textwidget.tag_config( - "nonterminal", foreground="blue", font=("helvetica", -12, "bold") - ) - - def _item_repr(self, item): - contents = [] - contents.append(("%s\t" % item.lhs(), "nonterminal")) - contents.append((self.ARROW, "arrow")) - for elt in item.rhs(): - if isinstance(elt, Nonterminal): - contents.append((" %s" % elt.symbol(), "nonterminal")) - else: - contents.append((" %r" % elt, "terminal")) - return contents - - -###################################################################### -# CFG Editor -###################################################################### - -_CFGEditor_HELP = """ - -The CFG Editor can be used to create or modify context free grammars. -A context free grammar consists of a start symbol and a list of -productions. The start symbol is specified by the text entry field in -the upper right hand corner of the editor; and the list of productions -are specified in the main text editing box. - -Every non-blank line specifies a single production. Each production -has the form "LHS -> RHS," where LHS is a single nonterminal, and RHS -is a list of nonterminals and terminals. - -Nonterminals must be a single word, such as S or NP or NP_subj. -Currently, nonterminals must consists of alphanumeric characters and -underscores (_). Nonterminals are colored blue. If you place the -mouse over any nonterminal, then all occurrences of that nonterminal -will be highlighted. - -Terminals must be surrounded by single quotes (') or double -quotes(\"). For example, "dog" and "New York" are terminals. -Currently, the string within the quotes must consist of alphanumeric -characters, underscores, and spaces. - -To enter a new production, go to a blank line, and type a nonterminal, -followed by an arrow (->), followed by a sequence of terminals and -nonterminals. Note that "->" (dash + greater-than) is automatically -converted to an arrow symbol. When you move your cursor to a -different line, your production will automatically be colorized. If -there are any errors, they will be highlighted in red. - -Note that the order of the productions is significant for some -algorithms. 
To re-order the productions, use cut and paste to move -them. - -Use the buttons at the bottom of the window when you are done editing -the CFG: - - Ok: apply the new CFG, and exit the editor. - - Apply: apply the new CFG, and do not exit the editor. - - Reset: revert to the original CFG, and do not exit the editor. - - Cancel: revert to the original CFG, and exit the editor. - -""" - - -class CFGEditor: - """ - A dialog window for creating and editing context free grammars. - ``CFGEditor`` imposes the following restrictions: - - - All nonterminals must be strings consisting of word - characters. - - All terminals must be strings consisting of word characters - and space characters. - """ - - # Regular expressions used by _analyze_line. Precompile them, so - # we can process the text faster. - ARROW = SymbolWidget.SYMBOLS["rightarrow"] - _LHS_RE = re.compile(r"(^\s*\w+\s*)(->|(" + ARROW + "))") - _ARROW_RE = re.compile(r"\s*(->|(" + ARROW + r"))\s*") - _PRODUCTION_RE = re.compile( - r"(^\s*\w+\s*)" - + "(->|(" # LHS - + ARROW - + r"))\s*" - + r"((\w+|'[\w ]*'|\"[\w ]*\"|\|)\s*)*$" # arrow - ) # RHS - _TOKEN_RE = re.compile("\\w+|->|'[\\w ]+'|\"[\\w ]+\"|(" + ARROW + ")") - _BOLD = ("helvetica", -12, "bold") - - def __init__(self, parent, cfg=None, set_cfg_callback=None): - self._parent = parent - if cfg is not None: - self._cfg = cfg - else: - self._cfg = CFG(Nonterminal("S"), []) - self._set_cfg_callback = set_cfg_callback - - self._highlight_matching_nonterminals = 1 - - # Create the top-level window. - self._top = Toplevel(parent) - self._init_bindings() - - self._init_startframe() - self._startframe.pack(side="top", fill="x", expand=0) - self._init_prodframe() - self._prodframe.pack(side="top", fill="both", expand=1) - self._init_buttons() - self._buttonframe.pack(side="bottom", fill="x", expand=0) - - self._textwidget.focus() - - def _init_startframe(self): - frame = self._startframe = Frame(self._top) - self._start = Entry(frame) - self._start.pack(side="right") - Label(frame, text="Start Symbol:").pack(side="right") - Label(frame, text="Productions:").pack(side="left") - self._start.insert(0, self._cfg.start().symbol()) - - def _init_buttons(self): - frame = self._buttonframe = Frame(self._top) - Button(frame, text="Ok", command=self._ok, underline=0, takefocus=0).pack( - side="left" - ) - Button(frame, text="Apply", command=self._apply, underline=0, takefocus=0).pack( - side="left" - ) - Button(frame, text="Reset", command=self._reset, underline=0, takefocus=0).pack( - side="left" - ) - Button( - frame, text="Cancel", command=self._cancel, underline=0, takefocus=0 - ).pack(side="left") - Button(frame, text="Help", command=self._help, underline=0, takefocus=0).pack( - side="right" - ) - - def _init_bindings(self): - self._top.title("CFG Editor") - self._top.bind("", self._cancel) - self._top.bind("", self._cancel) - self._top.bind("", self._cancel) - # self._top.bind('', self._cancel) - self._top.bind("", self._cancel) - self._top.bind("", self._cancel) - # self._top.bind('', self._cancel) - self._top.bind("", self._cancel) - - self._top.bind("", self._ok) - self._top.bind("", self._ok) - self._top.bind("", self._apply) - self._top.bind("", self._apply) - self._top.bind("", self._reset) - self._top.bind("", self._reset) - self._top.bind("", self._help) - self._top.bind("", self._help) - self._top.bind("", self._help) - - def _init_prodframe(self): - self._prodframe = Frame(self._top) - - # Create the basic Text widget & scrollbar. 
- self._textwidget = Text( - self._prodframe, background="#e0e0e0", exportselection=1 - ) - self._textscroll = Scrollbar(self._prodframe, takefocus=0, orient="vertical") - self._textwidget.config(yscrollcommand=self._textscroll.set) - self._textscroll.config(command=self._textwidget.yview) - self._textscroll.pack(side="right", fill="y") - self._textwidget.pack(expand=1, fill="both", side="left") - - # Initialize the colorization tags. Each nonterminal gets its - # own tag, so they aren't listed here. - self._textwidget.tag_config("terminal", foreground="#006000") - self._textwidget.tag_config("arrow", font="symbol") - self._textwidget.tag_config("error", background="red") - - # Keep track of what line they're on. We use that to remember - # to re-analyze a line whenever they leave it. - self._linenum = 0 - - # Expand "->" to an arrow. - self._top.bind(">", self._replace_arrows) - - # Re-colorize lines when appropriate. - self._top.bind("<>", self._analyze) - self._top.bind("", self._check_analyze) - self._top.bind("", self._check_analyze) - - # Tab cycles focus. (why doesn't this work??) - def cycle(e, textwidget=self._textwidget): - textwidget.tk_focusNext().focus() - - self._textwidget.bind("", cycle) - - prod_tuples = [(p.lhs(), [p.rhs()]) for p in self._cfg.productions()] - for i in range(len(prod_tuples) - 1, 0, -1): - if prod_tuples[i][0] == prod_tuples[i - 1][0]: - if () in prod_tuples[i][1]: - continue - if () in prod_tuples[i - 1][1]: - continue - print(prod_tuples[i - 1][1]) - print(prod_tuples[i][1]) - prod_tuples[i - 1][1].extend(prod_tuples[i][1]) - del prod_tuples[i] - - for lhs, rhss in prod_tuples: - print(lhs, rhss) - s = "%s ->" % lhs - for rhs in rhss: - for elt in rhs: - if isinstance(elt, Nonterminal): - s += " %s" % elt - else: - s += " %r" % elt - s += " |" - s = s[:-2] + "\n" - self._textwidget.insert("end", s) - - self._analyze() - - # # Add the producitons to the text widget, and colorize them. - # prod_by_lhs = {} - # for prod in self._cfg.productions(): - # if len(prod.rhs()) > 0: - # prod_by_lhs.setdefault(prod.lhs(),[]).append(prod) - # for (lhs, prods) in prod_by_lhs.items(): - # self._textwidget.insert('end', '%s ->' % lhs) - # self._textwidget.insert('end', self._rhs(prods[0])) - # for prod in prods[1:]: - # print '\t|'+self._rhs(prod), - # self._textwidget.insert('end', '\t|'+self._rhs(prod)) - # print - # self._textwidget.insert('end', '\n') - # for prod in self._cfg.productions(): - # if len(prod.rhs()) == 0: - # self._textwidget.insert('end', '%s' % prod) - # self._analyze() - - # def _rhs(self, prod): - # s = '' - # for elt in prod.rhs(): - # if isinstance(elt, Nonterminal): s += ' %s' % elt.symbol() - # else: s += ' %r' % elt - # return s - - def _clear_tags(self, linenum): - """ - Remove all tags (except ``arrow`` and ``sel``) from the given - line of the text widget used for editing the productions. - """ - start = "%d.0" % linenum - end = "%d.end" % linenum - for tag in self._textwidget.tag_names(): - if tag not in ("arrow", "sel"): - self._textwidget.tag_remove(tag, start, end) - - def _check_analyze(self, *e): - """ - Check if we've moved to a new line. If we have, then remove - all colorization from the line we moved to, and re-colorize - the line that we moved from. 
- """ - linenum = int(self._textwidget.index("insert").split(".")[0]) - if linenum != self._linenum: - self._clear_tags(linenum) - self._analyze_line(self._linenum) - self._linenum = linenum - - def _replace_arrows(self, *e): - """ - Replace any ``'->'`` text strings with arrows (char \\256, in - symbol font). This searches the whole buffer, but is fast - enough to be done anytime they press '>'. - """ - arrow = "1.0" - while True: - arrow = self._textwidget.search("->", arrow, "end+1char") - if arrow == "": - break - self._textwidget.delete(arrow, arrow + "+2char") - self._textwidget.insert(arrow, self.ARROW, "arrow") - self._textwidget.insert(arrow, "\t") - - arrow = "1.0" - while True: - arrow = self._textwidget.search(self.ARROW, arrow + "+1char", "end+1char") - if arrow == "": - break - self._textwidget.tag_add("arrow", arrow, arrow + "+1char") - - def _analyze_token(self, match, linenum): - """ - Given a line number and a regexp match for a token on that - line, colorize the token. Note that the regexp match gives us - the token's text, start index (on the line), and end index (on - the line). - """ - # What type of token is it? - if match.group()[0] in "'\"": - tag = "terminal" - elif match.group() in ("->", self.ARROW): - tag = "arrow" - else: - # If it's a nonterminal, then set up new bindings, so we - # can highlight all instances of that nonterminal when we - # put the mouse over it. - tag = "nonterminal_" + match.group() - if tag not in self._textwidget.tag_names(): - self._init_nonterminal_tag(tag) - - start = "%d.%d" % (linenum, match.start()) - end = "%d.%d" % (linenum, match.end()) - self._textwidget.tag_add(tag, start, end) - - def _init_nonterminal_tag(self, tag, foreground="blue"): - self._textwidget.tag_config(tag, foreground=foreground, font=CFGEditor._BOLD) - if not self._highlight_matching_nonterminals: - return - - def enter(e, textwidget=self._textwidget, tag=tag): - textwidget.tag_config(tag, background="#80ff80") - - def leave(e, textwidget=self._textwidget, tag=tag): - textwidget.tag_config(tag, background="") - - self._textwidget.tag_bind(tag, "", enter) - self._textwidget.tag_bind(tag, "", leave) - - def _analyze_line(self, linenum): - """ - Colorize a given line. - """ - # Get rid of any tags that were previously on the line. - self._clear_tags(linenum) - - # Get the line line's text string. - line = self._textwidget.get(repr(linenum) + ".0", repr(linenum) + ".end") - - # If it's a valid production, then colorize each token. - if CFGEditor._PRODUCTION_RE.match(line): - # It's valid; Use _TOKEN_RE to tokenize the production, - # and call analyze_token on each token. - def analyze_token(match, self=self, linenum=linenum): - self._analyze_token(match, linenum) - return "" - - CFGEditor._TOKEN_RE.sub(analyze_token, line) - elif line.strip() != "": - # It's invalid; show the user where the error is. - self._mark_error(linenum, line) - - def _mark_error(self, linenum, line): - """ - Mark the location of an error in a line. - """ - arrowmatch = CFGEditor._ARROW_RE.search(line) - if not arrowmatch: - # If there's no arrow at all, highlight the whole line. - start = "%d.0" % linenum - end = "%d.end" % linenum - elif not CFGEditor._LHS_RE.match(line): - # Otherwise, if the LHS is bad, highlight it. - start = "%d.0" % linenum - end = "%d.%d" % (linenum, arrowmatch.start()) - else: - # Otherwise, highlight the RHS. - start = "%d.%d" % (linenum, arrowmatch.end()) - end = "%d.end" % linenum - - # If we're highlighting 0 chars, highlight the whole line. 
- if self._textwidget.compare(start, "==", end): - start = "%d.0" % linenum - end = "%d.end" % linenum - self._textwidget.tag_add("error", start, end) - - def _analyze(self, *e): - """ - Replace ``->`` with arrows, and colorize the entire buffer. - """ - self._replace_arrows() - numlines = int(self._textwidget.index("end").split(".")[0]) - for linenum in range(1, numlines + 1): # line numbers start at 1. - self._analyze_line(linenum) - - def _parse_productions(self): - """ - Parse the current contents of the textwidget buffer, to create - a list of productions. - """ - productions = [] - - # Get the text, normalize it, and split it into lines. - text = self._textwidget.get("1.0", "end") - text = re.sub(self.ARROW, "->", text) - text = re.sub("\t", " ", text) - lines = text.split("\n") - - # Convert each line to a CFG production - for line in lines: - line = line.strip() - if line == "": - continue - productions += _read_cfg_production(line) - # if line.strip() == '': continue - # if not CFGEditor._PRODUCTION_RE.match(line): - # raise ValueError('Bad production string %r' % line) - # - # (lhs_str, rhs_str) = line.split('->') - # lhs = Nonterminal(lhs_str.strip()) - # rhs = [] - # def parse_token(match, rhs=rhs): - # token = match.group() - # if token[0] in "'\"": rhs.append(token[1:-1]) - # else: rhs.append(Nonterminal(token)) - # return '' - # CFGEditor._TOKEN_RE.sub(parse_token, rhs_str) - # - # productions.append(Production(lhs, *rhs)) - - return productions - - def _destroy(self, *e): - if self._top is None: - return - self._top.destroy() - self._top = None - - def _ok(self, *e): - self._apply() - self._destroy() - - def _apply(self, *e): - productions = self._parse_productions() - start = Nonterminal(self._start.get()) - cfg = CFG(start, productions) - if self._set_cfg_callback is not None: - self._set_cfg_callback(cfg) - - def _reset(self, *e): - self._textwidget.delete("1.0", "end") - for production in self._cfg.productions(): - self._textwidget.insert("end", "%s\n" % production) - self._analyze() - if self._set_cfg_callback is not None: - self._set_cfg_callback(self._cfg) - - def _cancel(self, *e): - try: - self._reset() - except: - pass - self._destroy() - - def _help(self, *e): - # The default font's not very legible; try using 'fixed' instead. - try: - ShowText( - self._parent, - "Help: Chart Parser Demo", - (_CFGEditor_HELP).strip(), - width=75, - font="fixed", - ) - except: - ShowText( - self._parent, - "Help: Chart Parser Demo", - (_CFGEditor_HELP).strip(), - width=75, - ) - - -###################################################################### -# New Demo (built tree based on cfg) -###################################################################### - - -class CFGDemo: - def __init__(self, grammar, text): - self._grammar = grammar - self._text = text - - # Set up the main window. 
- self._top = Tk() - self._top.title("Context Free Grammar Demo") - - # Base font size - self._size = IntVar(self._top) - self._size.set(12) # = medium - - # Set up the key bindings - self._init_bindings(self._top) - - # Create the basic frames - frame1 = Frame(self._top) - frame1.pack(side="left", fill="y", expand=0) - self._init_menubar(self._top) - self._init_buttons(self._top) - self._init_grammar(frame1) - self._init_treelet(frame1) - self._init_workspace(self._top) - - # ////////////////////////////////////////////////// - # Initialization - # ////////////////////////////////////////////////// - - def _init_bindings(self, top): - top.bind("", self.destroy) - - def _init_menubar(self, parent): - pass - - def _init_buttons(self, parent): - pass - - def _init_grammar(self, parent): - self._prodlist = ProductionList(parent, self._grammar, width=20) - self._prodlist.pack(side="top", fill="both", expand=1) - self._prodlist.focus() - self._prodlist.add_callback("select", self._selectprod_cb) - self._prodlist.add_callback("move", self._selectprod_cb) - - def _init_treelet(self, parent): - self._treelet_canvas = Canvas(parent, background="white") - self._treelet_canvas.pack(side="bottom", fill="x") - self._treelet = None - - def _init_workspace(self, parent): - self._workspace = CanvasFrame(parent, background="white") - self._workspace.pack(side="right", fill="both", expand=1) - self._tree = None - self.reset_workspace() - - # ////////////////////////////////////////////////// - # Workspace - # ////////////////////////////////////////////////// - - def reset_workspace(self): - c = self._workspace.canvas() - fontsize = int(self._size.get()) - node_font = ("helvetica", -(fontsize + 4), "bold") - leaf_font = ("helvetica", -(fontsize + 2)) - - # Remove the old tree - if self._tree is not None: - self._workspace.remove_widget(self._tree) - - # The root of the tree. - start = self._grammar.start().symbol() - rootnode = TextWidget(c, start, font=node_font, draggable=1) - - # The leaves of the tree. - leaves = [] - for word in self._text: - leaves.append(TextWidget(c, word, font=leaf_font, draggable=1)) - - # Put it all together into one tree - self._tree = TreeSegmentWidget(c, rootnode, leaves, color="white") - - # Add it to the workspace. - self._workspace.add_widget(self._tree) - - # Move the leaves to the bottom of the workspace. - for leaf in leaves: - leaf.move(0, 100) - - # self._nodes = {start:1} - # self._leaves = dict([(l,1) for l in leaves]) - - def workspace_markprod(self, production): - pass - - def _markproduction(self, prod, tree=None): - if tree is None: - tree = self._tree - for i in range(len(tree.subtrees()) - len(prod.rhs())): - if tree["color", i] == "white": - self._markproduction # FIXME: Is this necessary at all? - - for j, node in enumerate(prod.rhs()): - widget = tree.subtrees()[i + j] - if ( - isinstance(node, Nonterminal) - and isinstance(widget, TreeSegmentWidget) - and node.symbol == widget.label().text() - ): - pass # matching nonterminal - elif ( - isinstance(node, str) - and isinstance(widget, TextWidget) - and node == widget.text() - ): - pass # matching nonterminal - else: - break - else: - # Everything matched! - print("MATCH AT", i) - - # ////////////////////////////////////////////////// - # Grammar - # ////////////////////////////////////////////////// - - def _selectprod_cb(self, production): - canvas = self._treelet_canvas - - self._prodlist.highlight(production) - if self._treelet is not None: - self._treelet.destroy() - - # Convert the production to a tree. 
- rhs = production.rhs() - for (i, elt) in enumerate(rhs): - if isinstance(elt, Nonterminal): - elt = Tree(elt) - tree = Tree(production.lhs().symbol(), *rhs) - - # Draw the tree in the treelet area. - fontsize = int(self._size.get()) - node_font = ("helvetica", -(fontsize + 4), "bold") - leaf_font = ("helvetica", -(fontsize + 2)) - self._treelet = tree_to_treesegment( - canvas, tree, node_font=node_font, leaf_font=leaf_font - ) - self._treelet["draggable"] = 1 - - # Center the treelet. - (x1, y1, x2, y2) = self._treelet.bbox() - w, h = int(canvas["width"]), int(canvas["height"]) - self._treelet.move((w - x1 - x2) / 2, (h - y1 - y2) / 2) - - # Mark the places where we can add it to the workspace. - self._markproduction(production) - - def destroy(self, *args): - self._top.destroy() - - def mainloop(self, *args, **kwargs): - self._top.mainloop(*args, **kwargs) - - -def demo2(): - from nltk import CFG, Nonterminal, Production - - nonterminals = "S VP NP PP P N Name V Det" - (S, VP, NP, PP, P, N, Name, V, Det) = (Nonterminal(s) for s in nonterminals.split()) - productions = ( - # Syntactic Productions - Production(S, [NP, VP]), - Production(NP, [Det, N]), - Production(NP, [NP, PP]), - Production(VP, [VP, PP]), - Production(VP, [V, NP, PP]), - Production(VP, [V, NP]), - Production(PP, [P, NP]), - Production(PP, []), - Production(PP, ["up", "over", NP]), - # Lexical Productions - Production(NP, ["I"]), - Production(Det, ["the"]), - Production(Det, ["a"]), - Production(N, ["man"]), - Production(V, ["saw"]), - Production(P, ["in"]), - Production(P, ["with"]), - Production(N, ["park"]), - Production(N, ["dog"]), - Production(N, ["statue"]), - Production(Det, ["my"]), - ) - grammar = CFG(S, productions) - - text = "I saw a man in the park".split() - d = CFGDemo(grammar, text) - d.mainloop() - - -###################################################################### -# Old Demo -###################################################################### - - -def demo(): - from nltk import CFG, Nonterminal - - nonterminals = "S VP NP PP P N Name V Det" - (S, VP, NP, PP, P, N, Name, V, Det) = (Nonterminal(s) for s in nonterminals.split()) - - grammar = CFG.fromstring( - """ - S -> NP VP - PP -> P NP - NP -> Det N - NP -> NP PP - VP -> V NP - VP -> VP PP - Det -> 'a' - Det -> 'the' - Det -> 'my' - NP -> 'I' - N -> 'dog' - N -> 'man' - N -> 'park' - N -> 'statue' - V -> 'saw' - P -> 'in' - P -> 'up' - P -> 'over' - P -> 'with' - """ - ) - - def cb(grammar): - print(grammar) - - top = Tk() - editor = CFGEditor(top, grammar, cb) - Label(top, text="\nTesting CFG Editor\n").pack() - Button(top, text="Quit", command=top.destroy).pack() - top.mainloop() - - -def demo3(): - from nltk import Production - - (S, VP, NP, PP, P, N, Name, V, Det) = nonterminals( - "S, VP, NP, PP, P, N, Name, V, Det" - ) - - productions = ( - # Syntactic Productions - Production(S, [NP, VP]), - Production(NP, [Det, N]), - Production(NP, [NP, PP]), - Production(VP, [VP, PP]), - Production(VP, [V, NP, PP]), - Production(VP, [V, NP]), - Production(PP, [P, NP]), - Production(PP, []), - Production(PP, ["up", "over", NP]), - # Lexical Productions - Production(NP, ["I"]), - Production(Det, ["the"]), - Production(Det, ["a"]), - Production(N, ["man"]), - Production(V, ["saw"]), - Production(P, ["in"]), - Production(P, ["with"]), - Production(N, ["park"]), - Production(N, ["dog"]), - Production(N, ["statue"]), - Production(Det, ["my"]), - ) - - t = Tk() - - def destroy(e, t=t): - t.destroy() - - t.bind("q", destroy) - p = ProductionList(t, 
productions) - p.pack(expand=1, fill="both") - p.add_callback("select", p.markonly) - p.add_callback("move", p.markonly) - p.focus() - p.mark(productions[2]) - p.mark(productions[8]) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/draw/dispersion.py b/pipeline/nltk/draw/dispersion.py deleted file mode 100644 index 0991194dc42e1c258b6e62c3e8dfb71d44bb3ce6..0000000000000000000000000000000000000000 --- a/pipeline/nltk/draw/dispersion.py +++ /dev/null @@ -1,63 +0,0 @@ -# Natural Language Toolkit: Dispersion Plots -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# URL: -# For license information, see LICENSE.TXT - -""" -A utility for displaying lexical dispersion. -""" - - -def dispersion_plot(text, words, ignore_case=False, title="Lexical Dispersion Plot"): - """ - Generate a lexical dispersion plot. - - :param text: The source text - :type text: list(str) or iter(str) - :param words: The target words - :type words: list of str - :param ignore_case: flag to set if case should be ignored when searching text - :type ignore_case: bool - :return: a matplotlib Axes object that may still be modified before plotting - :rtype: Axes - """ - - try: - import matplotlib.pyplot as plt - except ImportError as e: - raise ImportError( - "The plot function requires matplotlib to be installed. " - "See https://matplotlib.org/" - ) from e - - word2y = { - word.casefold() if ignore_case else word: y - for y, word in enumerate(reversed(words)) - } - xs, ys = [], [] - for x, token in enumerate(text): - token = token.casefold() if ignore_case else token - y = word2y.get(token) - if y is not None: - xs.append(x) - ys.append(y) - - _, ax = plt.subplots() - ax.plot(xs, ys, "|") - ax.set_yticks(list(range(len(words))), words, color="C0") - ax.set_ylim(-1, len(words)) - ax.set_title(title) - ax.set_xlabel("Word Offset") - return ax - - -if __name__ == "__main__": - import matplotlib.pyplot as plt - - from nltk.corpus import gutenberg - - words = ["Elinor", "Marianne", "Edward", "Willoughby"] - dispersion_plot(gutenberg.words("austen-sense.txt"), words) - plt.show() diff --git a/pipeline/nltk/draw/table.py b/pipeline/nltk/draw/table.py deleted file mode 100644 index 0d3526d5f1bf223684a1293dd5ff32ef6cbbbf55..0000000000000000000000000000000000000000 --- a/pipeline/nltk/draw/table.py +++ /dev/null @@ -1,1177 +0,0 @@ -# Natural Language Toolkit: Table widget -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# URL: -# For license information, see LICENSE.TXT - -""" -Tkinter widgets for displaying multi-column listboxes and tables. -""" - -import operator -from tkinter import Frame, Label, Listbox, Scrollbar, Tk - -###################################################################### -# Multi-Column Listbox -###################################################################### - - -class MultiListbox(Frame): - """ - A multi-column listbox, where the current selection applies to an - entire row. Based on the MultiListbox Tkinter widget - recipe from the Python Cookbook (https://code.activestate.com/recipes/52266/) - - For the most part, ``MultiListbox`` methods delegate to its - contained listboxes. For any methods that do not have docstrings, - see ``Tkinter.Listbox`` for a description of what that method does. - """ - - # ///////////////////////////////////////////////////////////////// - # Configuration - # ///////////////////////////////////////////////////////////////// - - #: Default configuration values for the frame. 
- FRAME_CONFIG = dict(background="#888", takefocus=True, highlightthickness=1) - - #: Default configurations for the column labels. - LABEL_CONFIG = dict( - borderwidth=1, - relief="raised", - font="helvetica -16 bold", - background="#444", - foreground="white", - ) - - #: Default configuration for the column listboxes. - LISTBOX_CONFIG = dict( - borderwidth=1, - selectborderwidth=0, - highlightthickness=0, - exportselection=False, - selectbackground="#888", - activestyle="none", - takefocus=False, - ) - - # ///////////////////////////////////////////////////////////////// - # Constructor - # ///////////////////////////////////////////////////////////////// - - def __init__(self, master, columns, column_weights=None, cnf={}, **kw): - """ - Construct a new multi-column listbox widget. - - :param master: The widget that should contain the new - multi-column listbox. - - :param columns: Specifies what columns should be included in - the new multi-column listbox. If ``columns`` is an integer, - then it is the number of columns to include. If it is - a list, then its length indicates the number of columns - to include; and each element of the list will be used as - a label for the corresponding column. - - :param cnf, kw: Configuration parameters for this widget. - Use ``label_*`` to configure all labels; and ``listbox_*`` - to configure all listboxes. E.g.: - >>> root = Tk() # doctest: +SKIP - >>> MultiListbox(root, ["Subject", "Sender", "Date"], label_foreground='red').pack() # doctest: +SKIP - """ - # If columns was specified as an int, convert it to a list. - if isinstance(columns, int): - columns = list(range(columns)) - include_labels = False - else: - include_labels = True - - if len(columns) == 0: - raise ValueError("Expected at least one column") - - # Instance variables - self._column_names = tuple(columns) - self._listboxes = [] - self._labels = [] - - # Pick a default value for column_weights, if none was specified. - if column_weights is None: - column_weights = [1] * len(columns) - elif len(column_weights) != len(columns): - raise ValueError("Expected one column_weight for each column") - self._column_weights = column_weights - - # Configure our widgets. - Frame.__init__(self, master, **self.FRAME_CONFIG) - self.grid_rowconfigure(1, weight=1) - for i, label in enumerate(self._column_names): - self.grid_columnconfigure(i, weight=column_weights[i]) - - # Create a label for the column - if include_labels: - l = Label(self, text=label, **self.LABEL_CONFIG) - self._labels.append(l) - l.grid(column=i, row=0, sticky="news", padx=0, pady=0) - l.column_index = i - - # Create a listbox for the column - lb = Listbox(self, **self.LISTBOX_CONFIG) - self._listboxes.append(lb) - lb.grid(column=i, row=1, sticky="news", padx=0, pady=0) - lb.column_index = i - - # Clicking or dragging selects: - lb.bind("", self._select) - lb.bind("", self._select) - # Scroll wheel scrolls: - lb.bind("", lambda e: self._scroll(-1)) - lb.bind("", lambda e: self._scroll(+1)) - lb.bind("", lambda e: self._scroll(e.delta)) - # Button 2 can be used to scan: - lb.bind("", lambda e: self.scan_mark(e.x, e.y)) - lb.bind("", lambda e: self.scan_dragto(e.x, e.y)) - # Dragging outside the window has no effect (disable - # the default listbox behavior, which scrolls): - lb.bind("", lambda e: "break") - # Columns can be resized by dragging them: - lb.bind("", self._resize_column) - - # Columns can be resized by dragging them. 
(This binding is - # used if they click on the grid between columns:) - self.bind("", self._resize_column) - - # Set up key bindings for the widget: - self.bind("", lambda e: self.select(delta=-1)) - self.bind("", lambda e: self.select(delta=1)) - self.bind("", lambda e: self.select(delta=-self._pagesize())) - self.bind("", lambda e: self.select(delta=self._pagesize())) - - # Configuration customizations - self.configure(cnf, **kw) - - # ///////////////////////////////////////////////////////////////// - # Column Resizing - # ///////////////////////////////////////////////////////////////// - - def _resize_column(self, event): - """ - Callback used to resize a column of the table. Return ``True`` - if the column is actually getting resized (if the user clicked - on the far left or far right 5 pixels of a label); and - ``False`` otherwies. - """ - # If we're already waiting for a button release, then ignore - # the new button press. - if event.widget.bind(""): - return False - - # Decide which column (if any) to resize. - self._resize_column_index = None - if event.widget is self: - for i, lb in enumerate(self._listboxes): - if abs(event.x - (lb.winfo_x() + lb.winfo_width())) < 10: - self._resize_column_index = i - elif event.x > (event.widget.winfo_width() - 5): - self._resize_column_index = event.widget.column_index - elif event.x < 5 and event.widget.column_index != 0: - self._resize_column_index = event.widget.column_index - 1 - - # Bind callbacks that are used to resize it. - if self._resize_column_index is not None: - event.widget.bind("", self._resize_column_motion_cb) - event.widget.bind( - "" % event.num, self._resize_column_buttonrelease_cb - ) - return True - else: - return False - - def _resize_column_motion_cb(self, event): - lb = self._listboxes[self._resize_column_index] - charwidth = lb.winfo_width() / lb["width"] - - x1 = event.x + event.widget.winfo_x() - x2 = lb.winfo_x() + lb.winfo_width() - - lb["width"] = max(3, lb["width"] + (x1 - x2) // charwidth) - - def _resize_column_buttonrelease_cb(self, event): - event.widget.unbind("" % event.num) - event.widget.unbind("") - - # ///////////////////////////////////////////////////////////////// - # Properties - # ///////////////////////////////////////////////////////////////// - - @property - def column_names(self): - """ - A tuple containing the names of the columns used by this - multi-column listbox. - """ - return self._column_names - - @property - def column_labels(self): - """ - A tuple containing the ``Tkinter.Label`` widgets used to - display the label of each column. If this multi-column - listbox was created without labels, then this will be an empty - tuple. These widgets will all be augmented with a - ``column_index`` attribute, which can be used to determine - which column they correspond to. This can be convenient, - e.g., when defining callbacks for bound events. - """ - return tuple(self._labels) - - @property - def listboxes(self): - """ - A tuple containing the ``Tkinter.Listbox`` widgets used to - display individual columns. These widgets will all be - augmented with a ``column_index`` attribute, which can be used - to determine which column they correspond to. This can be - convenient, e.g., when defining callbacks for bound events. 
- """ - return tuple(self._listboxes) - - # ///////////////////////////////////////////////////////////////// - # Mouse & Keyboard Callback Functions - # ///////////////////////////////////////////////////////////////// - - def _select(self, e): - i = e.widget.nearest(e.y) - self.selection_clear(0, "end") - self.selection_set(i) - self.activate(i) - self.focus() - - def _scroll(self, delta): - for lb in self._listboxes: - lb.yview_scroll(delta, "unit") - return "break" - - def _pagesize(self): - """:return: The number of rows that makes up one page""" - return int(self.index("@0,1000000")) - int(self.index("@0,0")) - - # ///////////////////////////////////////////////////////////////// - # Row selection - # ///////////////////////////////////////////////////////////////// - - def select(self, index=None, delta=None, see=True): - """ - Set the selected row. If ``index`` is specified, then select - row ``index``. Otherwise, if ``delta`` is specified, then move - the current selection by ``delta`` (negative numbers for up, - positive numbers for down). This will not move the selection - past the top or the bottom of the list. - - :param see: If true, then call ``self.see()`` with the newly - selected index, to ensure that it is visible. - """ - if (index is not None) and (delta is not None): - raise ValueError("specify index or delta, but not both") - - # If delta was given, then calculate index. - if delta is not None: - if len(self.curselection()) == 0: - index = -1 + delta - else: - index = int(self.curselection()[0]) + delta - - # Clear all selected rows. - self.selection_clear(0, "end") - - # Select the specified index - if index is not None: - index = min(max(index, 0), self.size() - 1) - # self.activate(index) - self.selection_set(index) - if see: - self.see(index) - - # ///////////////////////////////////////////////////////////////// - # Configuration - # ///////////////////////////////////////////////////////////////// - - def configure(self, cnf={}, **kw): - """ - Configure this widget. Use ``label_*`` to configure all - labels; and ``listbox_*`` to configure all listboxes. E.g.: - - >>> master = Tk() # doctest: +SKIP - >>> mlb = MultiListbox(master, 5) # doctest: +SKIP - >>> mlb.configure(label_foreground='red') # doctest: +SKIP - >>> mlb.configure(listbox_foreground='red') # doctest: +SKIP - """ - cnf = dict(list(cnf.items()) + list(kw.items())) - for (key, val) in list(cnf.items()): - if key.startswith("label_") or key.startswith("label-"): - for label in self._labels: - label.configure({key[6:]: val}) - elif key.startswith("listbox_") or key.startswith("listbox-"): - for listbox in self._listboxes: - listbox.configure({key[8:]: val}) - else: - Frame.configure(self, {key: val}) - - def __setitem__(self, key, val): - """ - Configure this widget. This is equivalent to - ``self.configure({key,val``)}. See ``configure()``. - """ - self.configure({key: val}) - - def rowconfigure(self, row_index, cnf={}, **kw): - """ - Configure all table cells in the given row. Valid keyword - arguments are: ``background``, ``bg``, ``foreground``, ``fg``, - ``selectbackground``, ``selectforeground``. - """ - for lb in self._listboxes: - lb.itemconfigure(row_index, cnf, **kw) - - def columnconfigure(self, col_index, cnf={}, **kw): - """ - Configure all table cells in the given column. Valid keyword - arguments are: ``background``, ``bg``, ``foreground``, ``fg``, - ``selectbackground``, ``selectforeground``. 
- """ - lb = self._listboxes[col_index] - - cnf = dict(list(cnf.items()) + list(kw.items())) - for (key, val) in list(cnf.items()): - if key in ( - "background", - "bg", - "foreground", - "fg", - "selectbackground", - "selectforeground", - ): - for i in range(lb.size()): - lb.itemconfigure(i, {key: val}) - else: - lb.configure({key: val}) - - def itemconfigure(self, row_index, col_index, cnf=None, **kw): - """ - Configure the table cell at the given row and column. Valid - keyword arguments are: ``background``, ``bg``, ``foreground``, - ``fg``, ``selectbackground``, ``selectforeground``. - """ - lb = self._listboxes[col_index] - return lb.itemconfigure(row_index, cnf, **kw) - - # ///////////////////////////////////////////////////////////////// - # Value Access - # ///////////////////////////////////////////////////////////////// - - def insert(self, index, *rows): - """ - Insert the given row or rows into the table, at the given - index. Each row value should be a tuple of cell values, one - for each column in the row. Index may be an integer or any of - the special strings (such as ``'end'``) accepted by - ``Tkinter.Listbox``. - """ - for elt in rows: - if len(elt) != len(self._column_names): - raise ValueError( - "rows should be tuples whose length " - "is equal to the number of columns" - ) - for (lb, elts) in zip(self._listboxes, list(zip(*rows))): - lb.insert(index, *elts) - - def get(self, first, last=None): - """ - Return the value(s) of the specified row(s). If ``last`` is - not specified, then return a single row value; otherwise, - return a list of row values. Each row value is a tuple of - cell values, one for each column in the row. - """ - values = [lb.get(first, last) for lb in self._listboxes] - if last: - return [tuple(row) for row in zip(*values)] - else: - return tuple(values) - - def bbox(self, row, col): - """ - Return the bounding box for the given table cell, relative to - this widget's top-left corner. The bounding box is a tuple - of integers ``(left, top, width, height)``. - """ - dx, dy, _, _ = self.grid_bbox(row=0, column=col) - x, y, w, h = self._listboxes[col].bbox(row) - return int(x) + int(dx), int(y) + int(dy), int(w), int(h) - - # ///////////////////////////////////////////////////////////////// - # Hide/Show Columns - # ///////////////////////////////////////////////////////////////// - - def hide_column(self, col_index): - """ - Hide the given column. The column's state is still - maintained: its values will still be returned by ``get()``, and - you must supply its values when calling ``insert()``. It is - safe to call this on a column that is already hidden. - - :see: ``show_column()`` - """ - if self._labels: - self._labels[col_index].grid_forget() - self.listboxes[col_index].grid_forget() - self.grid_columnconfigure(col_index, weight=0) - - def show_column(self, col_index): - """ - Display a column that has been hidden using ``hide_column()``. - It is safe to call this on a column that is not hidden. 
- """ - weight = self._column_weights[col_index] - if self._labels: - self._labels[col_index].grid( - column=col_index, row=0, sticky="news", padx=0, pady=0 - ) - self._listboxes[col_index].grid( - column=col_index, row=1, sticky="news", padx=0, pady=0 - ) - self.grid_columnconfigure(col_index, weight=weight) - - # ///////////////////////////////////////////////////////////////// - # Binding Methods - # ///////////////////////////////////////////////////////////////// - - def bind_to_labels(self, sequence=None, func=None, add=None): - """ - Add a binding to each ``Tkinter.Label`` widget in this - mult-column listbox that will call ``func`` in response to the - event sequence. - - :return: A list of the identifiers of replaced binding - functions (if any), allowing for their deletion (to - prevent a memory leak). - """ - return [label.bind(sequence, func, add) for label in self.column_labels] - - def bind_to_listboxes(self, sequence=None, func=None, add=None): - """ - Add a binding to each ``Tkinter.Listbox`` widget in this - mult-column listbox that will call ``func`` in response to the - event sequence. - - :return: A list of the identifiers of replaced binding - functions (if any), allowing for their deletion (to - prevent a memory leak). - """ - for listbox in self.listboxes: - listbox.bind(sequence, func, add) - - def bind_to_columns(self, sequence=None, func=None, add=None): - """ - Add a binding to each ``Tkinter.Label`` and ``Tkinter.Listbox`` - widget in this mult-column listbox that will call ``func`` in - response to the event sequence. - - :return: A list of the identifiers of replaced binding - functions (if any), allowing for their deletion (to - prevent a memory leak). - """ - return self.bind_to_labels(sequence, func, add) + self.bind_to_listboxes( - sequence, func, add - ) - - # ///////////////////////////////////////////////////////////////// - # Simple Delegation - # ///////////////////////////////////////////////////////////////// - - # These methods delegate to the first listbox: - def curselection(self, *args, **kwargs): - return self._listboxes[0].curselection(*args, **kwargs) - - def selection_includes(self, *args, **kwargs): - return self._listboxes[0].selection_includes(*args, **kwargs) - - def itemcget(self, *args, **kwargs): - return self._listboxes[0].itemcget(*args, **kwargs) - - def size(self, *args, **kwargs): - return self._listboxes[0].size(*args, **kwargs) - - def index(self, *args, **kwargs): - return self._listboxes[0].index(*args, **kwargs) - - def nearest(self, *args, **kwargs): - return self._listboxes[0].nearest(*args, **kwargs) - - # These methods delegate to each listbox (and return None): - def activate(self, *args, **kwargs): - for lb in self._listboxes: - lb.activate(*args, **kwargs) - - def delete(self, *args, **kwargs): - for lb in self._listboxes: - lb.delete(*args, **kwargs) - - def scan_mark(self, *args, **kwargs): - for lb in self._listboxes: - lb.scan_mark(*args, **kwargs) - - def scan_dragto(self, *args, **kwargs): - for lb in self._listboxes: - lb.scan_dragto(*args, **kwargs) - - def see(self, *args, **kwargs): - for lb in self._listboxes: - lb.see(*args, **kwargs) - - def selection_anchor(self, *args, **kwargs): - for lb in self._listboxes: - lb.selection_anchor(*args, **kwargs) - - def selection_clear(self, *args, **kwargs): - for lb in self._listboxes: - lb.selection_clear(*args, **kwargs) - - def selection_set(self, *args, **kwargs): - for lb in self._listboxes: - lb.selection_set(*args, **kwargs) - - def yview(self, *args, 
**kwargs): - for lb in self._listboxes: - v = lb.yview(*args, **kwargs) - return v # if called with no arguments - - def yview_moveto(self, *args, **kwargs): - for lb in self._listboxes: - lb.yview_moveto(*args, **kwargs) - - def yview_scroll(self, *args, **kwargs): - for lb in self._listboxes: - lb.yview_scroll(*args, **kwargs) - - # ///////////////////////////////////////////////////////////////// - # Aliases - # ///////////////////////////////////////////////////////////////// - - itemconfig = itemconfigure - rowconfig = rowconfigure - columnconfig = columnconfigure - select_anchor = selection_anchor - select_clear = selection_clear - select_includes = selection_includes - select_set = selection_set - - # ///////////////////////////////////////////////////////////////// - # These listbox methods are not defined for multi-listbox - # ///////////////////////////////////////////////////////////////// - # def xview(self, *what): pass - # def xview_moveto(self, fraction): pass - # def xview_scroll(self, number, what): pass - - -###################################################################### -# Table -###################################################################### - - -class Table: - """ - A display widget for a table of values, based on a ``MultiListbox`` - widget. For many purposes, ``Table`` can be treated as a - list-of-lists. E.g., table[i] is a list of the values for row i; - and table.append(row) adds a new row with the given list of - values. Individual cells can be accessed using table[i,j], which - refers to the j-th column of the i-th row. This can be used to - both read and write values from the table. E.g.: - - >>> table[i,j] = 'hello' # doctest: +SKIP - - The column (j) can be given either as an index number, or as a - column name. E.g., the following prints the value in the 3rd row - for the 'First Name' column: - - >>> print(table[3, 'First Name']) # doctest: +SKIP - John - - You can configure the colors for individual rows, columns, or - cells using ``rowconfig()``, ``columnconfig()``, and ``itemconfig()``. - The color configuration for each row will be preserved if the - table is modified; however, when new rows are added, any color - configurations that have been made for *columns* will not be - applied to the new row. - - Note: Although ``Table`` acts like a widget in some ways (e.g., it - defines ``grid()``, ``pack()``, and ``bind()``), it is not itself a - widget; it just contains one. This is because widgets need to - define ``__getitem__()``, ``__setitem__()``, and ``__nonzero__()`` in - a way that's incompatible with the fact that ``Table`` behaves as a - list-of-lists. - - :ivar _mlb: The multi-column listbox used to display this table's data. - :ivar _rows: A list-of-lists used to hold the cell values of this - table. Each element of _rows is a row value, i.e., a list of - cell values, one for each column in the row. - """ - - def __init__( - self, - master, - column_names, - rows=None, - column_weights=None, - scrollbar=True, - click_to_sort=True, - reprfunc=None, - cnf={}, - **kw - ): - """ - Construct a new Table widget. - - :type master: Tkinter.Widget - :param master: The widget that should contain the new table. - :type column_names: list(str) - :param column_names: A list of names for the columns; these - names will be used to create labels for each column; - and can be used as an index when reading or writing - cell values from the table. - :type rows: list(list) - :param rows: A list of row values used to initialize the table. 
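# A minimal sketch of the list-of-lists access pattern described in the
# Table docstring above (assumes a Tk display and upstream nltk.draw.table):
from tkinter import Tk
from nltk.draw.table import Table

root = Tk()
table = Table(root, ["First Name", "Last Name"],
              rows=[["John", "Smith"], ["Mary", "Jones"]])
table.pack(expand=True, fill="both")
print(table[0])                  # row 0 as a tuple: ('John', 'Smith')
print(table[1, "Last Name"])     # a single cell; the column is given by name
table[0, "Last Name"] = "Doe"    # cells (and whole rows) are writable
table.append(["Ada", "Lovelace"])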
- Each row value should be a tuple of cell values, one for - each column in the row. - :type scrollbar: bool - :param scrollbar: If true, then create a scrollbar for the - new table widget. - :type click_to_sort: bool - :param click_to_sort: If true, then create bindings that will - sort the table's rows by a given column's values if the - user clicks on that colum's label. - :type reprfunc: function - :param reprfunc: If specified, then use this function to - convert each table cell value to a string suitable for - display. ``reprfunc`` has the following signature: - reprfunc(row_index, col_index, cell_value) -> str - (Note that the column is specified by index, not by name.) - :param cnf, kw: Configuration parameters for this widget's - contained ``MultiListbox``. See ``MultiListbox.__init__()`` - for details. - """ - self._num_columns = len(column_names) - self._reprfunc = reprfunc - self._frame = Frame(master) - - self._column_name_to_index = {c: i for (i, c) in enumerate(column_names)} - - # Make a copy of the rows & check that it's valid. - if rows is None: - self._rows = [] - else: - self._rows = [[v for v in row] for row in rows] - for row in self._rows: - self._checkrow(row) - - # Create our multi-list box. - self._mlb = MultiListbox(self._frame, column_names, column_weights, cnf, **kw) - self._mlb.pack(side="left", expand=True, fill="both") - - # Optional scrollbar - if scrollbar: - sb = Scrollbar(self._frame, orient="vertical", command=self._mlb.yview) - self._mlb.listboxes[0]["yscrollcommand"] = sb.set - # for listbox in self._mlb.listboxes: - # listbox['yscrollcommand'] = sb.set - sb.pack(side="right", fill="y") - self._scrollbar = sb - - # Set up sorting - self._sortkey = None - if click_to_sort: - for i, l in enumerate(self._mlb.column_labels): - l.bind("", self._sort) - - # Fill in our multi-list box. - self._fill_table() - - # ///////////////////////////////////////////////////////////////// - # { Widget-like Methods - # ///////////////////////////////////////////////////////////////// - # These all just delegate to either our frame or our MLB. - - def pack(self, *args, **kwargs): - """Position this table's main frame widget in its parent - widget. See ``Tkinter.Frame.pack()`` for more info.""" - self._frame.pack(*args, **kwargs) - - def grid(self, *args, **kwargs): - """Position this table's main frame widget in its parent - widget. 
See ``Tkinter.Frame.grid()`` for more info.""" - self._frame.grid(*args, **kwargs) - - def focus(self): - """Direct (keyboard) input foxus to this widget.""" - self._mlb.focus() - - def bind(self, sequence=None, func=None, add=None): - """Add a binding to this table's main frame that will call - ``func`` in response to the event sequence.""" - self._mlb.bind(sequence, func, add) - - def rowconfigure(self, row_index, cnf={}, **kw): - """:see: ``MultiListbox.rowconfigure()``""" - self._mlb.rowconfigure(row_index, cnf, **kw) - - def columnconfigure(self, col_index, cnf={}, **kw): - """:see: ``MultiListbox.columnconfigure()``""" - col_index = self.column_index(col_index) - self._mlb.columnconfigure(col_index, cnf, **kw) - - def itemconfigure(self, row_index, col_index, cnf=None, **kw): - """:see: ``MultiListbox.itemconfigure()``""" - col_index = self.column_index(col_index) - return self._mlb.itemconfigure(row_index, col_index, cnf, **kw) - - def bind_to_labels(self, sequence=None, func=None, add=None): - """:see: ``MultiListbox.bind_to_labels()``""" - return self._mlb.bind_to_labels(sequence, func, add) - - def bind_to_listboxes(self, sequence=None, func=None, add=None): - """:see: ``MultiListbox.bind_to_listboxes()``""" - return self._mlb.bind_to_listboxes(sequence, func, add) - - def bind_to_columns(self, sequence=None, func=None, add=None): - """:see: ``MultiListbox.bind_to_columns()``""" - return self._mlb.bind_to_columns(sequence, func, add) - - rowconfig = rowconfigure - columnconfig = columnconfigure - itemconfig = itemconfigure - - # ///////////////////////////////////////////////////////////////// - # { Table as list-of-lists - # ///////////////////////////////////////////////////////////////// - - def insert(self, row_index, rowvalue): - """ - Insert a new row into the table, so that its row index will be - ``row_index``. If the table contains any rows whose row index - is greater than or equal to ``row_index``, then they will be - shifted down. - - :param rowvalue: A tuple of cell values, one for each column - in the new row. - """ - self._checkrow(rowvalue) - self._rows.insert(row_index, rowvalue) - if self._reprfunc is not None: - rowvalue = [ - self._reprfunc(row_index, j, v) for (j, v) in enumerate(rowvalue) - ] - self._mlb.insert(row_index, rowvalue) - if self._DEBUG: - self._check_table_vs_mlb() - - def extend(self, rowvalues): - """ - Add new rows at the end of the table. - - :param rowvalues: A list of row values used to initialize the - table. Each row value should be a tuple of cell values, - one for each column in the row. - """ - for rowvalue in rowvalues: - self.append(rowvalue) - if self._DEBUG: - self._check_table_vs_mlb() - - def append(self, rowvalue): - """ - Add a new row to the end of the table. - - :param rowvalue: A tuple of cell values, one for each column - in the new row. - """ - self.insert(len(self._rows), rowvalue) - if self._DEBUG: - self._check_table_vs_mlb() - - def clear(self): - """ - Delete all rows in this table. - """ - self._rows = [] - self._mlb.delete(0, "end") - if self._DEBUG: - self._check_table_vs_mlb() - - def __getitem__(self, index): - """ - Return the value of a row or a cell in this table. If - ``index`` is an integer, then the row value for the ``index``th - row. This row value consists of a tuple of cell values, one - for each column in the row. If ``index`` is a tuple of two - integers, ``(i,j)``, then return the value of the cell in the - ``i``th row and the ``j``th column. 
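# The rowconfig()/columnconfig()/itemconfig() wrappers above accept either a
# column index or a column name (resolved through column_index()).  Continuing
# the ``table`` sketch from above:
table.columnconfig("Last Name", background="#efe")
table.rowconfig(0, foreground="blue")
table.itemconfig(0, "First Name", background="#fee", selectbackground="#fee")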
- """ - if isinstance(index, slice): - raise ValueError("Slicing not supported") - elif isinstance(index, tuple) and len(index) == 2: - return self._rows[index[0]][self.column_index(index[1])] - else: - return tuple(self._rows[index]) - - def __setitem__(self, index, val): - """ - Replace the value of a row or a cell in this table with - ``val``. - - If ``index`` is an integer, then ``val`` should be a row value - (i.e., a tuple of cell values, one for each column). In this - case, the values of the ``index``th row of the table will be - replaced with the values in ``val``. - - If ``index`` is a tuple of integers, ``(i,j)``, then replace the - value of the cell in the ``i``th row and ``j``th column with - ``val``. - """ - if isinstance(index, slice): - raise ValueError("Slicing not supported") - - # table[i,j] = val - elif isinstance(index, tuple) and len(index) == 2: - i, j = index[0], self.column_index(index[1]) - config_cookie = self._save_config_info([i]) - self._rows[i][j] = val - if self._reprfunc is not None: - val = self._reprfunc(i, j, val) - self._mlb.listboxes[j].insert(i, val) - self._mlb.listboxes[j].delete(i + 1) - self._restore_config_info(config_cookie) - - # table[i] = val - else: - config_cookie = self._save_config_info([index]) - self._checkrow(val) - self._rows[index] = list(val) - if self._reprfunc is not None: - val = [self._reprfunc(index, j, v) for (j, v) in enumerate(val)] - self._mlb.insert(index, val) - self._mlb.delete(index + 1) - self._restore_config_info(config_cookie) - - def __delitem__(self, row_index): - """ - Delete the ``row_index``th row from this table. - """ - if isinstance(row_index, slice): - raise ValueError("Slicing not supported") - if isinstance(row_index, tuple) and len(row_index) == 2: - raise ValueError("Cannot delete a single cell!") - del self._rows[row_index] - self._mlb.delete(row_index) - if self._DEBUG: - self._check_table_vs_mlb() - - def __len__(self): - """ - :return: the number of rows in this table. - """ - return len(self._rows) - - def _checkrow(self, rowvalue): - """ - Helper function: check that a given row value has the correct - number of elements; and if not, raise an exception. - """ - if len(rowvalue) != self._num_columns: - raise ValueError( - "Row %r has %d columns; expected %d" - % (rowvalue, len(rowvalue), self._num_columns) - ) - - # ///////////////////////////////////////////////////////////////// - # Columns - # ///////////////////////////////////////////////////////////////// - - @property - def column_names(self): - """A list of the names of the columns in this table.""" - return self._mlb.column_names - - def column_index(self, i): - """ - If ``i`` is a valid column index integer, then return it as is. - Otherwise, check if ``i`` is used as the name for any column; - if so, return that column's index. Otherwise, raise a - ``KeyError`` exception. - """ - if isinstance(i, int) and 0 <= i < self._num_columns: - return i - else: - # This raises a key error if the column is not found. 
- return self._column_name_to_index[i] - - def hide_column(self, column_index): - """:see: ``MultiListbox.hide_column()``""" - self._mlb.hide_column(self.column_index(column_index)) - - def show_column(self, column_index): - """:see: ``MultiListbox.show_column()``""" - self._mlb.show_column(self.column_index(column_index)) - - # ///////////////////////////////////////////////////////////////// - # Selection - # ///////////////////////////////////////////////////////////////// - - def selected_row(self): - """ - Return the index of the currently selected row, or None if - no row is selected. To get the row value itself, use - ``table[table.selected_row()]``. - """ - sel = self._mlb.curselection() - if sel: - return int(sel[0]) - else: - return None - - def select(self, index=None, delta=None, see=True): - """:see: ``MultiListbox.select()``""" - self._mlb.select(index, delta, see) - - # ///////////////////////////////////////////////////////////////// - # Sorting - # ///////////////////////////////////////////////////////////////// - - def sort_by(self, column_index, order="toggle"): - """ - Sort the rows in this table, using the specified column's - values as a sort key. - - :param column_index: Specifies which column to sort, using - either a column index (int) or a column's label name - (str). - - :param order: Specifies whether to sort the values in - ascending or descending order: - - - ``'ascending'``: Sort from least to greatest. - - ``'descending'``: Sort from greatest to least. - - ``'toggle'``: If the most recent call to ``sort_by()`` - sorted the table by the same column (``column_index``), - then reverse the rows; otherwise sort in ascending - order. - """ - if order not in ("ascending", "descending", "toggle"): - raise ValueError( - 'sort_by(): order should be "ascending", ' '"descending", or "toggle".' - ) - column_index = self.column_index(column_index) - config_cookie = self._save_config_info(index_by_id=True) - - # Sort the rows. - if order == "toggle" and column_index == self._sortkey: - self._rows.reverse() - else: - self._rows.sort( - key=operator.itemgetter(column_index), reverse=(order == "descending") - ) - self._sortkey = column_index - - # Redraw the table. - self._fill_table() - self._restore_config_info(config_cookie, index_by_id=True, see=True) - if self._DEBUG: - self._check_table_vs_mlb() - - def _sort(self, event): - """Event handler for clicking on a column label -- sort by - that column.""" - column_index = event.widget.column_index - - # If they click on the far-left of far-right of a column's - # label, then resize rather than sorting. - if self._mlb._resize_column(event): - return "continue" - - # Otherwise, sort. - else: - self.sort_by(column_index) - return "continue" - - # ///////////////////////////////////////////////////////////////// - # { Table Drawing Helpers - # ///////////////////////////////////////////////////////////////// - - def _fill_table(self, save_config=True): - """ - Re-draw the table from scratch, by clearing out the table's - multi-column listbox; and then filling it in with values from - ``self._rows``. Note that any cell-, row-, or column-specific - color configuration that has been done will be lost. The - selection will also be lost -- i.e., no row will be selected - after this call completes. 
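# sort_by() takes the column by name or index; 'toggle' (the default) reverses
# the current order when the same column is sorted twice in a row.  Continuing
# the ``table`` sketch:
table.sort_by("Last Name", order="ascending")
table.sort_by("Last Name")        # defaults to 'toggle' -> reverses the sort
row = table.selected_row()        # index of the selected row, or None
if row is not None:
    print(table[row])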
- """ - self._mlb.delete(0, "end") - for i, row in enumerate(self._rows): - if self._reprfunc is not None: - row = [self._reprfunc(i, j, v) for (j, v) in enumerate(row)] - self._mlb.insert("end", row) - - def _get_itemconfig(self, r, c): - return { - k: self._mlb.itemconfig(r, c, k)[-1] - for k in ( - "foreground", - "selectforeground", - "background", - "selectbackground", - ) - } - - def _save_config_info(self, row_indices=None, index_by_id=False): - """ - Return a 'cookie' containing information about which row is - selected, and what color configurations have been applied. - this information can the be re-applied to the table (after - making modifications) using ``_restore_config_info()``. Color - configuration information will be saved for any rows in - ``row_indices``, or in the entire table, if - ``row_indices=None``. If ``index_by_id=True``, the the cookie - will associate rows with their configuration information based - on the rows' python id. This is useful when performing - operations that re-arrange the rows (e.g. ``sort``). If - ``index_by_id=False``, then it is assumed that all rows will be - in the same order when ``_restore_config_info()`` is called. - """ - # Default value for row_indices is all rows. - if row_indices is None: - row_indices = list(range(len(self._rows))) - - # Look up our current selection. - selection = self.selected_row() - if index_by_id and selection is not None: - selection = id(self._rows[selection]) - - # Look up the color configuration info for each row. - if index_by_id: - config = { - id(self._rows[r]): [ - self._get_itemconfig(r, c) for c in range(self._num_columns) - ] - for r in row_indices - } - else: - config = { - r: [self._get_itemconfig(r, c) for c in range(self._num_columns)] - for r in row_indices - } - - return selection, config - - def _restore_config_info(self, cookie, index_by_id=False, see=False): - """ - Restore selection & color configuration information that was - saved using ``_save_config_info``. - """ - selection, config = cookie - - # Clear the selection. - if selection is None: - self._mlb.selection_clear(0, "end") - - # Restore selection & color config - if index_by_id: - for r, row in enumerate(self._rows): - if id(row) in config: - for c in range(self._num_columns): - self._mlb.itemconfigure(r, c, config[id(row)][c]) - if id(row) == selection: - self._mlb.select(r, see=see) - else: - if selection is not None: - self._mlb.select(selection, see=see) - for r in config: - for c in range(self._num_columns): - self._mlb.itemconfigure(r, c, config[r][c]) - - # ///////////////////////////////////////////////////////////////// - # Debugging (Invariant Checker) - # ///////////////////////////////////////////////////////////////// - - _DEBUG = False - """If true, then run ``_check_table_vs_mlb()`` after any operation - that modifies the table.""" - - def _check_table_vs_mlb(self): - """ - Verify that the contents of the table's ``_rows`` variable match - the contents of its multi-listbox (``_mlb``). This is just - included for debugging purposes, to make sure that the - list-modifying operations are working correctly. 
- """ - for col in self._mlb.listboxes: - assert len(self) == col.size() - for row in self: - assert len(row) == self._num_columns - assert self._num_columns == len(self._mlb.column_names) - # assert self._column_names == self._mlb.column_names - for i, row in enumerate(self): - for j, cell in enumerate(row): - if self._reprfunc is not None: - cell = self._reprfunc(i, j, cell) - assert self._mlb.get(i)[j] == cell - - -###################################################################### -# Demo/Test Function -###################################################################### - -# update this to use new WordNet API -def demo(): - root = Tk() - root.bind("", lambda e: root.destroy()) - - table = Table( - root, - "Word Synset Hypernym Hyponym".split(), - column_weights=[0, 1, 1, 1], - reprfunc=(lambda i, j, s: " %s" % s), - ) - table.pack(expand=True, fill="both") - - from nltk.corpus import brown, wordnet - - for word, pos in sorted(set(brown.tagged_words()[:500])): - if pos[0] != "N": - continue - word = word.lower() - for synset in wordnet.synsets(word): - try: - hyper_def = synset.hypernyms()[0].definition() - except: - hyper_def = "*none*" - try: - hypo_def = synset.hypernyms()[0].definition() - except: - hypo_def = "*none*" - table.append([word, synset.definition(), hyper_def, hypo_def]) - - table.columnconfig("Word", background="#afa") - table.columnconfig("Synset", background="#efe") - table.columnconfig("Hypernym", background="#fee") - table.columnconfig("Hyponym", background="#ffe") - for row in range(len(table)): - for column in ("Hypernym", "Hyponym"): - if table[row, column] == "*none*": - table.itemconfig( - row, column, foreground="#666", selectforeground="#666" - ) - root.mainloop() - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/draw/tree.py b/pipeline/nltk/draw/tree.py deleted file mode 100644 index 6a2791428fcab5a47dd6d88561971d6907f74084..0000000000000000000000000000000000000000 --- a/pipeline/nltk/draw/tree.py +++ /dev/null @@ -1,1129 +0,0 @@ -# Natural Language Toolkit: Graphical Representations for Trees -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# URL: -# For license information, see LICENSE.TXT - -""" -Graphically display a Tree. -""" - -from tkinter import IntVar, Menu, Tk - -from nltk.draw.util import ( - BoxWidget, - CanvasFrame, - CanvasWidget, - OvalWidget, - ParenWidget, - TextWidget, -) -from nltk.tree import Tree -from nltk.util import in_idle - -##////////////////////////////////////////////////////// -## Tree Segment -##////////////////////////////////////////////////////// - - -class TreeSegmentWidget(CanvasWidget): - """ - A canvas widget that displays a single segment of a hierarchical - tree. Each ``TreeSegmentWidget`` connects a single "node widget" - to a sequence of zero or more "subtree widgets". By default, the - bottom of the node is connected to the top of each subtree by a - single line. However, if the ``roof`` attribute is set, then a - single triangular "roof" will connect the node to all of its - children. - - Attributes: - - ``roof``: What sort of connection to draw between the node and - its subtrees. If ``roof`` is true, draw a single triangular - "roof" over the subtrees. If ``roof`` is false, draw a line - between each subtree and the node. Default value is false. - - ``xspace``: The amount of horizontal space to leave between - subtrees when managing this widget. Default value is 10. 
- - ``yspace``: The amount of space to place between the node and - its children when managing this widget. Default value is 15. - - ``color``: The color of the lines connecting the node to its - subtrees; and of the outline of the triangular roof. Default - value is ``'#006060'``. - - ``fill``: The fill color for the triangular roof. Default - value is ``''`` (no fill). - - ``width``: The width of the lines connecting the node to its - subtrees; and of the outline of the triangular roof. Default - value is 1. - - ``orientation``: Determines whether the tree branches downwards - or rightwards. Possible values are ``'horizontal'`` and - ``'vertical'``. The default value is ``'vertical'`` (i.e., - branch downwards). - - ``draggable``: whether the widget can be dragged by the user. - """ - - def __init__(self, canvas, label, subtrees, **attribs): - """ - :type node: - :type subtrees: list(CanvasWidgetI) - """ - self._label = label - self._subtrees = subtrees - - # Attributes - self._horizontal = 0 - self._roof = 0 - self._xspace = 10 - self._yspace = 15 - self._ordered = False - - # Create canvas objects. - self._lines = [canvas.create_line(0, 0, 0, 0, fill="#006060") for c in subtrees] - self._polygon = canvas.create_polygon( - 0, 0, fill="", state="hidden", outline="#006060" - ) - - # Register child widgets (label + subtrees) - self._add_child_widget(label) - for subtree in subtrees: - self._add_child_widget(subtree) - - # Are we currently managing? - self._managing = False - - CanvasWidget.__init__(self, canvas, **attribs) - - def __setitem__(self, attr, value): - canvas = self.canvas() - if attr == "roof": - self._roof = value - if self._roof: - for l in self._lines: - canvas.itemconfig(l, state="hidden") - canvas.itemconfig(self._polygon, state="normal") - else: - for l in self._lines: - canvas.itemconfig(l, state="normal") - canvas.itemconfig(self._polygon, state="hidden") - elif attr == "orientation": - if value == "horizontal": - self._horizontal = 1 - elif value == "vertical": - self._horizontal = 0 - else: - raise ValueError("orientation must be horizontal or vertical") - elif attr == "color": - for l in self._lines: - canvas.itemconfig(l, fill=value) - canvas.itemconfig(self._polygon, outline=value) - elif isinstance(attr, tuple) and attr[0] == "color": - # Set the color of an individual line. 
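# A sketch of wiring one TreeSegmentWidget by hand, per the constructor and
# attributes documented above (the label and subtrees are themselves canvas
# widgets).  Assumes a display and the companion classes from upstream
# nltk.draw.util / nltk.draw.tree:
from nltk.draw.util import CanvasFrame, TextWidget
from nltk.draw.tree import TreeSegmentWidget

cf = CanvasFrame(width=300, height=200)
canvas = cf.canvas()
node = TextWidget(canvas, "NP")
leaves = [TextWidget(canvas, w) for w in ("the", "dog")]
seg = TreeSegmentWidget(canvas, node, leaves, color="#006060", xspace=12)
cf.add_widget(seg, 10, 10)
cf.mainloop()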
- l = self._lines[int(attr[1])] - canvas.itemconfig(l, fill=value) - elif attr == "fill": - canvas.itemconfig(self._polygon, fill=value) - elif attr == "width": - canvas.itemconfig(self._polygon, {attr: value}) - for l in self._lines: - canvas.itemconfig(l, {attr: value}) - elif attr in ("xspace", "yspace"): - if attr == "xspace": - self._xspace = value - elif attr == "yspace": - self._yspace = value - self.update(self._label) - elif attr == "ordered": - self._ordered = value - else: - CanvasWidget.__setitem__(self, attr, value) - - def __getitem__(self, attr): - if attr == "roof": - return self._roof - elif attr == "width": - return self.canvas().itemcget(self._polygon, attr) - elif attr == "color": - return self.canvas().itemcget(self._polygon, "outline") - elif isinstance(attr, tuple) and attr[0] == "color": - l = self._lines[int(attr[1])] - return self.canvas().itemcget(l, "fill") - elif attr == "xspace": - return self._xspace - elif attr == "yspace": - return self._yspace - elif attr == "orientation": - if self._horizontal: - return "horizontal" - else: - return "vertical" - elif attr == "ordered": - return self._ordered - else: - return CanvasWidget.__getitem__(self, attr) - - def label(self): - return self._label - - def subtrees(self): - return self._subtrees[:] - - def set_label(self, label): - """ - Set the node label to ``label``. - """ - self._remove_child_widget(self._label) - self._add_child_widget(label) - self._label = label - self.update(self._label) - - def replace_child(self, oldchild, newchild): - """ - Replace the child ``oldchild`` with ``newchild``. - """ - index = self._subtrees.index(oldchild) - self._subtrees[index] = newchild - self._remove_child_widget(oldchild) - self._add_child_widget(newchild) - self.update(newchild) - - def remove_child(self, child): - index = self._subtrees.index(child) - del self._subtrees[index] - self._remove_child_widget(child) - self.canvas().delete(self._lines.pop()) - self.update(self._label) - - def insert_child(self, index, child): - canvas = self.canvas() - self._subtrees.insert(index, child) - self._add_child_widget(child) - self._lines.append(canvas.create_line(0, 0, 0, 0, fill="#006060")) - self.update(self._label) - - # but.. lines??? - - def _tags(self): - if self._roof: - return [self._polygon] - else: - return self._lines - - def _subtree_top(self, child): - if isinstance(child, TreeSegmentWidget): - bbox = child.label().bbox() - else: - bbox = child.bbox() - if self._horizontal: - return (bbox[0], (bbox[1] + bbox[3]) / 2.0) - else: - return ((bbox[0] + bbox[2]) / 2.0, bbox[1]) - - def _node_bottom(self): - bbox = self._label.bbox() - if self._horizontal: - return (bbox[2], (bbox[1] + bbox[3]) / 2.0) - else: - return ((bbox[0] + bbox[2]) / 2.0, bbox[3]) - - def _update(self, child): - if len(self._subtrees) == 0: - return - if self._label.bbox() is None: - return # [XX] ??? - - # Which lines need to be redrawn? - if child is self._label: - need_update = self._subtrees - else: - need_update = [child] - - if self._ordered and not self._managing: - need_update = self._maintain_order(child) - - # Update the polygon. 
- (nodex, nodey) = self._node_bottom() - (xmin, ymin, xmax, ymax) = self._subtrees[0].bbox() - for subtree in self._subtrees[1:]: - bbox = subtree.bbox() - xmin = min(xmin, bbox[0]) - ymin = min(ymin, bbox[1]) - xmax = max(xmax, bbox[2]) - ymax = max(ymax, bbox[3]) - - if self._horizontal: - self.canvas().coords( - self._polygon, nodex, nodey, xmin, ymin, xmin, ymax, nodex, nodey - ) - else: - self.canvas().coords( - self._polygon, nodex, nodey, xmin, ymin, xmax, ymin, nodex, nodey - ) - - # Redraw all lines that need it. - for subtree in need_update: - (nodex, nodey) = self._node_bottom() - line = self._lines[self._subtrees.index(subtree)] - (subtreex, subtreey) = self._subtree_top(subtree) - self.canvas().coords(line, nodex, nodey, subtreex, subtreey) - - def _maintain_order(self, child): - if self._horizontal: - return self._maintain_order_horizontal(child) - else: - return self._maintain_order_vertical(child) - - def _maintain_order_vertical(self, child): - (left, top, right, bot) = child.bbox() - - if child is self._label: - # Check all the leaves - for subtree in self._subtrees: - (x1, y1, x2, y2) = subtree.bbox() - if bot + self._yspace > y1: - subtree.move(0, bot + self._yspace - y1) - - return self._subtrees - else: - moved = [child] - index = self._subtrees.index(child) - - # Check leaves to our right. - x = right + self._xspace - for i in range(index + 1, len(self._subtrees)): - (x1, y1, x2, y2) = self._subtrees[i].bbox() - if x > x1: - self._subtrees[i].move(x - x1, 0) - x += x2 - x1 + self._xspace - moved.append(self._subtrees[i]) - - # Check leaves to our left. - x = left - self._xspace - for i in range(index - 1, -1, -1): - (x1, y1, x2, y2) = self._subtrees[i].bbox() - if x < x2: - self._subtrees[i].move(x - x2, 0) - x -= x2 - x1 + self._xspace - moved.append(self._subtrees[i]) - - # Check the node - (x1, y1, x2, y2) = self._label.bbox() - if y2 > top - self._yspace: - self._label.move(0, top - self._yspace - y2) - moved = self._subtrees - - # Return a list of the nodes we moved - return moved - - def _maintain_order_horizontal(self, child): - (left, top, right, bot) = child.bbox() - - if child is self._label: - # Check all the leaves - for subtree in self._subtrees: - (x1, y1, x2, y2) = subtree.bbox() - if right + self._xspace > x1: - subtree.move(right + self._xspace - x1) - - return self._subtrees - else: - moved = [child] - index = self._subtrees.index(child) - - # Check leaves below us. - y = bot + self._yspace - for i in range(index + 1, len(self._subtrees)): - (x1, y1, x2, y2) = self._subtrees[i].bbox() - if y > y1: - self._subtrees[i].move(0, y - y1) - y += y2 - y1 + self._yspace - moved.append(self._subtrees[i]) - - # Check leaves above us - y = top - self._yspace - for i in range(index - 1, -1, -1): - (x1, y1, x2, y2) = self._subtrees[i].bbox() - if y < y2: - self._subtrees[i].move(0, y - y2) - y -= y2 - y1 + self._yspace - moved.append(self._subtrees[i]) - - # Check the node - (x1, y1, x2, y2) = self._label.bbox() - if x2 > left - self._xspace: - self._label.move(left - self._xspace - x2, 0) - moved = self._subtrees - - # Return a list of the nodes we moved - return moved - - def _manage_horizontal(self): - (nodex, nodey) = self._node_bottom() - - # Put the subtrees in a line. - y = 20 - for subtree in self._subtrees: - subtree_bbox = subtree.bbox() - dx = nodex - subtree_bbox[0] + self._xspace - dy = y - subtree_bbox[1] - subtree.move(dx, dy) - y += subtree_bbox[3] - subtree_bbox[1] + self._yspace - - # Find the center of their tops. 
- center = 0.0 - for subtree in self._subtrees: - center += self._subtree_top(subtree)[1] - center /= len(self._subtrees) - - # Center the subtrees with the node. - for subtree in self._subtrees: - subtree.move(0, nodey - center) - - def _manage_vertical(self): - (nodex, nodey) = self._node_bottom() - - # Put the subtrees in a line. - x = 0 - for subtree in self._subtrees: - subtree_bbox = subtree.bbox() - dy = nodey - subtree_bbox[1] + self._yspace - dx = x - subtree_bbox[0] - subtree.move(dx, dy) - x += subtree_bbox[2] - subtree_bbox[0] + self._xspace - - # Find the center of their tops. - center = 0.0 - for subtree in self._subtrees: - center += self._subtree_top(subtree)[0] / len(self._subtrees) - - # Center the subtrees with the node. - for subtree in self._subtrees: - subtree.move(nodex - center, 0) - - def _manage(self): - self._managing = True - (nodex, nodey) = self._node_bottom() - if len(self._subtrees) == 0: - return - - if self._horizontal: - self._manage_horizontal() - else: - self._manage_vertical() - - # Update lines to subtrees. - for subtree in self._subtrees: - self._update(subtree) - - self._managing = False - - def __repr__(self): - return f"[TreeSeg {self._label}: {self._subtrees}]" - - -def _tree_to_treeseg( - canvas, - t, - make_node, - make_leaf, - tree_attribs, - node_attribs, - leaf_attribs, - loc_attribs, -): - if isinstance(t, Tree): - label = make_node(canvas, t.label(), **node_attribs) - subtrees = [ - _tree_to_treeseg( - canvas, - child, - make_node, - make_leaf, - tree_attribs, - node_attribs, - leaf_attribs, - loc_attribs, - ) - for child in t - ] - return TreeSegmentWidget(canvas, label, subtrees, **tree_attribs) - else: - return make_leaf(canvas, t, **leaf_attribs) - - -def tree_to_treesegment( - canvas, t, make_node=TextWidget, make_leaf=TextWidget, **attribs -): - """ - Convert a Tree into a ``TreeSegmentWidget``. - - :param make_node: A ``CanvasWidget`` constructor or a function that - creates ``CanvasWidgets``. ``make_node`` is used to convert - the Tree's nodes into ``CanvasWidgets``. If no constructor - is specified, then ``TextWidget`` will be used. - :param make_leaf: A ``CanvasWidget`` constructor or a function that - creates ``CanvasWidgets``. ``make_leaf`` is used to convert - the Tree's leafs into ``CanvasWidgets``. If no constructor - is specified, then ``TextWidget`` will be used. - :param attribs: Attributes for the canvas widgets that make up the - returned ``TreeSegmentWidget``. Any attribute beginning with - ``'tree_'`` will be passed to all ``TreeSegmentWidgets`` (with - the ``'tree_'`` prefix removed. Any attribute beginning with - ``'node_'`` will be passed to all nodes. Any attribute - beginning with ``'leaf_'`` will be passed to all leaves. And - any attribute beginning with ``'loc_'`` will be passed to all - text locations (for Trees). - """ - # Process attribs. 
- tree_attribs = {} - node_attribs = {} - leaf_attribs = {} - loc_attribs = {} - - for (key, value) in list(attribs.items()): - if key[:5] == "tree_": - tree_attribs[key[5:]] = value - elif key[:5] == "node_": - node_attribs[key[5:]] = value - elif key[:5] == "leaf_": - leaf_attribs[key[5:]] = value - elif key[:4] == "loc_": - loc_attribs[key[4:]] = value - else: - raise ValueError("Bad attribute: %s" % key) - return _tree_to_treeseg( - canvas, - t, - make_node, - make_leaf, - tree_attribs, - node_attribs, - leaf_attribs, - loc_attribs, - ) - - -##////////////////////////////////////////////////////// -## Tree Widget -##////////////////////////////////////////////////////// - - -class TreeWidget(CanvasWidget): - """ - A canvas widget that displays a single Tree. - ``TreeWidget`` manages a group of ``TreeSegmentWidgets`` that are - used to display a Tree. - - Attributes: - - - ``node_attr``: Sets the attribute ``attr`` on all of the - node widgets for this ``TreeWidget``. - - ``node_attr``: Sets the attribute ``attr`` on all of the - leaf widgets for this ``TreeWidget``. - - ``loc_attr``: Sets the attribute ``attr`` on all of the - location widgets for this ``TreeWidget`` (if it was built from - a Tree). Note that a location widget is a ``TextWidget``. - - - ``xspace``: The amount of horizontal space to leave between - subtrees when managing this widget. Default value is 10. - - ``yspace``: The amount of space to place between the node and - its children when managing this widget. Default value is 15. - - - ``line_color``: The color of the lines connecting each expanded - node to its subtrees. - - ``roof_color``: The color of the outline of the triangular roof - for collapsed trees. - - ``roof_fill``: The fill color for the triangular roof for - collapsed trees. - - ``width`` - - - ``orientation``: Determines whether the tree branches downwards - or rightwards. Possible values are ``'horizontal'`` and - ``'vertical'``. The default value is ``'vertical'`` (i.e., - branch downwards). - - - ``shapeable``: whether the subtrees can be independently - dragged by the user. THIS property simply sets the - ``DRAGGABLE`` property on all of the ``TreeWidget``'s tree - segments. - - ``draggable``: whether the widget can be dragged by the user. - """ - - def __init__( - self, canvas, t, make_node=TextWidget, make_leaf=TextWidget, **attribs - ): - # Node & leaf canvas widget constructors - self._make_node = make_node - self._make_leaf = make_leaf - self._tree = t - - # Attributes. - self._nodeattribs = {} - self._leafattribs = {} - self._locattribs = {"color": "#008000"} - self._line_color = "#008080" - self._line_width = 1 - self._roof_color = "#008080" - self._roof_fill = "#c0c0c0" - self._shapeable = False - self._xspace = 10 - self._yspace = 10 - self._orientation = "vertical" - self._ordered = False - - # Build trees. - self._keys = {} # treeseg -> key - self._expanded_trees = {} - self._collapsed_trees = {} - self._nodes = [] - self._leaves = [] - # self._locs = [] - self._make_collapsed_trees(canvas, t, ()) - self._treeseg = self._make_expanded_tree(canvas, t, ()) - self._add_child_widget(self._treeseg) - - CanvasWidget.__init__(self, canvas, **attribs) - - def expanded_tree(self, *path_to_tree): - """ - Return the ``TreeSegmentWidget`` for the specified subtree. - - :param path_to_tree: A list of indices i1, i2, ..., in, where - the desired widget is the widget corresponding to - ``tree.children()[i1].children()[i2]....children()[in]``. - For the root, the path is ``()``. 
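# tree_to_treesegment() routes keyword attributes by prefix, as documented
# above: "tree_*" to every TreeSegmentWidget, "node_*" / "leaf_*" to the node
# and leaf widgets.  A short sketch (assumes a display and upstream nltk):
from nltk.tree import Tree
from nltk.draw.util import CanvasFrame
from nltk.draw.tree import tree_to_treesegment

cf = CanvasFrame(width=400, height=300)
t = Tree.fromstring("(S (NP the dog) (VP barked))")
seg = tree_to_treesegment(cf.canvas(), t,
                          tree_color="green4", tree_width=2,
                          node_color="blue2", leaf_color="#008040")
seg["draggable"] = 1
cf.add_widget(seg, 10, 10)
cf.mainloop()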
- """ - return self._expanded_trees[path_to_tree] - - def collapsed_tree(self, *path_to_tree): - """ - Return the ``TreeSegmentWidget`` for the specified subtree. - - :param path_to_tree: A list of indices i1, i2, ..., in, where - the desired widget is the widget corresponding to - ``tree.children()[i1].children()[i2]....children()[in]``. - For the root, the path is ``()``. - """ - return self._collapsed_trees[path_to_tree] - - def bind_click_trees(self, callback, button=1): - """ - Add a binding to all tree segments. - """ - for tseg in list(self._expanded_trees.values()): - tseg.bind_click(callback, button) - for tseg in list(self._collapsed_trees.values()): - tseg.bind_click(callback, button) - - def bind_drag_trees(self, callback, button=1): - """ - Add a binding to all tree segments. - """ - for tseg in list(self._expanded_trees.values()): - tseg.bind_drag(callback, button) - for tseg in list(self._collapsed_trees.values()): - tseg.bind_drag(callback, button) - - def bind_click_leaves(self, callback, button=1): - """ - Add a binding to all leaves. - """ - for leaf in self._leaves: - leaf.bind_click(callback, button) - for leaf in self._leaves: - leaf.bind_click(callback, button) - - def bind_drag_leaves(self, callback, button=1): - """ - Add a binding to all leaves. - """ - for leaf in self._leaves: - leaf.bind_drag(callback, button) - for leaf in self._leaves: - leaf.bind_drag(callback, button) - - def bind_click_nodes(self, callback, button=1): - """ - Add a binding to all nodes. - """ - for node in self._nodes: - node.bind_click(callback, button) - for node in self._nodes: - node.bind_click(callback, button) - - def bind_drag_nodes(self, callback, button=1): - """ - Add a binding to all nodes. - """ - for node in self._nodes: - node.bind_drag(callback, button) - for node in self._nodes: - node.bind_drag(callback, button) - - def _make_collapsed_trees(self, canvas, t, key): - if not isinstance(t, Tree): - return - make_node = self._make_node - make_leaf = self._make_leaf - - node = make_node(canvas, t.label(), **self._nodeattribs) - self._nodes.append(node) - leaves = [make_leaf(canvas, l, **self._leafattribs) for l in t.leaves()] - self._leaves += leaves - treeseg = TreeSegmentWidget( - canvas, - node, - leaves, - roof=1, - color=self._roof_color, - fill=self._roof_fill, - width=self._line_width, - ) - - self._collapsed_trees[key] = treeseg - self._keys[treeseg] = key - # self._add_child_widget(treeseg) - treeseg.hide() - - # Build trees for children. 
- for i in range(len(t)): - child = t[i] - self._make_collapsed_trees(canvas, child, key + (i,)) - - def _make_expanded_tree(self, canvas, t, key): - make_node = self._make_node - make_leaf = self._make_leaf - - if isinstance(t, Tree): - node = make_node(canvas, t.label(), **self._nodeattribs) - self._nodes.append(node) - children = t - subtrees = [ - self._make_expanded_tree(canvas, children[i], key + (i,)) - for i in range(len(children)) - ] - treeseg = TreeSegmentWidget( - canvas, node, subtrees, color=self._line_color, width=self._line_width - ) - self._expanded_trees[key] = treeseg - self._keys[treeseg] = key - return treeseg - else: - leaf = make_leaf(canvas, t, **self._leafattribs) - self._leaves.append(leaf) - return leaf - - def __setitem__(self, attr, value): - if attr[:5] == "node_": - for node in self._nodes: - node[attr[5:]] = value - elif attr[:5] == "leaf_": - for leaf in self._leaves: - leaf[attr[5:]] = value - elif attr == "line_color": - self._line_color = value - for tseg in list(self._expanded_trees.values()): - tseg["color"] = value - elif attr == "line_width": - self._line_width = value - for tseg in list(self._expanded_trees.values()): - tseg["width"] = value - for tseg in list(self._collapsed_trees.values()): - tseg["width"] = value - elif attr == "roof_color": - self._roof_color = value - for tseg in list(self._collapsed_trees.values()): - tseg["color"] = value - elif attr == "roof_fill": - self._roof_fill = value - for tseg in list(self._collapsed_trees.values()): - tseg["fill"] = value - elif attr == "shapeable": - self._shapeable = value - for tseg in list(self._expanded_trees.values()): - tseg["draggable"] = value - for tseg in list(self._collapsed_trees.values()): - tseg["draggable"] = value - for leaf in self._leaves: - leaf["draggable"] = value - elif attr == "xspace": - self._xspace = value - for tseg in list(self._expanded_trees.values()): - tseg["xspace"] = value - for tseg in list(self._collapsed_trees.values()): - tseg["xspace"] = value - self.manage() - elif attr == "yspace": - self._yspace = value - for tseg in list(self._expanded_trees.values()): - tseg["yspace"] = value - for tseg in list(self._collapsed_trees.values()): - tseg["yspace"] = value - self.manage() - elif attr == "orientation": - self._orientation = value - for tseg in list(self._expanded_trees.values()): - tseg["orientation"] = value - for tseg in list(self._collapsed_trees.values()): - tseg["orientation"] = value - self.manage() - elif attr == "ordered": - self._ordered = value - for tseg in list(self._expanded_trees.values()): - tseg["ordered"] = value - for tseg in list(self._collapsed_trees.values()): - tseg["ordered"] = value - else: - CanvasWidget.__setitem__(self, attr, value) - - def __getitem__(self, attr): - if attr[:5] == "node_": - return self._nodeattribs.get(attr[5:], None) - elif attr[:5] == "leaf_": - return self._leafattribs.get(attr[5:], None) - elif attr[:4] == "loc_": - return self._locattribs.get(attr[4:], None) - elif attr == "line_color": - return self._line_color - elif attr == "line_width": - return self._line_width - elif attr == "roof_color": - return self._roof_color - elif attr == "roof_fill": - return self._roof_fill - elif attr == "shapeable": - return self._shapeable - elif attr == "xspace": - return self._xspace - elif attr == "yspace": - return self._yspace - elif attr == "orientation": - return self._orientation - else: - return CanvasWidget.__getitem__(self, attr) - - def _tags(self): - return [] - - def _manage(self): - segs = 
list(self._expanded_trees.values()) + list( - self._collapsed_trees.values() - ) - for tseg in segs: - if tseg.hidden(): - tseg.show() - tseg.manage() - tseg.hide() - - def toggle_collapsed(self, treeseg): - """ - Collapse/expand a tree. - """ - old_treeseg = treeseg - if old_treeseg["roof"]: - new_treeseg = self._expanded_trees[self._keys[old_treeseg]] - else: - new_treeseg = self._collapsed_trees[self._keys[old_treeseg]] - - # Replace the old tree with the new tree. - if old_treeseg.parent() is self: - self._remove_child_widget(old_treeseg) - self._add_child_widget(new_treeseg) - self._treeseg = new_treeseg - else: - old_treeseg.parent().replace_child(old_treeseg, new_treeseg) - - # Move the new tree to where the old tree was. Show it first, - # so we can find its bounding box. - new_treeseg.show() - (newx, newy) = new_treeseg.label().bbox()[:2] - (oldx, oldy) = old_treeseg.label().bbox()[:2] - new_treeseg.move(oldx - newx, oldy - newy) - - # Hide the old tree - old_treeseg.hide() - - # We could do parent.manage() here instead, if we wanted. - new_treeseg.parent().update(new_treeseg) - - -##////////////////////////////////////////////////////// -## draw_trees -##////////////////////////////////////////////////////// - - -class TreeView: - def __init__(self, *trees): - from math import ceil, sqrt - - self._trees = trees - - self._top = Tk() - self._top.title("NLTK") - self._top.bind("", self.destroy) - self._top.bind("", self.destroy) - - cf = self._cframe = CanvasFrame(self._top) - self._top.bind("", self._cframe.print_to_file) - - # Size is variable. - self._size = IntVar(self._top) - self._size.set(12) - bold = ("helvetica", -self._size.get(), "bold") - helv = ("helvetica", -self._size.get()) - - # Lay the trees out in a square. - self._width = int(ceil(sqrt(len(trees)))) - self._widgets = [] - for i in range(len(trees)): - widget = TreeWidget( - cf.canvas(), - trees[i], - node_font=bold, - leaf_color="#008040", - node_color="#004080", - roof_color="#004040", - roof_fill="white", - line_color="#004040", - draggable=1, - leaf_font=helv, - ) - widget.bind_click_trees(widget.toggle_collapsed) - self._widgets.append(widget) - cf.add_widget(widget, 0, 0) - - self._layout() - self._cframe.pack(expand=1, fill="both") - self._init_menubar() - - def _layout(self): - i = x = y = ymax = 0 - width = self._width - for i in range(len(self._widgets)): - widget = self._widgets[i] - (oldx, oldy) = widget.bbox()[:2] - if i % width == 0: - y = ymax - x = 0 - widget.move(x - oldx, y - oldy) - x = widget.bbox()[2] + 10 - ymax = max(ymax, widget.bbox()[3] + 10) - - def _init_menubar(self): - menubar = Menu(self._top) - - filemenu = Menu(menubar, tearoff=0) - filemenu.add_command( - label="Print to Postscript", - underline=0, - command=self._cframe.print_to_file, - accelerator="Ctrl-p", - ) - filemenu.add_command( - label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x" - ) - menubar.add_cascade(label="File", underline=0, menu=filemenu) - - zoommenu = Menu(menubar, tearoff=0) - zoommenu.add_radiobutton( - label="Tiny", - variable=self._size, - underline=0, - value=10, - command=self.resize, - ) - zoommenu.add_radiobutton( - label="Small", - variable=self._size, - underline=0, - value=12, - command=self.resize, - ) - zoommenu.add_radiobutton( - label="Medium", - variable=self._size, - underline=0, - value=14, - command=self.resize, - ) - zoommenu.add_radiobutton( - label="Large", - variable=self._size, - underline=0, - value=28, - command=self.resize, - ) - zoommenu.add_radiobutton( - 
label="Huge", - variable=self._size, - underline=0, - value=50, - command=self.resize, - ) - menubar.add_cascade(label="Zoom", underline=0, menu=zoommenu) - - self._top.config(menu=menubar) - - def resize(self, *e): - bold = ("helvetica", -self._size.get(), "bold") - helv = ("helvetica", -self._size.get()) - xspace = self._size.get() - yspace = self._size.get() - for widget in self._widgets: - widget["node_font"] = bold - widget["leaf_font"] = helv - widget["xspace"] = xspace - widget["yspace"] = yspace - if self._size.get() < 20: - widget["line_width"] = 1 - elif self._size.get() < 30: - widget["line_width"] = 2 - else: - widget["line_width"] = 3 - self._layout() - - def destroy(self, *e): - if self._top is None: - return - self._top.destroy() - self._top = None - - def mainloop(self, *args, **kwargs): - """ - Enter the Tkinter mainloop. This function must be called if - this demo is created from a non-interactive program (e.g. - from a secript); otherwise, the demo will close as soon as - the script completes. - """ - if in_idle(): - return - self._top.mainloop(*args, **kwargs) - - -def draw_trees(*trees): - """ - Open a new window containing a graphical diagram of the given - trees. - - :rtype: None - """ - TreeView(*trees).mainloop() - return - - -##////////////////////////////////////////////////////// -## Demo Code -##////////////////////////////////////////////////////// - - -def demo(): - import random - - def fill(cw): - cw["fill"] = "#%06d" % random.randint(0, 999999) - - cf = CanvasFrame(width=550, height=450, closeenough=2) - - t = Tree.fromstring( - """ - (S (NP the very big cat) - (VP (Adv sorta) (V saw) (NP (Det the) (N dog))))""" - ) - - tc = TreeWidget( - cf.canvas(), - t, - draggable=1, - node_font=("helvetica", -14, "bold"), - leaf_font=("helvetica", -12, "italic"), - roof_fill="white", - roof_color="black", - leaf_color="green4", - node_color="blue2", - ) - cf.add_widget(tc, 10, 10) - - def boxit(canvas, text): - big = ("helvetica", -16, "bold") - return BoxWidget(canvas, TextWidget(canvas, text, font=big), fill="green") - - def ovalit(canvas, text): - return OvalWidget(canvas, TextWidget(canvas, text), fill="cyan") - - treetok = Tree.fromstring("(S (NP this tree) (VP (V is) (AdjP shapeable)))") - tc2 = TreeWidget(cf.canvas(), treetok, boxit, ovalit, shapeable=1) - - def color(node): - node["color"] = "#%04d00" % random.randint(0, 9999) - - def color2(treeseg): - treeseg.label()["fill"] = "#%06d" % random.randint(0, 9999) - treeseg.label().child()["color"] = "white" - - tc.bind_click_trees(tc.toggle_collapsed) - tc2.bind_click_trees(tc2.toggle_collapsed) - tc.bind_click_nodes(color, 3) - tc2.expanded_tree(1).bind_click(color2, 3) - tc2.expanded_tree().bind_click(color2, 3) - - paren = ParenWidget(cf.canvas(), tc2) - cf.add_widget(paren, tc.bbox()[2] + 10, 10) - - tree3 = Tree.fromstring( - """ - (S (NP this tree) (AUX was) - (VP (V built) (PP (P with) (NP (N tree_to_treesegment)))))""" - ) - tc3 = tree_to_treesegment( - cf.canvas(), tree3, tree_color="green4", tree_xspace=2, tree_width=2 - ) - tc3["draggable"] = 1 - cf.add_widget(tc3, 10, tc.bbox()[3] + 10) - - def orientswitch(treewidget): - if treewidget["orientation"] == "horizontal": - treewidget.expanded_tree(1, 1).subtrees()[0].set_text("vertical") - treewidget.collapsed_tree(1, 1).subtrees()[0].set_text("vertical") - treewidget.collapsed_tree(1).subtrees()[1].set_text("vertical") - treewidget.collapsed_tree().subtrees()[3].set_text("vertical") - treewidget["orientation"] = "vertical" - else: - 
treewidget.expanded_tree(1, 1).subtrees()[0].set_text("horizontal") - treewidget.collapsed_tree(1, 1).subtrees()[0].set_text("horizontal") - treewidget.collapsed_tree(1).subtrees()[1].set_text("horizontal") - treewidget.collapsed_tree().subtrees()[3].set_text("horizontal") - treewidget["orientation"] = "horizontal" - - text = """ -Try clicking, right clicking, and dragging -different elements of each of the trees. -The top-left tree is a TreeWidget built from -a Tree. The top-right is a TreeWidget built -from a Tree, using non-default widget -constructors for the nodes & leaves (BoxWidget -and OvalWidget). The bottom-left tree is -built from tree_to_treesegment.""" - twidget = TextWidget(cf.canvas(), text.strip()) - textbox = BoxWidget(cf.canvas(), twidget, fill="white", draggable=1) - cf.add_widget(textbox, tc3.bbox()[2] + 10, tc2.bbox()[3] + 10) - - tree4 = Tree.fromstring("(S (NP this tree) (VP (V is) (Adj horizontal)))") - tc4 = TreeWidget( - cf.canvas(), - tree4, - draggable=1, - line_color="brown2", - roof_color="brown2", - node_font=("helvetica", -12, "bold"), - node_color="brown4", - orientation="horizontal", - ) - tc4.manage() - cf.add_widget(tc4, tc3.bbox()[2] + 10, textbox.bbox()[3] + 10) - tc4.bind_click(orientswitch) - tc4.bind_click_trees(tc4.toggle_collapsed, 3) - - # Run mainloop - cf.mainloop() - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/draw/util.py b/pipeline/nltk/draw/util.py deleted file mode 100644 index 31ae442099a892a6e84a0dbf3ff284d7aa184b3f..0000000000000000000000000000000000000000 --- a/pipeline/nltk/draw/util.py +++ /dev/null @@ -1,2575 +0,0 @@ -# Natural Language Toolkit: Drawing utilities -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# URL: -# For license information, see LICENSE.TXT - -""" -Tools for graphically displaying and interacting with the objects and -processing classes defined by the Toolkit. These tools are primarily -intended to help students visualize the objects that they create. - -The graphical tools are typically built using "canvas widgets", each -of which encapsulates the graphical elements and bindings used to -display a complex object on a Tkinter ``Canvas``. For example, NLTK -defines canvas widgets for displaying trees and directed graphs, as -well as a number of simpler widgets. These canvas widgets make it -easier to build new graphical tools and demos. See the class -documentation for ``CanvasWidget`` for more information. - -The ``nltk.draw`` module defines the abstract ``CanvasWidget`` base -class, and a number of simple canvas widgets. The remaining canvas -widgets are defined by submodules, such as ``nltk.draw.tree``. - -The ``nltk.draw`` module also defines ``CanvasFrame``, which -encapsulates a ``Canvas`` and its scrollbars. It uses a -``ScrollWatcherWidget`` to ensure that all canvas widgets contained on -its canvas are within the scroll region. - -Acknowledgements: Many of the ideas behind the canvas widget system -are derived from ``CLIG``, a Tk-based grapher for linguistic data -structures. For more information, see the CLIG -homepage (http://www.ags.uni-sb.de/~konrad/clig.html). 
- -""" -from abc import ABCMeta, abstractmethod -from tkinter import ( - RAISED, - Button, - Canvas, - Entry, - Frame, - Label, - Menu, - Menubutton, - Scrollbar, - StringVar, - Text, - Tk, - Toplevel, - Widget, -) -from tkinter.filedialog import asksaveasfilename - -from nltk.util import in_idle - -##////////////////////////////////////////////////////// -## CanvasWidget -##////////////////////////////////////////////////////// - - -class CanvasWidget(metaclass=ABCMeta): - """ - A collection of graphical elements and bindings used to display a - complex object on a Tkinter ``Canvas``. A canvas widget is - responsible for managing the ``Canvas`` tags and callback bindings - necessary to display and interact with the object. Canvas widgets - are often organized into hierarchies, where parent canvas widgets - control aspects of their child widgets. - - Each canvas widget is bound to a single ``Canvas``. This ``Canvas`` - is specified as the first argument to the ``CanvasWidget``'s - constructor. - - Attributes. Each canvas widget can support a variety of - "attributes", which control how the canvas widget is displayed. - Some typical examples attributes are ``color``, ``font``, and - ``radius``. Each attribute has a default value. This default - value can be overridden in the constructor, using keyword - arguments of the form ``attribute=value``: - - >>> from nltk.draw.util import TextWidget - >>> cn = TextWidget(Canvas(), 'test', color='red') # doctest: +SKIP - - Attribute values can also be changed after a canvas widget has - been constructed, using the ``__setitem__`` operator: - - >>> cn['font'] = 'times' # doctest: +SKIP - - The current value of an attribute value can be queried using the - ``__getitem__`` operator: - - >>> cn['color'] # doctest: +SKIP - 'red' - - For a list of the attributes supported by a type of canvas widget, - see its class documentation. - - Interaction. The attribute ``'draggable'`` controls whether the - user can drag a canvas widget around the canvas. By default, - canvas widgets are not draggable. - - ``CanvasWidget`` provides callback support for two types of user - interaction: clicking and dragging. The method ``bind_click`` - registers a callback function that is called whenever the canvas - widget is clicked. The method ``bind_drag`` registers a callback - function that is called after the canvas widget is dragged. If - the user clicks or drags a canvas widget with no registered - callback function, then the interaction event will propagate to - its parent. For each canvas widget, only one callback function - may be registered for an interaction event. Callback functions - can be deregistered with the ``unbind_click`` and ``unbind_drag`` - methods. - - Subclassing. ``CanvasWidget`` is an abstract class. Subclasses - are required to implement the following methods: - - - ``__init__``: Builds a new canvas widget. It must perform the - following three tasks (in order): - - - Create any new graphical elements. - - Call ``_add_child_widget`` on each child widget. - - Call the ``CanvasWidget`` constructor. - - ``_tags``: Returns a list of the canvas tags for all graphical - elements managed by this canvas widget, not including - graphical elements managed by its child widgets. - - ``_manage``: Arranges the child widgets of this canvas widget. - This is typically only called when the canvas widget is - created. - - ``_update``: Update this canvas widget in response to a - change in a single child. 
- - For a ``CanvasWidget`` with no child widgets, the default - definitions for ``_manage`` and ``_update`` may be used. - - If a subclass defines any attributes, then it should implement - ``__getitem__`` and ``__setitem__``. If either of these methods is - called with an unknown attribute, then they should propagate the - request to ``CanvasWidget``. - - Most subclasses implement a number of additional methods that - modify the ``CanvasWidget`` in some way. These methods must call - ``parent.update(self)`` after making any changes to the canvas - widget's graphical elements. The canvas widget must also call - ``parent.update(self)`` after changing any attribute value that - affects the shape or position of the canvas widget's graphical - elements. - - :type __canvas: Tkinter.Canvas - :ivar __canvas: This ``CanvasWidget``'s canvas. - - :type __parent: CanvasWidget or None - :ivar __parent: This ``CanvasWidget``'s hierarchical parent widget. - :type __children: list(CanvasWidget) - :ivar __children: This ``CanvasWidget``'s hierarchical child widgets. - - :type __updating: bool - :ivar __updating: Is this canvas widget currently performing an - update? If it is, then it will ignore any new update requests - from child widgets. - - :type __draggable: bool - :ivar __draggable: Is this canvas widget draggable? - :type __press: event - :ivar __press: The ButtonPress event that we're currently handling. - :type __drag_x: int - :ivar __drag_x: Where it's been moved to (to find dx) - :type __drag_y: int - :ivar __drag_y: Where it's been moved to (to find dy) - :type __callbacks: dictionary - :ivar __callbacks: Registered callbacks. Currently, four keys are - used: ``1``, ``2``, ``3``, and ``'drag'``. The values are - callback functions. Each callback function takes a single - argument, which is the ``CanvasWidget`` that triggered the - callback. - """ - - def __init__(self, canvas, parent=None, **attribs): - """ - Create a new canvas widget. This constructor should only be - called by subclass constructors; and it should be called only - "after" the subclass has constructed all graphical canvas - objects and registered all child widgets. - - :param canvas: This canvas widget's canvas. - :type canvas: Tkinter.Canvas - :param parent: This canvas widget's hierarchical parent. - :type parent: CanvasWidget - :param attribs: The new canvas widget's attributes. - """ - if self.__class__ == CanvasWidget: - raise TypeError("CanvasWidget is an abstract base class") - - if not isinstance(canvas, Canvas): - raise TypeError("Expected a canvas!") - - self.__canvas = canvas - self.__parent = parent - - # If the subclass constructor called _add_child_widget, then - # self.__children will already exist. - if not hasattr(self, "_CanvasWidget__children"): - self.__children = [] - - # Is this widget hidden? - self.__hidden = 0 - - # Update control (prevents infinite loops) - self.__updating = 0 - - # Button-press and drag callback handling. - self.__press = None - self.__drag_x = self.__drag_y = 0 - self.__callbacks = {} - self.__draggable = 0 - - # Set up attributes. - for (attr, value) in list(attribs.items()): - self[attr] = value - - # Manage this canvas widget - self._manage() - - # Register any new bindings - for tag in self._tags(): - self.__canvas.tag_bind(tag, "", self.__press_cb) - self.__canvas.tag_bind(tag, "", self.__press_cb) - self.__canvas.tag_bind(tag, "", self.__press_cb) - - ##////////////////////////////////////////////////////// - ## Inherited methods. 
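# Usage sketch (illustrative) of the attribute and click-callback interface
# described in the CanvasWidget docstring above. It reuses the docstring's own
# TextWidget example; the callback name is just an example.
from tkinter import Tk, Canvas
from nltk.draw.util import TextWidget

root = Tk()
canvas = Canvas(root, width=300, height=100)
canvas.pack()

cn = TextWidget(canvas, 'test', color='red', draggable=1)   # attributes as keywords
cn['font'] = 'times'       # attributes can also be changed after construction
print(cn['color'])         # -> 'red'

def on_click(widget):      # called with the clicked CanvasWidget as its argument
    print('clicked', widget)

cn.bind_click(on_click, button=1)
root.mainloop()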
- ##////////////////////////////////////////////////////// - - def bbox(self): - """ - :return: A bounding box for this ``CanvasWidget``. The bounding - box is a tuple of four coordinates, *(xmin, ymin, xmax, ymax)*, - for a rectangle which encloses all of the canvas - widget's graphical elements. Bounding box coordinates are - specified with respect to the coordinate space of the ``Canvas``. - :rtype: tuple(int, int, int, int) - """ - if self.__hidden: - return (0, 0, 0, 0) - if len(self.tags()) == 0: - raise ValueError("No tags") - return self.__canvas.bbox(*self.tags()) - - def width(self): - """ - :return: The width of this canvas widget's bounding box, in - its ``Canvas``'s coordinate space. - :rtype: int - """ - if len(self.tags()) == 0: - raise ValueError("No tags") - bbox = self.__canvas.bbox(*self.tags()) - return bbox[2] - bbox[0] - - def height(self): - """ - :return: The height of this canvas widget's bounding box, in - its ``Canvas``'s coordinate space. - :rtype: int - """ - if len(self.tags()) == 0: - raise ValueError("No tags") - bbox = self.__canvas.bbox(*self.tags()) - return bbox[3] - bbox[1] - - def parent(self): - """ - :return: The hierarchical parent of this canvas widget. - ``self`` is considered a subpart of its parent for - purposes of user interaction. - :rtype: CanvasWidget or None - """ - return self.__parent - - def child_widgets(self): - """ - :return: A list of the hierarchical children of this canvas - widget. These children are considered part of ``self`` - for purposes of user interaction. - :rtype: list of CanvasWidget - """ - return self.__children - - def canvas(self): - """ - :return: The canvas that this canvas widget is bound to. - :rtype: Tkinter.Canvas - """ - return self.__canvas - - def move(self, dx, dy): - """ - Move this canvas widget by a given distance. In particular, - shift the canvas widget right by ``dx`` pixels, and down by - ``dy`` pixels. Both ``dx`` and ``dy`` may be negative, resulting - in leftward or upward movement. - - :type dx: int - :param dx: The number of pixels to move this canvas widget - rightwards. - :type dy: int - :param dy: The number of pixels to move this canvas widget - downwards. - :rtype: None - """ - if dx == dy == 0: - return - for tag in self.tags(): - self.__canvas.move(tag, dx, dy) - if self.__parent: - self.__parent.update(self) - - def moveto(self, x, y, anchor="NW"): - """ - Move this canvas widget to the given location. In particular, - shift the canvas widget such that the corner or side of the - bounding box specified by ``anchor`` is at location (``x``, - ``y``). - - :param x,y: The location that the canvas widget should be moved - to. - :param anchor: The corner or side of the canvas widget that - should be moved to the specified location. ``'N'`` - specifies the top center; ``'NE'`` specifies the top right - corner; etc. - """ - x1, y1, x2, y2 = self.bbox() - if anchor == "NW": - self.move(x - x1, y - y1) - if anchor == "N": - self.move(x - x1 / 2 - x2 / 2, y - y1) - if anchor == "NE": - self.move(x - x2, y - y1) - if anchor == "E": - self.move(x - x2, y - y1 / 2 - y2 / 2) - if anchor == "SE": - self.move(x - x2, y - y2) - if anchor == "S": - self.move(x - x1 / 2 - x2 / 2, y - y2) - if anchor == "SW": - self.move(x - x1, y - y2) - if anchor == "W": - self.move(x - x1, y - y1 / 2 - y2 / 2) - - def destroy(self): - """ - Remove this ``CanvasWidget`` from its ``Canvas``. After a - ``CanvasWidget`` has been destroyed, it should not be accessed. 
- - Note that you only need to destroy a top-level - ``CanvasWidget``; its child widgets will be destroyed - automatically. If you destroy a non-top-level - ``CanvasWidget``, then the entire top-level widget will be - destroyed. - - :raise ValueError: if this ``CanvasWidget`` has a parent. - :rtype: None - """ - if self.__parent is not None: - self.__parent.destroy() - return - - for tag in self.tags(): - self.__canvas.tag_unbind(tag, "") - self.__canvas.tag_unbind(tag, "") - self.__canvas.tag_unbind(tag, "") - self.__canvas.delete(*self.tags()) - self.__canvas = None - - def update(self, child): - """ - Update the graphical display of this canvas widget, and all of - its ancestors, in response to a change in one of this canvas - widget's children. - - :param child: The child widget that changed. - :type child: CanvasWidget - """ - if self.__hidden or child.__hidden: - return - # If we're already updating, then do nothing. This prevents - # infinite loops when _update modifies its children. - if self.__updating: - return - self.__updating = 1 - - # Update this CanvasWidget. - self._update(child) - - # Propagate update request to the parent. - if self.__parent: - self.__parent.update(self) - - # We're done updating. - self.__updating = 0 - - def manage(self): - """ - Arrange this canvas widget and all of its descendants. - - :rtype: None - """ - if self.__hidden: - return - for child in self.__children: - child.manage() - self._manage() - - def tags(self): - """ - :return: a list of the canvas tags for all graphical - elements managed by this canvas widget, including - graphical elements managed by its child widgets. - :rtype: list of int - """ - if self.__canvas is None: - raise ValueError("Attempt to access a destroyed canvas widget") - tags = [] - tags += self._tags() - for child in self.__children: - tags += child.tags() - return tags - - def __setitem__(self, attr, value): - """ - Set the value of the attribute ``attr`` to ``value``. See the - class documentation for a list of attributes supported by this - canvas widget. - - :rtype: None - """ - if attr == "draggable": - self.__draggable = value - else: - raise ValueError("Unknown attribute %r" % attr) - - def __getitem__(self, attr): - """ - :return: the value of the attribute ``attr``. See the class - documentation for a list of attributes supported by this - canvas widget. - :rtype: (any) - """ - if attr == "draggable": - return self.__draggable - else: - raise ValueError("Unknown attribute %r" % attr) - - def __repr__(self): - """ - :return: a string representation of this canvas widget. - :rtype: str - """ - return "<%s>" % self.__class__.__name__ - - def hide(self): - """ - Temporarily hide this canvas widget. - - :rtype: None - """ - self.__hidden = 1 - for tag in self.tags(): - self.__canvas.itemconfig(tag, state="hidden") - - def show(self): - """ - Show a hidden canvas widget. - - :rtype: None - """ - self.__hidden = 0 - for tag in self.tags(): - self.__canvas.itemconfig(tag, state="normal") - - def hidden(self): - """ - :return: True if this canvas widget is hidden. - :rtype: bool - """ - return self.__hidden - - ##////////////////////////////////////////////////////// - ## Callback interface - ##////////////////////////////////////////////////////// - - def bind_click(self, callback, button=1): - """ - Register a new callback that will be called whenever this - ``CanvasWidget`` is clicked on. 
- - :type callback: function - :param callback: The callback function that will be called - whenever this ``CanvasWidget`` is clicked. This function - will be called with this ``CanvasWidget`` as its argument. - :type button: int - :param button: Which button the user should use to click on - this ``CanvasWidget``. Typically, this should be 1 (left - button), 3 (right button), or 2 (middle button). - """ - self.__callbacks[button] = callback - - def bind_drag(self, callback): - """ - Register a new callback that will be called after this - ``CanvasWidget`` is dragged. This implicitly makes this - ``CanvasWidget`` draggable. - - :type callback: function - :param callback: The callback function that will be called - whenever this ``CanvasWidget`` is clicked. This function - will be called with this ``CanvasWidget`` as its argument. - """ - self.__draggable = 1 - self.__callbacks["drag"] = callback - - def unbind_click(self, button=1): - """ - Remove a callback that was registered with ``bind_click``. - - :type button: int - :param button: Which button the user should use to click on - this ``CanvasWidget``. Typically, this should be 1 (left - button), 3 (right button), or 2 (middle button). - """ - try: - del self.__callbacks[button] - except: - pass - - def unbind_drag(self): - """ - Remove a callback that was registered with ``bind_drag``. - """ - try: - del self.__callbacks["drag"] - except: - pass - - ##////////////////////////////////////////////////////// - ## Callback internals - ##////////////////////////////////////////////////////// - - def __press_cb(self, event): - """ - Handle a button-press event: - - record the button press event in ``self.__press`` - - register a button-release callback. - - if this CanvasWidget or any of its ancestors are - draggable, then register the appropriate motion callback. - """ - # If we're already waiting for a button release, then ignore - # this new button press. - if ( - self.__canvas.bind("") - or self.__canvas.bind("") - or self.__canvas.bind("") - ): - return - - # Unbind motion (just in case; this shouldn't be necessary) - self.__canvas.unbind("") - - # Record the button press event. - self.__press = event - - # If any ancestor is draggable, set up a motion callback. - # (Only if they pressed button number 1) - if event.num == 1: - widget = self - while widget is not None: - if widget["draggable"]: - widget.__start_drag(event) - break - widget = widget.parent() - - # Set up the button release callback. - self.__canvas.bind("" % event.num, self.__release_cb) - - def __start_drag(self, event): - """ - Begin dragging this object: - - register a motion callback - - record the drag coordinates - """ - self.__canvas.bind("", self.__motion_cb) - self.__drag_x = event.x - self.__drag_y = event.y - - def __motion_cb(self, event): - """ - Handle a motion event: - - move this object to the new location - - record the new drag coordinates - """ - self.move(event.x - self.__drag_x, event.y - self.__drag_y) - self.__drag_x = event.x - self.__drag_y = event.y - - def __release_cb(self, event): - """ - Handle a release callback: - - unregister motion & button release callbacks. - - decide whether they clicked, dragged, or cancelled - - call the appropriate handler. - """ - # Unbind the button release & motion callbacks. - self.__canvas.unbind("" % event.num) - self.__canvas.unbind("") - - # Is it a click or a drag? 
- if ( - event.time - self.__press.time < 100 - and abs(event.x - self.__press.x) + abs(event.y - self.__press.y) < 5 - ): - # Move it back, if we were dragging. - if self.__draggable and event.num == 1: - self.move( - self.__press.x - self.__drag_x, self.__press.y - self.__drag_y - ) - self.__click(event.num) - elif event.num == 1: - self.__drag() - - self.__press = None - - def __drag(self): - """ - If this ``CanvasWidget`` has a drag callback, then call it; - otherwise, find the closest ancestor with a drag callback, and - call it. If no ancestors have a drag callback, do nothing. - """ - if self.__draggable: - if "drag" in self.__callbacks: - cb = self.__callbacks["drag"] - try: - cb(self) - except: - print("Error in drag callback for %r" % self) - elif self.__parent is not None: - self.__parent.__drag() - - def __click(self, button): - """ - If this ``CanvasWidget`` has a drag callback, then call it; - otherwise, find the closest ancestor with a click callback, and - call it. If no ancestors have a click callback, do nothing. - """ - if button in self.__callbacks: - cb = self.__callbacks[button] - # try: - cb(self) - # except: - # print('Error in click callback for %r' % self) - # raise - elif self.__parent is not None: - self.__parent.__click(button) - - ##////////////////////////////////////////////////////// - ## Child/parent Handling - ##////////////////////////////////////////////////////// - - def _add_child_widget(self, child): - """ - Register a hierarchical child widget. The child will be - considered part of this canvas widget for purposes of user - interaction. ``_add_child_widget`` has two direct effects: - - It sets ``child``'s parent to this canvas widget. - - It adds ``child`` to the list of canvas widgets returned by - the ``child_widgets`` member function. - - :param child: The new child widget. ``child`` must not already - have a parent. - :type child: CanvasWidget - """ - if not hasattr(self, "_CanvasWidget__children"): - self.__children = [] - if child.__parent is not None: - raise ValueError(f"{child} already has a parent") - child.__parent = self - self.__children.append(child) - - def _remove_child_widget(self, child): - """ - Remove a hierarchical child widget. This child will no longer - be considered part of this canvas widget for purposes of user - interaction. ``_add_child_widget`` has two direct effects: - - It sets ``child``'s parent to None. - - It removes ``child`` from the list of canvas widgets - returned by the ``child_widgets`` member function. - - :param child: The child widget to remove. ``child`` must be a - child of this canvas widget. - :type child: CanvasWidget - """ - self.__children.remove(child) - child.__parent = None - - ##////////////////////////////////////////////////////// - ## Defined by subclass - ##////////////////////////////////////////////////////// - - @abstractmethod - def _tags(self): - """ - :return: a list of canvas tags for all graphical elements - managed by this canvas widget, not including graphical - elements managed by its child widgets. - :rtype: list of int - """ - - def _manage(self): - """ - Arrange the child widgets of this canvas widget. This method - is called when the canvas widget is initially created. It is - also called if the user calls the ``manage`` method on this - canvas widget or any of its ancestors. - - :rtype: None - """ - - def _update(self, child): - """ - Update this canvas widget in response to a change in one of - its children. - - :param child: The child that changed. 
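# Sketch of the subclassing checklist described above: create the graphical
# elements, register any child widgets, call the CanvasWidget constructor last,
# and implement _tags(). "DotWidget" is an invented name used only for illustration.
from nltk.draw.util import CanvasWidget

class DotWidget(CanvasWidget):
    """A canvas widget that draws a small filled circle."""

    def __init__(self, canvas, x, y, radius=5, **attribs):
        # 1. Create the graphical elements.
        self._tag = canvas.create_oval(
            x - radius, y - radius, x + radius, y + radius, fill='black'
        )
        # 2. This widget has no child widgets to register.
        # 3. Call the CanvasWidget constructor last.
        CanvasWidget.__init__(self, canvas, **attribs)

    def _tags(self):
        # Canvas tags for the elements managed directly by this widget.
        return [self._tag]

# e.g. DotWidget(canvas, 50, 50, draggable=1)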
- :type child: CanvasWidget - :rtype: None - """ - - -##////////////////////////////////////////////////////// -## Basic widgets. -##////////////////////////////////////////////////////// - - -class TextWidget(CanvasWidget): - """ - A canvas widget that displays a single string of text. - - Attributes: - - ``color``: the color of the text. - - ``font``: the font used to display the text. - - ``justify``: justification for multi-line texts. Valid values - are ``left``, ``center``, and ``right``. - - ``width``: the width of the text. If the text is wider than - this width, it will be line-wrapped at whitespace. - - ``draggable``: whether the text can be dragged by the user. - """ - - def __init__(self, canvas, text, **attribs): - """ - Create a new text widget. - - :type canvas: Tkinter.Canvas - :param canvas: This canvas widget's canvas. - :type text: str - :param text: The string of text to display. - :param attribs: The new canvas widget's attributes. - """ - self._text = text - self._tag = canvas.create_text(1, 1, text=text) - CanvasWidget.__init__(self, canvas, **attribs) - - def __setitem__(self, attr, value): - if attr in ("color", "font", "justify", "width"): - if attr == "color": - attr = "fill" - self.canvas().itemconfig(self._tag, {attr: value}) - else: - CanvasWidget.__setitem__(self, attr, value) - - def __getitem__(self, attr): - if attr == "width": - return int(self.canvas().itemcget(self._tag, attr)) - elif attr in ("color", "font", "justify"): - if attr == "color": - attr = "fill" - return self.canvas().itemcget(self._tag, attr) - else: - return CanvasWidget.__getitem__(self, attr) - - def _tags(self): - return [self._tag] - - def text(self): - """ - :return: The text displayed by this text widget. - :rtype: str - """ - return self.canvas().itemcget(self._tag, "TEXT") - - def set_text(self, text): - """ - Change the text that is displayed by this text widget. - - :type text: str - :param text: The string of text to display. - :rtype: None - """ - self.canvas().itemconfig(self._tag, text=text) - if self.parent() is not None: - self.parent().update(self) - - def __repr__(self): - return "[Text: %r]" % self._text - - -class SymbolWidget(TextWidget): - """ - A canvas widget that displays special symbols, such as the - negation sign and the exists operator. Symbols are specified by - name. Currently, the following symbol names are defined: ``neg``, - ``disj``, ``conj``, ``lambda``, ``merge``, ``forall``, ``exists``, - ``subseteq``, ``subset``, ``notsubset``, ``emptyset``, ``imp``, - ``rightarrow``, ``equal``, ``notequal``, ``epsilon``. - - Attributes: - - - ``color``: the color of the text. - - ``draggable``: whether the text can be dragged by the user. - - :cvar SYMBOLS: A dictionary mapping from symbols to the character - in the ``symbol`` font used to render them. - """ - - SYMBOLS = { - "neg": "\330", - "disj": "\332", - "conj": "\331", - "lambda": "\154", - "merge": "\304", - "forall": "\042", - "exists": "\044", - "subseteq": "\315", - "subset": "\314", - "notsubset": "\313", - "emptyset": "\306", - "imp": "\336", - "rightarrow": chr(222), #'\256', - "equal": "\75", - "notequal": "\271", - "intersection": "\307", - "union": "\310", - "epsilon": "e", - } - - def __init__(self, canvas, symbol, **attribs): - """ - Create a new symbol widget. - - :type canvas: Tkinter.Canvas - :param canvas: This canvas widget's canvas. - :type symbol: str - :param symbol: The name of the symbol to display. - :param attribs: The new canvas widget's attributes. 
- """ - attribs["font"] = "symbol" - TextWidget.__init__(self, canvas, "", **attribs) - self.set_symbol(symbol) - - def symbol(self): - """ - :return: the name of the symbol that is displayed by this - symbol widget. - :rtype: str - """ - return self._symbol - - def set_symbol(self, symbol): - """ - Change the symbol that is displayed by this symbol widget. - - :type symbol: str - :param symbol: The name of the symbol to display. - """ - if symbol not in SymbolWidget.SYMBOLS: - raise ValueError("Unknown symbol: %s" % symbol) - self._symbol = symbol - self.set_text(SymbolWidget.SYMBOLS[symbol]) - - def __repr__(self): - return "[Symbol: %r]" % self._symbol - - @staticmethod - def symbolsheet(size=20): - """ - Open a new Tkinter window that displays the entire alphabet - for the symbol font. This is useful for constructing the - ``SymbolWidget.SYMBOLS`` dictionary. - """ - top = Tk() - - def destroy(e, top=top): - top.destroy() - - top.bind("q", destroy) - Button(top, text="Quit", command=top.destroy).pack(side="bottom") - text = Text(top, font=("helvetica", -size), width=20, height=30) - text.pack(side="left") - sb = Scrollbar(top, command=text.yview) - text["yscrollcommand"] = sb.set - sb.pack(side="right", fill="y") - text.tag_config("symbol", font=("symbol", -size)) - for i in range(256): - if i in (0, 10): - continue # null and newline - for k, v in list(SymbolWidget.SYMBOLS.items()): - if v == chr(i): - text.insert("end", "%-10s\t" % k) - break - else: - text.insert("end", "%-10d \t" % i) - text.insert("end", "[%s]\n" % chr(i), "symbol") - top.mainloop() - - -class AbstractContainerWidget(CanvasWidget): - """ - An abstract class for canvas widgets that contain a single child, - such as ``BoxWidget`` and ``OvalWidget``. Subclasses must define - a constructor, which should create any new graphical elements and - then call the ``AbstractCanvasContainer`` constructor. Subclasses - must also define the ``_update`` method and the ``_tags`` method; - and any subclasses that define attributes should define - ``__setitem__`` and ``__getitem__``. - """ - - def __init__(self, canvas, child, **attribs): - """ - Create a new container widget. This constructor should only - be called by subclass constructors. - - :type canvas: Tkinter.Canvas - :param canvas: This canvas widget's canvas. - :param child: The container's child widget. ``child`` must not - have a parent. - :type child: CanvasWidget - :param attribs: The new canvas widget's attributes. - """ - self._child = child - self._add_child_widget(child) - CanvasWidget.__init__(self, canvas, **attribs) - - def _manage(self): - self._update(self._child) - - def child(self): - """ - :return: The child widget contained by this container widget. - :rtype: CanvasWidget - """ - return self._child - - def set_child(self, child): - """ - Change the child widget contained by this container widget. - - :param child: The new child widget. ``child`` must not have a - parent. - :type child: CanvasWidget - :rtype: None - """ - self._remove_child_widget(self._child) - self._add_child_widget(child) - self._child = child - self.update(child) - - def __repr__(self): - name = self.__class__.__name__ - if name[-6:] == "Widget": - name = name[:-6] - return f"[{name}: {self._child!r}]" - - -class BoxWidget(AbstractContainerWidget): - """ - A canvas widget that places a box around a child widget. - - Attributes: - - ``fill``: The color used to fill the interior of the box. - - ``outline``: The color used to draw the outline of the box. 
- - ``width``: The width of the outline of the box. - - ``margin``: The number of pixels space left between the child - and the box. - - ``draggable``: whether the text can be dragged by the user. - """ - - def __init__(self, canvas, child, **attribs): - """ - Create a new box widget. - - :type canvas: Tkinter.Canvas - :param canvas: This canvas widget's canvas. - :param child: The child widget. ``child`` must not have a - parent. - :type child: CanvasWidget - :param attribs: The new canvas widget's attributes. - """ - self._child = child - self._margin = 1 - self._box = canvas.create_rectangle(1, 1, 1, 1) - canvas.tag_lower(self._box) - AbstractContainerWidget.__init__(self, canvas, child, **attribs) - - def __setitem__(self, attr, value): - if attr == "margin": - self._margin = value - elif attr in ("outline", "fill", "width"): - self.canvas().itemconfig(self._box, {attr: value}) - else: - CanvasWidget.__setitem__(self, attr, value) - - def __getitem__(self, attr): - if attr == "margin": - return self._margin - elif attr == "width": - return float(self.canvas().itemcget(self._box, attr)) - elif attr in ("outline", "fill", "width"): - return self.canvas().itemcget(self._box, attr) - else: - return CanvasWidget.__getitem__(self, attr) - - def _update(self, child): - (x1, y1, x2, y2) = child.bbox() - margin = self._margin + self["width"] / 2 - self.canvas().coords( - self._box, x1 - margin, y1 - margin, x2 + margin, y2 + margin - ) - - def _tags(self): - return [self._box] - - -class OvalWidget(AbstractContainerWidget): - """ - A canvas widget that places a oval around a child widget. - - Attributes: - - ``fill``: The color used to fill the interior of the oval. - - ``outline``: The color used to draw the outline of the oval. - - ``width``: The width of the outline of the oval. - - ``margin``: The number of pixels space left between the child - and the oval. - - ``draggable``: whether the text can be dragged by the user. - - ``double``: If true, then a double-oval is drawn. - """ - - def __init__(self, canvas, child, **attribs): - """ - Create a new oval widget. - - :type canvas: Tkinter.Canvas - :param canvas: This canvas widget's canvas. - :param child: The child widget. ``child`` must not have a - parent. - :type child: CanvasWidget - :param attribs: The new canvas widget's attributes. - """ - self._child = child - self._margin = 1 - self._oval = canvas.create_oval(1, 1, 1, 1) - self._circle = attribs.pop("circle", False) - self._double = attribs.pop("double", False) - if self._double: - self._oval2 = canvas.create_oval(1, 1, 1, 1) - else: - self._oval2 = None - canvas.tag_lower(self._oval) - AbstractContainerWidget.__init__(self, canvas, child, **attribs) - - def __setitem__(self, attr, value): - c = self.canvas() - if attr == "margin": - self._margin = value - elif attr == "double": - if value == True and self._oval2 is None: - # Copy attributes & position from self._oval. 
- x1, y1, x2, y2 = c.bbox(self._oval) - w = self["width"] * 2 - self._oval2 = c.create_oval( - x1 - w, - y1 - w, - x2 + w, - y2 + w, - outline=c.itemcget(self._oval, "outline"), - width=c.itemcget(self._oval, "width"), - ) - c.tag_lower(self._oval2) - if value == False and self._oval2 is not None: - c.delete(self._oval2) - self._oval2 = None - elif attr in ("outline", "fill", "width"): - c.itemconfig(self._oval, {attr: value}) - if self._oval2 is not None and attr != "fill": - c.itemconfig(self._oval2, {attr: value}) - if self._oval2 is not None and attr != "fill": - self.canvas().itemconfig(self._oval2, {attr: value}) - else: - CanvasWidget.__setitem__(self, attr, value) - - def __getitem__(self, attr): - if attr == "margin": - return self._margin - elif attr == "double": - return self._double is not None - elif attr == "width": - return float(self.canvas().itemcget(self._oval, attr)) - elif attr in ("outline", "fill", "width"): - return self.canvas().itemcget(self._oval, attr) - else: - return CanvasWidget.__getitem__(self, attr) - - # The ratio between inscribed & circumscribed ovals - RATIO = 1.4142135623730949 - - def _update(self, child): - R = OvalWidget.RATIO - (x1, y1, x2, y2) = child.bbox() - margin = self._margin - - # If we're a circle, pretend our contents are square. - if self._circle: - dx, dy = abs(x1 - x2), abs(y1 - y2) - if dx > dy: - y = (y1 + y2) / 2 - y1, y2 = y - dx / 2, y + dx / 2 - elif dy > dx: - x = (x1 + x2) / 2 - x1, x2 = x - dy / 2, x + dy / 2 - - # Find the four corners. - left = int((x1 * (1 + R) + x2 * (1 - R)) / 2) - right = left + int((x2 - x1) * R) - top = int((y1 * (1 + R) + y2 * (1 - R)) / 2) - bot = top + int((y2 - y1) * R) - self.canvas().coords( - self._oval, left - margin, top - margin, right + margin, bot + margin - ) - if self._oval2 is not None: - self.canvas().coords( - self._oval2, - left - margin + 2, - top - margin + 2, - right + margin - 2, - bot + margin - 2, - ) - - def _tags(self): - if self._oval2 is None: - return [self._oval] - else: - return [self._oval, self._oval2] - - -class ParenWidget(AbstractContainerWidget): - """ - A canvas widget that places a pair of parenthases around a child - widget. - - Attributes: - - ``color``: The color used to draw the parenthases. - - ``width``: The width of the parenthases. - - ``draggable``: whether the text can be dragged by the user. - """ - - def __init__(self, canvas, child, **attribs): - """ - Create a new parenthasis widget. - - :type canvas: Tkinter.Canvas - :param canvas: This canvas widget's canvas. - :param child: The child widget. ``child`` must not have a - parent. - :type child: CanvasWidget - :param attribs: The new canvas widget's attributes. 
- """ - self._child = child - self._oparen = canvas.create_arc(1, 1, 1, 1, style="arc", start=90, extent=180) - self._cparen = canvas.create_arc(1, 1, 1, 1, style="arc", start=-90, extent=180) - AbstractContainerWidget.__init__(self, canvas, child, **attribs) - - def __setitem__(self, attr, value): - if attr == "color": - self.canvas().itemconfig(self._oparen, outline=value) - self.canvas().itemconfig(self._cparen, outline=value) - elif attr == "width": - self.canvas().itemconfig(self._oparen, width=value) - self.canvas().itemconfig(self._cparen, width=value) - else: - CanvasWidget.__setitem__(self, attr, value) - - def __getitem__(self, attr): - if attr == "color": - return self.canvas().itemcget(self._oparen, "outline") - elif attr == "width": - return self.canvas().itemcget(self._oparen, "width") - else: - return CanvasWidget.__getitem__(self, attr) - - def _update(self, child): - (x1, y1, x2, y2) = child.bbox() - width = max((y2 - y1) / 6, 4) - self.canvas().coords(self._oparen, x1 - width, y1, x1 + width, y2) - self.canvas().coords(self._cparen, x2 - width, y1, x2 + width, y2) - - def _tags(self): - return [self._oparen, self._cparen] - - -class BracketWidget(AbstractContainerWidget): - """ - A canvas widget that places a pair of brackets around a child - widget. - - Attributes: - - ``color``: The color used to draw the brackets. - - ``width``: The width of the brackets. - - ``draggable``: whether the text can be dragged by the user. - """ - - def __init__(self, canvas, child, **attribs): - """ - Create a new bracket widget. - - :type canvas: Tkinter.Canvas - :param canvas: This canvas widget's canvas. - :param child: The child widget. ``child`` must not have a - parent. - :type child: CanvasWidget - :param attribs: The new canvas widget's attributes. - """ - self._child = child - self._obrack = canvas.create_line(1, 1, 1, 1, 1, 1, 1, 1) - self._cbrack = canvas.create_line(1, 1, 1, 1, 1, 1, 1, 1) - AbstractContainerWidget.__init__(self, canvas, child, **attribs) - - def __setitem__(self, attr, value): - if attr == "color": - self.canvas().itemconfig(self._obrack, fill=value) - self.canvas().itemconfig(self._cbrack, fill=value) - elif attr == "width": - self.canvas().itemconfig(self._obrack, width=value) - self.canvas().itemconfig(self._cbrack, width=value) - else: - CanvasWidget.__setitem__(self, attr, value) - - def __getitem__(self, attr): - if attr == "color": - return self.canvas().itemcget(self._obrack, "outline") - elif attr == "width": - return self.canvas().itemcget(self._obrack, "width") - else: - return CanvasWidget.__getitem__(self, attr) - - def _update(self, child): - (x1, y1, x2, y2) = child.bbox() - width = max((y2 - y1) / 8, 2) - self.canvas().coords( - self._obrack, x1, y1, x1 - width, y1, x1 - width, y2, x1, y2 - ) - self.canvas().coords( - self._cbrack, x2, y1, x2 + width, y1, x2 + width, y2, x2, y2 - ) - - def _tags(self): - return [self._obrack, self._cbrack] - - -class SequenceWidget(CanvasWidget): - """ - A canvas widget that keeps a list of canvas widgets in a - horizontal line. - - Attributes: - - ``align``: The vertical alignment of the children. Possible - values are ``'top'``, ``'center'``, and ``'bottom'``. By - default, children are center-aligned. - - ``space``: The amount of horizontal space to place between - children. By default, one pixel of space is used. - - ``ordered``: If true, then keep the children in their - original order. - """ - - def __init__(self, canvas, *children, **attribs): - """ - Create a new sequence widget. 
- - :type canvas: Tkinter.Canvas - :param canvas: This canvas widget's canvas. - :param children: The widgets that should be aligned - horizontally. Each child must not have a parent. - :type children: list(CanvasWidget) - :param attribs: The new canvas widget's attributes. - """ - self._align = "center" - self._space = 1 - self._ordered = False - self._children = list(children) - for child in children: - self._add_child_widget(child) - CanvasWidget.__init__(self, canvas, **attribs) - - def __setitem__(self, attr, value): - if attr == "align": - if value not in ("top", "bottom", "center"): - raise ValueError("Bad alignment: %r" % value) - self._align = value - elif attr == "space": - self._space = value - elif attr == "ordered": - self._ordered = value - else: - CanvasWidget.__setitem__(self, attr, value) - - def __getitem__(self, attr): - if attr == "align": - return self._align - elif attr == "space": - return self._space - elif attr == "ordered": - return self._ordered - else: - return CanvasWidget.__getitem__(self, attr) - - def _tags(self): - return [] - - def _yalign(self, top, bot): - if self._align == "top": - return top - if self._align == "bottom": - return bot - if self._align == "center": - return (top + bot) / 2 - - def _update(self, child): - # Align all children with child. - (left, top, right, bot) = child.bbox() - y = self._yalign(top, bot) - for c in self._children: - (x1, y1, x2, y2) = c.bbox() - c.move(0, y - self._yalign(y1, y2)) - - if self._ordered and len(self._children) > 1: - index = self._children.index(child) - - x = right + self._space - for i in range(index + 1, len(self._children)): - (x1, y1, x2, y2) = self._children[i].bbox() - if x > x1: - self._children[i].move(x - x1, 0) - x += x2 - x1 + self._space - - x = left - self._space - for i in range(index - 1, -1, -1): - (x1, y1, x2, y2) = self._children[i].bbox() - if x < x2: - self._children[i].move(x - x2, 0) - x -= x2 - x1 + self._space - - def _manage(self): - if len(self._children) == 0: - return - child = self._children[0] - - # Align all children with child. - (left, top, right, bot) = child.bbox() - y = self._yalign(top, bot) - - index = self._children.index(child) - - # Line up children to the right of child. - x = right + self._space - for i in range(index + 1, len(self._children)): - (x1, y1, x2, y2) = self._children[i].bbox() - self._children[i].move(x - x1, y - self._yalign(y1, y2)) - x += x2 - x1 + self._space - - # Line up children to the left of child. - x = left - self._space - for i in range(index - 1, -1, -1): - (x1, y1, x2, y2) = self._children[i].bbox() - self._children[i].move(x - x2, y - self._yalign(y1, y2)) - x -= x2 - x1 + self._space - - def __repr__(self): - return "[Sequence: " + repr(self._children)[1:-1] + "]" - - # Provide an alias for the child_widgets() member. - children = CanvasWidget.child_widgets - - def replace_child(self, oldchild, newchild): - """ - Replace the child canvas widget ``oldchild`` with ``newchild``. - ``newchild`` must not have a parent. ``oldchild``'s parent will - be set to None. - - :type oldchild: CanvasWidget - :param oldchild: The child canvas widget to remove. - :type newchild: CanvasWidget - :param newchild: The canvas widget that should replace - ``oldchild``. - """ - index = self._children.index(oldchild) - self._children[index] = newchild - self._remove_child_widget(oldchild) - self._add_child_widget(newchild) - self.update(newchild) - - def remove_child(self, child): - """ - Remove the given child canvas widget. 
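# Sketch of SequenceWidget laying out several TextWidgets in a horizontal row,
# using the ``align`` and ``space`` attributes documented above. Illustrative only;
# the words and sizes are arbitrary.
from tkinter import Tk, Canvas
from nltk.draw.util import TextWidget, SequenceWidget

root = Tk()
canvas = Canvas(root, width=300, height=80)
canvas.pack()

words = [TextWidget(canvas, w) for w in ('a', 'row', 'of', 'words')]
row = SequenceWidget(canvas, *words, align='bottom', space=10)
root.mainloop()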
``child``'s parent will - be set to None. - - :type child: CanvasWidget - :param child: The child canvas widget to remove. - """ - index = self._children.index(child) - del self._children[index] - self._remove_child_widget(child) - if len(self._children) > 0: - self.update(self._children[0]) - - def insert_child(self, index, child): - """ - Insert a child canvas widget before a given index. - - :type child: CanvasWidget - :param child: The canvas widget that should be inserted. - :type index: int - :param index: The index where the child widget should be - inserted. In particular, the index of ``child`` will be - ``index``; and the index of any children whose indices were - greater than equal to ``index`` before ``child`` was - inserted will be incremented by one. - """ - self._children.insert(index, child) - self._add_child_widget(child) - - -class StackWidget(CanvasWidget): - """ - A canvas widget that keeps a list of canvas widgets in a vertical - line. - - Attributes: - - ``align``: The horizontal alignment of the children. Possible - values are ``'left'``, ``'center'``, and ``'right'``. By - default, children are center-aligned. - - ``space``: The amount of vertical space to place between - children. By default, one pixel of space is used. - - ``ordered``: If true, then keep the children in their - original order. - """ - - def __init__(self, canvas, *children, **attribs): - """ - Create a new stack widget. - - :type canvas: Tkinter.Canvas - :param canvas: This canvas widget's canvas. - :param children: The widgets that should be aligned - vertically. Each child must not have a parent. - :type children: list(CanvasWidget) - :param attribs: The new canvas widget's attributes. - """ - self._align = "center" - self._space = 1 - self._ordered = False - self._children = list(children) - for child in children: - self._add_child_widget(child) - CanvasWidget.__init__(self, canvas, **attribs) - - def __setitem__(self, attr, value): - if attr == "align": - if value not in ("left", "right", "center"): - raise ValueError("Bad alignment: %r" % value) - self._align = value - elif attr == "space": - self._space = value - elif attr == "ordered": - self._ordered = value - else: - CanvasWidget.__setitem__(self, attr, value) - - def __getitem__(self, attr): - if attr == "align": - return self._align - elif attr == "space": - return self._space - elif attr == "ordered": - return self._ordered - else: - return CanvasWidget.__getitem__(self, attr) - - def _tags(self): - return [] - - def _xalign(self, left, right): - if self._align == "left": - return left - if self._align == "right": - return right - if self._align == "center": - return (left + right) / 2 - - def _update(self, child): - # Align all children with child. - (left, top, right, bot) = child.bbox() - x = self._xalign(left, right) - for c in self._children: - (x1, y1, x2, y2) = c.bbox() - c.move(x - self._xalign(x1, x2), 0) - - if self._ordered and len(self._children) > 1: - index = self._children.index(child) - - y = bot + self._space - for i in range(index + 1, len(self._children)): - (x1, y1, x2, y2) = self._children[i].bbox() - if y > y1: - self._children[i].move(0, y - y1) - y += y2 - y1 + self._space - - y = top - self._space - for i in range(index - 1, -1, -1): - (x1, y1, x2, y2) = self._children[i].bbox() - if y < y2: - self._children[i].move(0, y - y2) - y -= y2 - y1 + self._space - - def _manage(self): - if len(self._children) == 0: - return - child = self._children[0] - - # Align all children with child. 
- (left, top, right, bot) = child.bbox() - x = self._xalign(left, right) - - index = self._children.index(child) - - # Line up children below the child. - y = bot + self._space - for i in range(index + 1, len(self._children)): - (x1, y1, x2, y2) = self._children[i].bbox() - self._children[i].move(x - self._xalign(x1, x2), y - y1) - y += y2 - y1 + self._space - - # Line up children above the child. - y = top - self._space - for i in range(index - 1, -1, -1): - (x1, y1, x2, y2) = self._children[i].bbox() - self._children[i].move(x - self._xalign(x1, x2), y - y2) - y -= y2 - y1 + self._space - - def __repr__(self): - return "[Stack: " + repr(self._children)[1:-1] + "]" - - # Provide an alias for the child_widgets() member. - children = CanvasWidget.child_widgets - - def replace_child(self, oldchild, newchild): - """ - Replace the child canvas widget ``oldchild`` with ``newchild``. - ``newchild`` must not have a parent. ``oldchild``'s parent will - be set to None. - - :type oldchild: CanvasWidget - :param oldchild: The child canvas widget to remove. - :type newchild: CanvasWidget - :param newchild: The canvas widget that should replace - ``oldchild``. - """ - index = self._children.index(oldchild) - self._children[index] = newchild - self._remove_child_widget(oldchild) - self._add_child_widget(newchild) - self.update(newchild) - - def remove_child(self, child): - """ - Remove the given child canvas widget. ``child``'s parent will - be set to None. - - :type child: CanvasWidget - :param child: The child canvas widget to remove. - """ - index = self._children.index(child) - del self._children[index] - self._remove_child_widget(child) - if len(self._children) > 0: - self.update(self._children[0]) - - def insert_child(self, index, child): - """ - Insert a child canvas widget before a given index. - - :type child: CanvasWidget - :param child: The canvas widget that should be inserted. - :type index: int - :param index: The index where the child widget should be - inserted. In particular, the index of ``child`` will be - ``index``; and the index of any children whose indices were - greater than equal to ``index`` before ``child`` was - inserted will be incremented by one. - """ - self._children.insert(index, child) - self._add_child_widget(child) - - -class SpaceWidget(CanvasWidget): - """ - A canvas widget that takes up space but does not display - anything. A ``SpaceWidget`` can be used to add space between - elements. Each space widget is characterized by a width and a - height. If you wish to only create horizontal space, then use a - height of zero; and if you wish to only create vertical space, use - a width of zero. - """ - - def __init__(self, canvas, width, height, **attribs): - """ - Create a new space widget. - - :type canvas: Tkinter.Canvas - :param canvas: This canvas widget's canvas. - :type width: int - :param width: The width of the new space widget. - :type height: int - :param height: The height of the new space widget. - :param attribs: The new canvas widget's attributes. - """ - # For some reason, - if width > 4: - width -= 4 - if height > 4: - height -= 4 - self._tag = canvas.create_line(1, 1, width, height, fill="") - CanvasWidget.__init__(self, canvas, **attribs) - - # note: width() and height() are already defined by CanvasWidget. - def set_width(self, width): - """ - Change the width of this space widget. - - :param width: The new width. 
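# Sketch of StackWidget stacking widgets vertically, with a SpaceWidget of width
# zero used purely for vertical padding, as the SpaceWidget docstring suggests.
from tkinter import Tk, Canvas
from nltk.draw.util import TextWidget, SpaceWidget, StackWidget

root = Tk()
canvas = Canvas(root, width=200, height=150)
canvas.pack()

stack = StackWidget(
    canvas,
    TextWidget(canvas, 'top line'),
    SpaceWidget(canvas, 0, 20),     # roughly 20 pixels of vertical space
    TextWidget(canvas, 'bottom line'),
    align='left',
)
root.mainloop()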
- :type width: int - :rtype: None - """ - [x1, y1, x2, y2] = self.bbox() - self.canvas().coords(self._tag, x1, y1, x1 + width, y2) - - def set_height(self, height): - """ - Change the height of this space widget. - - :param height: The new height. - :type height: int - :rtype: None - """ - [x1, y1, x2, y2] = self.bbox() - self.canvas().coords(self._tag, x1, y1, x2, y1 + height) - - def _tags(self): - return [self._tag] - - def __repr__(self): - return "[Space]" - - -class ScrollWatcherWidget(CanvasWidget): - """ - A special canvas widget that adjusts its ``Canvas``'s scrollregion - to always include the bounding boxes of all of its children. The - scroll-watcher widget will only increase the size of the - ``Canvas``'s scrollregion; it will never decrease it. - """ - - def __init__(self, canvas, *children, **attribs): - """ - Create a new scroll-watcher widget. - - :type canvas: Tkinter.Canvas - :param canvas: This canvas widget's canvas. - :type children: list(CanvasWidget) - :param children: The canvas widgets watched by the - scroll-watcher. The scroll-watcher will ensure that these - canvas widgets are always contained in their canvas's - scrollregion. - :param attribs: The new canvas widget's attributes. - """ - for child in children: - self._add_child_widget(child) - CanvasWidget.__init__(self, canvas, **attribs) - - def add_child(self, canvaswidget): - """ - Add a new canvas widget to the scroll-watcher. The - scroll-watcher will ensure that the new canvas widget is - always contained in its canvas's scrollregion. - - :param canvaswidget: The new canvas widget. - :type canvaswidget: CanvasWidget - :rtype: None - """ - self._add_child_widget(canvaswidget) - self.update(canvaswidget) - - def remove_child(self, canvaswidget): - """ - Remove a canvas widget from the scroll-watcher. The - scroll-watcher will no longer ensure that the new canvas - widget is always contained in its canvas's scrollregion. - - :param canvaswidget: The canvas widget to remove. - :type canvaswidget: CanvasWidget - :rtype: None - """ - self._remove_child_widget(canvaswidget) - - def _tags(self): - return [] - - def _update(self, child): - self._adjust_scrollregion() - - def _adjust_scrollregion(self): - """ - Adjust the scrollregion of this scroll-watcher's ``Canvas`` to - include the bounding boxes of all of its children. - """ - bbox = self.bbox() - canvas = self.canvas() - scrollregion = [int(n) for n in canvas["scrollregion"].split()] - if len(scrollregion) != 4: - return - if ( - bbox[0] < scrollregion[0] - or bbox[1] < scrollregion[1] - or bbox[2] > scrollregion[2] - or bbox[3] > scrollregion[3] - ): - scrollregion = "%d %d %d %d" % ( - min(bbox[0], scrollregion[0]), - min(bbox[1], scrollregion[1]), - max(bbox[2], scrollregion[2]), - max(bbox[3], scrollregion[3]), - ) - canvas["scrollregion"] = scrollregion - - -##////////////////////////////////////////////////////// -## Canvas Frame -##////////////////////////////////////////////////////// - - -class CanvasFrame: - """ - A ``Tkinter`` frame containing a canvas and scrollbars. - ``CanvasFrame`` uses a ``ScrollWatcherWidget`` to ensure that all of - the canvas widgets contained on its canvas are within its - scrollregion. In order for ``CanvasFrame`` to make these checks, - all canvas widgets must be registered with ``add_widget`` when they - are added to the canvas; and destroyed with ``destroy_widget`` when - they are no longer needed. 
- - If a ``CanvasFrame`` is created with no parent, then it will create - its own main window, including a "Done" button and a "Print" - button. - """ - - def __init__(self, parent=None, **kw): - """ - Create a new ``CanvasFrame``. - - :type parent: Tkinter.BaseWidget or Tkinter.Tk - :param parent: The parent ``Tkinter`` widget. If no parent is - specified, then ``CanvasFrame`` will create a new main - window. - :param kw: Keyword arguments for the new ``Canvas``. See the - documentation for ``Tkinter.Canvas`` for more information. - """ - # If no parent was given, set up a top-level window. - if parent is None: - self._parent = Tk() - self._parent.title("NLTK") - self._parent.bind("", lambda e: self.print_to_file()) - self._parent.bind("", self.destroy) - self._parent.bind("", self.destroy) - else: - self._parent = parent - - # Create a frame for the canvas & scrollbars - self._frame = frame = Frame(self._parent) - self._canvas = canvas = Canvas(frame, **kw) - xscrollbar = Scrollbar(self._frame, orient="horizontal") - yscrollbar = Scrollbar(self._frame, orient="vertical") - xscrollbar["command"] = canvas.xview - yscrollbar["command"] = canvas.yview - canvas["xscrollcommand"] = xscrollbar.set - canvas["yscrollcommand"] = yscrollbar.set - yscrollbar.pack(fill="y", side="right") - xscrollbar.pack(fill="x", side="bottom") - canvas.pack(expand=1, fill="both", side="left") - - # Set initial scroll region. - scrollregion = "0 0 {} {}".format(canvas["width"], canvas["height"]) - canvas["scrollregion"] = scrollregion - - self._scrollwatcher = ScrollWatcherWidget(canvas) - - # If no parent was given, pack the frame, and add a menu. - if parent is None: - self.pack(expand=1, fill="both") - self._init_menubar() - - def _init_menubar(self): - menubar = Menu(self._parent) - - filemenu = Menu(menubar, tearoff=0) - filemenu.add_command( - label="Print to Postscript", - underline=0, - command=self.print_to_file, - accelerator="Ctrl-p", - ) - filemenu.add_command( - label="Exit", underline=1, command=self.destroy, accelerator="Ctrl-x" - ) - menubar.add_cascade(label="File", underline=0, menu=filemenu) - - self._parent.config(menu=menubar) - - def print_to_file(self, filename=None): - """ - Print the contents of this ``CanvasFrame`` to a postscript - file. If no filename is given, then prompt the user for one. - - :param filename: The name of the file to print the tree to. - :type filename: str - :rtype: None - """ - if filename is None: - ftypes = [("Postscript files", ".ps"), ("All files", "*")] - filename = asksaveasfilename(filetypes=ftypes, defaultextension=".ps") - if not filename: - return - (x0, y0, w, h) = self.scrollregion() - postscript = self._canvas.postscript( - x=x0, - y=y0, - width=w + 2, - height=h + 2, - pagewidth=w + 2, # points = 1/72 inch - pageheight=h + 2, # points = 1/72 inch - pagex=0, - pagey=0, - ) - # workaround for bug in Tk font handling - postscript = postscript.replace(" 0 scalefont ", " 9 scalefont ") - with open(filename, "wb") as f: - f.write(postscript.encode("utf8")) - - def scrollregion(self): - """ - :return: The current scroll region for the canvas managed by - this ``CanvasFrame``. - :rtype: 4-tuple of int - """ - (x1, y1, x2, y2) = self._canvas["scrollregion"].split() - return (int(x1), int(y1), int(x2), int(y2)) - - def canvas(self): - """ - :return: The canvas managed by this ``CanvasFrame``. - :rtype: Tkinter.Canvas - """ - return self._canvas - - def add_widget(self, canvaswidget, x=None, y=None): - """ - Register a canvas widget with this ``CanvasFrame``. 
The - ``CanvasFrame`` will ensure that this canvas widget is always - within the ``Canvas``'s scrollregion. If no coordinates are - given for the canvas widget, then the ``CanvasFrame`` will - attempt to find a clear area of the canvas for it. - - :type canvaswidget: CanvasWidget - :param canvaswidget: The new canvas widget. ``canvaswidget`` - must have been created on this ``CanvasFrame``'s canvas. - :type x: int - :param x: The initial x coordinate for the upper left hand - corner of ``canvaswidget``, in the canvas's coordinate - space. - :type y: int - :param y: The initial y coordinate for the upper left hand - corner of ``canvaswidget``, in the canvas's coordinate - space. - """ - if x is None or y is None: - (x, y) = self._find_room(canvaswidget, x, y) - - # Move to (x,y) - (x1, y1, x2, y2) = canvaswidget.bbox() - canvaswidget.move(x - x1, y - y1) - - # Register with scrollwatcher. - self._scrollwatcher.add_child(canvaswidget) - - def _find_room(self, widget, desired_x, desired_y): - """ - Try to find a space for a given widget. - """ - (left, top, right, bot) = self.scrollregion() - w = widget.width() - h = widget.height() - - if w >= (right - left): - return (0, 0) - if h >= (bot - top): - return (0, 0) - - # Move the widget out of the way, for now. - (x1, y1, x2, y2) = widget.bbox() - widget.move(left - x2 - 50, top - y2 - 50) - - if desired_x is not None: - x = desired_x - for y in range(top, bot - h, int((bot - top - h) / 10)): - if not self._canvas.find_overlapping( - x - 5, y - 5, x + w + 5, y + h + 5 - ): - return (x, y) - - if desired_y is not None: - y = desired_y - for x in range(left, right - w, int((right - left - w) / 10)): - if not self._canvas.find_overlapping( - x - 5, y - 5, x + w + 5, y + h + 5 - ): - return (x, y) - - for y in range(top, bot - h, int((bot - top - h) / 10)): - for x in range(left, right - w, int((right - left - w) / 10)): - if not self._canvas.find_overlapping( - x - 5, y - 5, x + w + 5, y + h + 5 - ): - return (x, y) - return (0, 0) - - def destroy_widget(self, canvaswidget): - """ - Remove a canvas widget from this ``CanvasFrame``. This - deregisters the canvas widget, and destroys it. - """ - self.remove_widget(canvaswidget) - canvaswidget.destroy() - - def remove_widget(self, canvaswidget): - # Deregister with scrollwatcher. - self._scrollwatcher.remove_child(canvaswidget) - - def pack(self, cnf={}, **kw): - """ - Pack this ``CanvasFrame``. See the documentation for - ``Tkinter.Pack`` for more information. - """ - self._frame.pack(cnf, **kw) - # Adjust to be big enough for kids? - - def destroy(self, *e): - """ - Destroy this ``CanvasFrame``. If this ``CanvasFrame`` created a - top-level window, then this will close that window. - """ - if self._parent is None: - return - self._parent.destroy() - self._parent = None - - def mainloop(self, *args, **kwargs): - """ - Enter the Tkinter mainloop. This function must be called if - this frame is created from a non-interactive program (e.g. - from a secript); otherwise, the frame will close as soon as - the script completes. - """ - if in_idle(): - return - self._parent.mainloop(*args, **kwargs) - - -##////////////////////////////////////////////////////// -## Text display -##////////////////////////////////////////////////////// - - -class ShowText: - """ - A ``Tkinter`` window used to display a text. ``ShowText`` is - typically used by graphical tools to display help text, or similar - information. 
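# Sketch of the standalone CanvasFrame workflow described above: with no parent
# it opens its own window; widgets are registered with add_widget() so they stay
# inside the scrollregion; mainloop() keeps the window open outside of IDLE.
from nltk.draw.util import CanvasFrame, TextWidget, BoxWidget

cf = CanvasFrame(width=400, height=200)
message = BoxWidget(cf.canvas(), TextWidget(cf.canvas(), 'hello'), fill='white')
cf.add_widget(message, 20, 20)
# cf.print_to_file('hello.ps')    # optional Postscript export; filename is arbitrary
cf.mainloop()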
- """ - - def __init__(self, root, title, text, width=None, height=None, **textbox_options): - if width is None or height is None: - (width, height) = self.find_dimentions(text, width, height) - - # Create the main window. - if root is None: - self._top = top = Tk() - else: - self._top = top = Toplevel(root) - top.title(title) - - b = Button(top, text="Ok", command=self.destroy) - b.pack(side="bottom") - - tbf = Frame(top) - tbf.pack(expand=1, fill="both") - scrollbar = Scrollbar(tbf, orient="vertical") - scrollbar.pack(side="right", fill="y") - textbox = Text(tbf, wrap="word", width=width, height=height, **textbox_options) - textbox.insert("end", text) - textbox["state"] = "disabled" - textbox.pack(side="left", expand=1, fill="both") - scrollbar["command"] = textbox.yview - textbox["yscrollcommand"] = scrollbar.set - - # Make it easy to close the window. - top.bind("q", self.destroy) - top.bind("x", self.destroy) - top.bind("c", self.destroy) - top.bind("", self.destroy) - top.bind("", self.destroy) - - # Focus the scrollbar, so they can use up/down, etc. - scrollbar.focus() - - def find_dimentions(self, text, width, height): - lines = text.split("\n") - if width is None: - maxwidth = max(len(line) for line in lines) - width = min(maxwidth, 80) - - # Now, find height. - height = 0 - for line in lines: - while len(line) > width: - brk = line[:width].rfind(" ") - line = line[brk:] - height += 1 - height += 1 - height = min(height, 25) - - return (width, height) - - def destroy(self, *e): - if self._top is None: - return - self._top.destroy() - self._top = None - - def mainloop(self, *args, **kwargs): - """ - Enter the Tkinter mainloop. This function must be called if - this window is created from a non-interactive program (e.g. - from a secript); otherwise, the window will close as soon as - the script completes. - """ - if in_idle(): - return - self._top.mainloop(*args, **kwargs) - - -##////////////////////////////////////////////////////// -## Entry dialog -##////////////////////////////////////////////////////// - - -class EntryDialog: - """ - A dialog box for entering - """ - - def __init__( - self, parent, original_text="", instructions="", set_callback=None, title=None - ): - self._parent = parent - self._original_text = original_text - self._set_callback = set_callback - - width = int(max(30, len(original_text) * 3 / 2)) - self._top = Toplevel(parent) - - if title: - self._top.title(title) - - # The text entry box. - entryframe = Frame(self._top) - entryframe.pack(expand=1, fill="both", padx=5, pady=5, ipady=10) - if instructions: - l = Label(entryframe, text=instructions) - l.pack(side="top", anchor="w", padx=30) - self._entry = Entry(entryframe, width=width) - self._entry.pack(expand=1, fill="x", padx=30) - self._entry.insert(0, original_text) - - # A divider - divider = Frame(self._top, borderwidth=1, relief="sunken") - divider.pack(fill="x", ipady=1, padx=10) - - # The buttons. 
- buttons = Frame(self._top) - buttons.pack(expand=0, fill="x", padx=5, pady=5) - b = Button(buttons, text="Cancel", command=self._cancel, width=8) - b.pack(side="right", padx=5) - b = Button(buttons, text="Ok", command=self._ok, width=8, default="active") - b.pack(side="left", padx=5) - b = Button(buttons, text="Apply", command=self._apply, width=8) - b.pack(side="left") - - self._top.bind("", self._ok) - self._top.bind("", self._cancel) - self._top.bind("", self._cancel) - - self._entry.focus() - - def _reset(self, *e): - self._entry.delete(0, "end") - self._entry.insert(0, self._original_text) - if self._set_callback: - self._set_callback(self._original_text) - - def _cancel(self, *e): - try: - self._reset() - except: - pass - self._destroy() - - def _ok(self, *e): - self._apply() - self._destroy() - - def _apply(self, *e): - if self._set_callback: - self._set_callback(self._entry.get()) - - def _destroy(self, *e): - if self._top is None: - return - self._top.destroy() - self._top = None - - -##////////////////////////////////////////////////////// -## Colorized List -##////////////////////////////////////////////////////// - - -class ColorizedList: - """ - An abstract base class for displaying a colorized list of items. - Subclasses should define: - - - ``_init_colortags``, which sets up Text color tags that - will be used by the list. - - ``_item_repr``, which returns a list of (text,colortag) - tuples that make up the colorized representation of the - item. - - :note: Typically, you will want to register a callback for - ``'select'`` that calls ``mark`` on the given item. - """ - - def __init__(self, parent, items=[], **options): - """ - Construct a new list. - - :param parent: The Tk widget that contains the colorized list - :param items: The initial contents of the colorized list. - :param options: - """ - self._parent = parent - self._callbacks = {} - - # Which items are marked? - self._marks = {} - - # Initialize the Tkinter frames. - self._init_itemframe(options.copy()) - - # Set up key & mouse bindings. - self._textwidget.bind("", self._keypress) - self._textwidget.bind("", self._buttonpress) - - # Fill in the given CFG's items. - self._items = None - self.set(items) - - # //////////////////////////////////////////////////////////// - # Abstract methods - # //////////////////////////////////////////////////////////// - @abstractmethod - def _init_colortags(self, textwidget, options): - """ - Set up any colortags that will be used by this colorized list. - E.g.: - textwidget.tag_config('terminal', foreground='black') - """ - - @abstractmethod - def _item_repr(self, item): - """ - Return a list of (text, colortag) tuples that make up the - colorized representation of the item. Colorized - representations may not span multiple lines. I.e., the text - strings returned may not contain newline characters. - """ - - # //////////////////////////////////////////////////////////// - # Item Access - # //////////////////////////////////////////////////////////// - - def get(self, index=None): - """ - :return: A list of the items contained by this list. - """ - if index is None: - return self._items[:] - else: - return self._items[index] - - def set(self, items): - """ - Modify the list of items contained by this list. 
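# Hedged sketch (editor's addition) of a concrete ColorizedList subclass; the
# class name, tag name and colour are invented for illustration, and the
# nltk.draw.util import path is an assumption.
from tkinter import Tk
from nltk.draw.util import ColorizedList

class WordList(ColorizedList):
    def _init_colortags(self, textwidget, options):
        # One colour tag used by _item_repr below.
        textwidget.tag_config("word", foreground="blue")

    def _item_repr(self, item):
        # One (text, colortag) pair per item; no newlines allowed.
        return [(str(item), "word")]

root = Tk()
words = WordList(root, items=["alpha", "beta"])
words.pack(expand=1, fill="both")
words.set(["alpha", "beta", "gamma"])   # replace the displayed items
print(words.get())                      # ['alpha', 'beta', 'gamma']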
- """ - items = list(items) - if self._items == items: - return - self._items = list(items) - - self._textwidget["state"] = "normal" - self._textwidget.delete("1.0", "end") - for item in items: - for (text, colortag) in self._item_repr(item): - assert "\n" not in text, "item repr may not contain newline" - self._textwidget.insert("end", text, colortag) - self._textwidget.insert("end", "\n") - # Remove the final newline - self._textwidget.delete("end-1char", "end") - self._textwidget.mark_set("insert", "1.0") - self._textwidget["state"] = "disabled" - # Clear all marks - self._marks.clear() - - def unmark(self, item=None): - """ - Remove highlighting from the given item; or from every item, - if no item is given. - :raise ValueError: If ``item`` is not contained in the list. - :raise KeyError: If ``item`` is not marked. - """ - if item is None: - self._marks.clear() - self._textwidget.tag_remove("highlight", "1.0", "end+1char") - else: - index = self._items.index(item) - del self._marks[item] - (start, end) = ("%d.0" % (index + 1), "%d.0" % (index + 2)) - self._textwidget.tag_remove("highlight", start, end) - - def mark(self, item): - """ - Highlight the given item. - :raise ValueError: If ``item`` is not contained in the list. - """ - self._marks[item] = 1 - index = self._items.index(item) - (start, end) = ("%d.0" % (index + 1), "%d.0" % (index + 2)) - self._textwidget.tag_add("highlight", start, end) - - def markonly(self, item): - """ - Remove any current highlighting, and mark the given item. - :raise ValueError: If ``item`` is not contained in the list. - """ - self.unmark() - self.mark(item) - - def view(self, item): - """ - Adjust the view such that the given item is visible. If - the item is already visible, then do nothing. - """ - index = self._items.index(item) - self._textwidget.see("%d.0" % (index + 1)) - - # //////////////////////////////////////////////////////////// - # Callbacks - # //////////////////////////////////////////////////////////// - - def add_callback(self, event, func): - """ - Register a callback function with the list. This function - will be called whenever the given event occurs. - - :param event: The event that will trigger the callback - function. Valid events are: click1, click2, click3, - space, return, select, up, down, next, prior, move - :param func: The function that should be called when - the event occurs. ``func`` will be called with a - single item as its argument. (The item selected - or the item moved to). - """ - if event == "select": - events = ["click1", "space", "return"] - elif event == "move": - events = ["up", "down", "next", "prior"] - else: - events = [event] - - for e in events: - self._callbacks.setdefault(e, {})[func] = 1 - - def remove_callback(self, event, func=None): - """ - Deregister a callback function. If ``func`` is none, then - all callbacks are removed for the given event. 
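# Sketch (editor's addition) of the callback wiring described above.  The helper
# names are hypothetical; `clist` stands for any concrete ColorizedList subclass
# instance.
def attach_selection_highlight(clist):
    # 'select' fans out to click1/space/return; the callback receives the
    # selected item, so markonly() highlights exactly that item.
    clist.add_callback("select", clist.markonly)

def detach_selection_highlight(clist):
    clist.remove_callback("select", clist.markonly)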
- """ - if event is None: - events = list(self._callbacks.keys()) - elif event == "select": - events = ["click1", "space", "return"] - elif event == "move": - events = ["up", "down", "next", "prior"] - else: - events = [event] - - for e in events: - if func is None: - del self._callbacks[e] - else: - try: - del self._callbacks[e][func] - except: - pass - - # //////////////////////////////////////////////////////////// - # Tkinter Methods - # //////////////////////////////////////////////////////////// - - def pack(self, cnf={}, **kw): - # "@include: Tkinter.Pack.pack" - self._itemframe.pack(cnf, **kw) - - def grid(self, cnf={}, **kw): - # "@include: Tkinter.Grid.grid" - self._itemframe.grid(cnf, *kw) - - def focus(self): - # "@include: Tkinter.Widget.focus" - self._textwidget.focus() - - # //////////////////////////////////////////////////////////// - # Internal Methods - # //////////////////////////////////////////////////////////// - - def _init_itemframe(self, options): - self._itemframe = Frame(self._parent) - - # Create the basic Text widget & scrollbar. - options.setdefault("background", "#e0e0e0") - self._textwidget = Text(self._itemframe, **options) - self._textscroll = Scrollbar(self._itemframe, takefocus=0, orient="vertical") - self._textwidget.config(yscrollcommand=self._textscroll.set) - self._textscroll.config(command=self._textwidget.yview) - self._textscroll.pack(side="right", fill="y") - self._textwidget.pack(expand=1, fill="both", side="left") - - # Initialize the colorization tags - self._textwidget.tag_config( - "highlight", background="#e0ffff", border="1", relief="raised" - ) - self._init_colortags(self._textwidget, options) - - # How do I want to mark keyboard selection? - self._textwidget.tag_config("sel", foreground="") - self._textwidget.tag_config( - "sel", foreground="", background="", border="", underline=1 - ) - self._textwidget.tag_lower("highlight", "sel") - - def _fire_callback(self, event, itemnum): - if event not in self._callbacks: - return - if 0 <= itemnum < len(self._items): - item = self._items[itemnum] - else: - item = None - for cb_func in list(self._callbacks[event].keys()): - cb_func(item) - - def _buttonpress(self, event): - clickloc = "@%d,%d" % (event.x, event.y) - insert_point = self._textwidget.index(clickloc) - itemnum = int(insert_point.split(".")[0]) - 1 - self._fire_callback("click%d" % event.num, itemnum) - - def _keypress(self, event): - if event.keysym == "Return" or event.keysym == "space": - insert_point = self._textwidget.index("insert") - itemnum = int(insert_point.split(".")[0]) - 1 - self._fire_callback(event.keysym.lower(), itemnum) - return - elif event.keysym == "Down": - delta = "+1line" - elif event.keysym == "Up": - delta = "-1line" - elif event.keysym == "Next": - delta = "+10lines" - elif event.keysym == "Prior": - delta = "-10lines" - else: - return "continue" - - self._textwidget.mark_set("insert", "insert" + delta) - self._textwidget.see("insert") - self._textwidget.tag_remove("sel", "1.0", "end+1char") - self._textwidget.tag_add("sel", "insert linestart", "insert lineend") - - insert_point = self._textwidget.index("insert") - itemnum = int(insert_point.split(".")[0]) - 1 - self._fire_callback(event.keysym.lower(), itemnum) - - return "break" - - -##////////////////////////////////////////////////////// -## Improved OptionMenu -##////////////////////////////////////////////////////// - - -class MutableOptionMenu(Menubutton): - def __init__(self, master, values, **options): - self._callback = options.get("command") - if 
"command" in options: - del options["command"] - - # Create a variable - self._variable = variable = StringVar() - if len(values) > 0: - variable.set(values[0]) - - kw = { - "borderwidth": 2, - "textvariable": variable, - "indicatoron": 1, - "relief": RAISED, - "anchor": "c", - "highlightthickness": 2, - } - kw.update(options) - Widget.__init__(self, master, "menubutton", kw) - self.widgetName = "tk_optionMenu" - self._menu = Menu(self, name="menu", tearoff=0) - self.menuname = self._menu._w - - self._values = [] - for value in values: - self.add(value) - - self["menu"] = self._menu - - def add(self, value): - if value in self._values: - return - - def set(value=value): - self.set(value) - - self._menu.add_command(label=value, command=set) - self._values.append(value) - - def set(self, value): - self._variable.set(value) - if self._callback: - self._callback(value) - - def remove(self, value): - # Might raise indexerror: pass to parent. - i = self._values.index(value) - del self._values[i] - self._menu.delete(i, i) - - def __getitem__(self, name): - if name == "menu": - return self.__menu - return Widget.__getitem__(self, name) - - def destroy(self): - """Destroy this widget and the associated menu.""" - Menubutton.destroy(self) - self._menu = None - - -##////////////////////////////////////////////////////// -## Test code. -##////////////////////////////////////////////////////// - - -def demo(): - """ - A simple demonstration showing how to use canvas widgets. - """ - - def fill(cw): - from random import randint - - cw["fill"] = "#00%04d" % randint(0, 9999) - - def color(cw): - from random import randint - - cw["color"] = "#ff%04d" % randint(0, 9999) - - cf = CanvasFrame(closeenough=10, width=300, height=300) - c = cf.canvas() - ct3 = TextWidget(c, "hiya there", draggable=1) - ct2 = TextWidget(c, "o o\n||\n___\n U", draggable=1, justify="center") - co = OvalWidget(c, ct2, outline="red") - ct = TextWidget(c, "o o\n||\n\\___/", draggable=1, justify="center") - cp = ParenWidget(c, ct, color="red") - cb = BoxWidget(c, cp, fill="cyan", draggable=1, width=3, margin=10) - equation = SequenceWidget( - c, - SymbolWidget(c, "forall"), - TextWidget(c, "x"), - SymbolWidget(c, "exists"), - TextWidget(c, "y: "), - TextWidget(c, "x"), - SymbolWidget(c, "notequal"), - TextWidget(c, "y"), - ) - space = SpaceWidget(c, 0, 30) - cstack = StackWidget(c, cb, ct3, space, co, equation, align="center") - prompt_msg = TextWidget( - c, "try clicking\nand dragging", draggable=1, justify="center" - ) - cs = SequenceWidget(c, cstack, prompt_msg) - zz = BracketWidget(c, cs, color="green4", width=3) - cf.add_widget(zz, 60, 30) - - cb.bind_click(fill) - ct.bind_click(color) - co.bind_click(fill) - ct2.bind_click(color) - ct3.bind_click(color) - - cf.mainloop() - # ShowText(None, 'title', ((('this is text'*150)+'\n')*5)) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/featstruct.py b/pipeline/nltk/featstruct.py deleted file mode 100644 index 5684f06f51e76070ca6e606722aa1583332429e3..0000000000000000000000000000000000000000 --- a/pipeline/nltk/featstruct.py +++ /dev/null @@ -1,2779 +0,0 @@ -# Natural Language Toolkit: Feature Structures -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper , -# Rob Speer, -# Steven Bird -# URL: -# For license information, see LICENSE.TXT - -""" -Basic data classes for representing feature structures, and for -performing basic operations on those feature structures. 
A feature -structure is a mapping from feature identifiers to feature values, -where each feature value is either a basic value (such as a string or -an integer), or a nested feature structure. There are two types of -feature structure, implemented by two subclasses of ``FeatStruct``: - - - feature dictionaries, implemented by ``FeatDict``, act like - Python dictionaries. Feature identifiers may be strings or - instances of the ``Feature`` class. - - feature lists, implemented by ``FeatList``, act like Python - lists. Feature identifiers are integers. - -Feature structures are typically used to represent partial information -about objects. A feature identifier that is not mapped to a value -stands for a feature whose value is unknown (*not* a feature without -a value). Two feature structures that represent (potentially -overlapping) information about the same object can be combined by -unification. When two inconsistent feature structures are unified, -the unification fails and returns None. - -Features can be specified using "feature paths", or tuples of feature -identifiers that specify path through the nested feature structures to -a value. Feature structures may contain reentrant feature values. A -"reentrant feature value" is a single feature value that can be -accessed via multiple feature paths. Unification preserves the -reentrance relations imposed by both of the unified feature -structures. In the feature structure resulting from unification, any -modifications to a reentrant feature value will be visible using any -of its feature paths. - -Feature structure variables are encoded using the ``nltk.sem.Variable`` -class. The variables' values are tracked using a bindings -dictionary, which maps variables to their values. When two feature -structures are unified, a fresh bindings dictionary is created to -track their values; and before unification completes, all bound -variables are replaced by their values. Thus, the bindings -dictionaries are usually strictly internal to the unification process. -However, it is possible to track the bindings of variables if you -choose to, by supplying your own initial bindings dictionary to the -``unify()`` function. - -When unbound variables are unified with one another, they become -aliased. This is encoded by binding one variable to the other. - -Lightweight Feature Structures -============================== -Many of the functions defined by ``nltk.featstruct`` can be applied -directly to simple Python dictionaries and lists, rather than to -full-fledged ``FeatDict`` and ``FeatList`` objects. In other words, -Python ``dicts`` and ``lists`` can be used as "light-weight" feature -structures. - - >>> from nltk.featstruct import unify - >>> unify(dict(x=1, y=dict()), dict(a='a', y=dict(b='b'))) # doctest: +SKIP - {'y': {'b': 'b'}, 'x': 1, 'a': 'a'} - -However, you should keep in mind the following caveats: - - - Python dictionaries & lists ignore reentrance when checking for - equality between values. But two FeatStructs with different - reentrances are considered nonequal, even if all their base - values are equal. - - - FeatStructs can be easily frozen, allowing them to be used as - keys in hash tables. Python dictionaries and lists can not. - - - FeatStructs display reentrance in their string representations; - Python dictionaries and lists do not. - - - FeatStructs may *not* be mixed with Python dictionaries and lists - (e.g., when performing unification). 
- - - FeatStructs provide a number of useful methods, such as ``walk()`` - and ``cyclic()``, which are not available for Python dicts and lists. - -In general, if your feature structures will contain any reentrances, -or if you plan to use them as dictionary keys, it is strongly -recommended that you use full-fledged ``FeatStruct`` objects. -""" - -import copy -import re -from functools import total_ordering - -from nltk.internals import raise_unorderable_types, read_str -from nltk.sem.logic import ( - Expression, - LogicalExpressionException, - LogicParser, - SubstituteBindingsI, - Variable, -) - -###################################################################### -# Feature Structure -###################################################################### - - -@total_ordering -class FeatStruct(SubstituteBindingsI): - """ - A mapping from feature identifiers to feature values, where each - feature value is either a basic value (such as a string or an - integer), or a nested feature structure. There are two types of - feature structure: - - - feature dictionaries, implemented by ``FeatDict``, act like - Python dictionaries. Feature identifiers may be strings or - instances of the ``Feature`` class. - - feature lists, implemented by ``FeatList``, act like Python - lists. Feature identifiers are integers. - - Feature structures may be indexed using either simple feature - identifiers or 'feature paths.' A feature path is a sequence - of feature identifiers that stand for a corresponding sequence of - indexing operations. In particular, ``fstruct[(f1,f2,...,fn)]`` is - equivalent to ``fstruct[f1][f2]...[fn]``. - - Feature structures may contain reentrant feature structures. A - "reentrant feature structure" is a single feature structure - object that can be accessed via multiple feature paths. Feature - structures may also be cyclic. A feature structure is "cyclic" - if there is any feature path from the feature structure to itself. - - Two feature structures are considered equal if they assign the - same values to all features, and have the same reentrancies. - - By default, feature structures are mutable. They may be made - immutable with the ``freeze()`` method. Once they have been - frozen, they may be hashed, and thus used as dictionary keys. - """ - - _frozen = False - """:ivar: A flag indicating whether this feature structure is - frozen or not. Once this flag is set, it should never be - un-set; and no further modification should be made to this - feature structure.""" - - ##//////////////////////////////////////////////////////////// - # { Constructor - ##//////////////////////////////////////////////////////////// - - def __new__(cls, features=None, **morefeatures): - """ - Construct and return a new feature structure. If this - constructor is called directly, then the returned feature - structure will be an instance of either the ``FeatDict`` class - or the ``FeatList`` class. - - :param features: The initial feature values for this feature - structure: - - - FeatStruct(string) -> FeatStructReader().read(string) - - FeatStruct(mapping) -> FeatDict(mapping) - - FeatStruct(sequence) -> FeatList(sequence) - - FeatStruct() -> FeatDict() - :param morefeatures: If ``features`` is a mapping or None, - then ``morefeatures`` provides additional features for the - ``FeatDict`` constructor. - """ - # If the FeatStruct constructor is called directly, then decide - # whether to create a FeatDict or a FeatList, based on the - # contents of the `features` argument. 
- if cls is FeatStruct: - if features is None: - return FeatDict.__new__(FeatDict, **morefeatures) - elif _is_mapping(features): - return FeatDict.__new__(FeatDict, features, **morefeatures) - elif morefeatures: - raise TypeError( - "Keyword arguments may only be specified " - "if features is None or is a mapping." - ) - if isinstance(features, str): - if FeatStructReader._START_FDICT_RE.match(features): - return FeatDict.__new__(FeatDict, features, **morefeatures) - else: - return FeatList.__new__(FeatList, features, **morefeatures) - elif _is_sequence(features): - return FeatList.__new__(FeatList, features) - else: - raise TypeError("Expected string or mapping or sequence") - - # Otherwise, construct the object as normal. - else: - return super().__new__(cls, features, **morefeatures) - - ##//////////////////////////////////////////////////////////// - # { Uniform Accessor Methods - ##//////////////////////////////////////////////////////////// - # These helper functions allow the methods defined by FeatStruct - # to treat all feature structures as mappings, even if they're - # really lists. (Lists are treated as mappings from ints to vals) - - def _keys(self): - """Return an iterable of the feature identifiers used by this - FeatStruct.""" - raise NotImplementedError() # Implemented by subclasses. - - def _values(self): - """Return an iterable of the feature values directly defined - by this FeatStruct.""" - raise NotImplementedError() # Implemented by subclasses. - - def _items(self): - """Return an iterable of (fid,fval) pairs, where fid is a - feature identifier and fval is the corresponding feature - value, for all features defined by this FeatStruct.""" - raise NotImplementedError() # Implemented by subclasses. - - ##//////////////////////////////////////////////////////////// - # { Equality & Hashing - ##//////////////////////////////////////////////////////////// - - def equal_values(self, other, check_reentrance=False): - """ - Return True if ``self`` and ``other`` assign the same value to - to every feature. In particular, return true if - ``self[p]==other[p]`` for every feature path *p* such - that ``self[p]`` or ``other[p]`` is a base value (i.e., - not a nested feature structure). - - :param check_reentrance: If True, then also return False if - there is any difference between the reentrances of ``self`` - and ``other``. - :note: the ``==`` is equivalent to ``equal_values()`` with - ``check_reentrance=True``. - """ - return self._equal(other, check_reentrance, set(), set(), set()) - - def __eq__(self, other): - """ - Return true if ``self`` and ``other`` are both feature structures, - assign the same values to all features, and contain the same - reentrances. I.e., return - ``self.equal_values(other, check_reentrance=True)``. - - :see: ``equal_values()`` - """ - return self._equal(other, True, set(), set(), set()) - - def __ne__(self, other): - return not self == other - - def __lt__(self, other): - if not isinstance(other, FeatStruct): - # raise_unorderable_types("<", self, other) - # Sometimes feature values can be pure strings, - # so we need to be able to compare with non-featstructs: - return self.__class__.__name__ < other.__class__.__name__ - else: - return len(self) < len(other) - - def __hash__(self): - """ - If this feature structure is frozen, return its hash value; - otherwise, raise ``TypeError``. 
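# Illustrative sketch (editor's addition) of the constructor dispatch shown above;
# imports assume the upstream nltk.featstruct module.
from nltk.featstruct import FeatDict, FeatList, FeatStruct

fd = FeatStruct(number="singular", person=3)   # mapping/kwargs -> FeatDict
fl = FeatStruct(["a", "b", "c"])               # sequence       -> FeatList
fs = FeatStruct("[agr=[num=sg, per=3]]")       # string         -> parsed FeatDict
assert isinstance(fd, FeatDict) and isinstance(fl, FeatList)
print(fs["agr", "num"])                        # feature-path access -> sg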
- """ - if not self._frozen: - raise TypeError("FeatStructs must be frozen before they " "can be hashed.") - try: - return self._hash - except AttributeError: - self._hash = self._calculate_hashvalue(set()) - return self._hash - - def _equal( - self, other, check_reentrance, visited_self, visited_other, visited_pairs - ): - """ - Return True iff self and other have equal values. - - :param visited_self: A set containing the ids of all ``self`` - feature structures we've already visited. - :param visited_other: A set containing the ids of all ``other`` - feature structures we've already visited. - :param visited_pairs: A set containing ``(selfid, otherid)`` pairs - for all pairs of feature structures we've already visited. - """ - # If we're the same object, then we're equal. - if self is other: - return True - - # If we have different classes, we're definitely not equal. - if self.__class__ != other.__class__: - return False - - # If we define different features, we're definitely not equal. - # (Perform len test first because it's faster -- we should - # do profiling to see if this actually helps) - if len(self) != len(other): - return False - if set(self._keys()) != set(other._keys()): - return False - - # If we're checking reentrance, then any time we revisit a - # structure, make sure that it was paired with the same - # feature structure that it is now. Note: if check_reentrance, - # then visited_pairs will never contain two pairs whose first - # values are equal, or two pairs whose second values are equal. - if check_reentrance: - if id(self) in visited_self or id(other) in visited_other: - return (id(self), id(other)) in visited_pairs - - # If we're not checking reentrance, then we still need to deal - # with cycles. If we encounter the same (self, other) pair a - # second time, then we won't learn anything more by examining - # their children a second time, so just return true. - else: - if (id(self), id(other)) in visited_pairs: - return True - - # Keep track of which nodes we've visited. - visited_self.add(id(self)) - visited_other.add(id(other)) - visited_pairs.add((id(self), id(other))) - - # Now we have to check all values. If any of them don't match, - # then return false. - for (fname, self_fval) in self._items(): - other_fval = other[fname] - if isinstance(self_fval, FeatStruct): - if not self_fval._equal( - other_fval, - check_reentrance, - visited_self, - visited_other, - visited_pairs, - ): - return False - else: - if self_fval != other_fval: - return False - - # Everything matched up; return true. - return True - - def _calculate_hashvalue(self, visited): - """ - Return a hash value for this feature structure. - - :require: ``self`` must be frozen. - :param visited: A set containing the ids of all feature - structures we've already visited while hashing. - """ - if id(self) in visited: - return 1 - visited.add(id(self)) - - hashval = 5831 - for (fname, fval) in sorted(self._items()): - hashval *= 37 - hashval += hash(fname) - hashval *= 37 - if isinstance(fval, FeatStruct): - hashval += fval._calculate_hashvalue(visited) - else: - hashval += hash(fval) - # Convert to a 32 bit int. - hashval = int(hashval & 0x7FFFFFFF) - return hashval - - ##//////////////////////////////////////////////////////////// - # { Freezing - ##//////////////////////////////////////////////////////////// - - #: Error message used by mutating methods when called on a frozen - #: feature structure. - _FROZEN_ERROR = "Frozen FeatStructs may not be modified." 
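# Sketch (editor's addition) of the equality and freezing behaviour documented
# above: equal_values() ignores reentrance by default, ``==`` does not, and a
# FeatStruct becomes hashable once frozen.
from nltk.featstruct import FeatStruct

shared = FeatStruct("[num=sg]")
reentrant = FeatStruct(subj=shared, obj=shared)     # one shared (reentrant) value
flat = FeatStruct(subj=FeatStruct("[num=sg]"), obj=FeatStruct("[num=sg]"))

print(reentrant.equal_values(flat))   # True  -- same values everywhere
print(reentrant == flat)              # False -- reentrancies differ
reentrant.freeze()
cache = {reentrant: "frozen FeatStructs are hashable"}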
- - def freeze(self): - """ - Make this feature structure, and any feature structures it - contains, immutable. Note: this method does not attempt to - 'freeze' any feature value that is not a ``FeatStruct``; it - is recommended that you use only immutable feature values. - """ - if self._frozen: - return - self._freeze(set()) - - def frozen(self): - """ - Return True if this feature structure is immutable. Feature - structures can be made immutable with the ``freeze()`` method. - Immutable feature structures may not be made mutable again, - but new mutable copies can be produced with the ``copy()`` method. - """ - return self._frozen - - def _freeze(self, visited): - """ - Make this feature structure, and any feature structure it - contains, immutable. - - :param visited: A set containing the ids of all feature - structures we've already visited while freezing. - """ - if id(self) in visited: - return - visited.add(id(self)) - self._frozen = True - for (fname, fval) in sorted(self._items()): - if isinstance(fval, FeatStruct): - fval._freeze(visited) - - ##//////////////////////////////////////////////////////////// - # { Copying - ##//////////////////////////////////////////////////////////// - - def copy(self, deep=True): - """ - Return a new copy of ``self``. The new copy will not be frozen. - - :param deep: If true, create a deep copy; if false, create - a shallow copy. - """ - if deep: - return copy.deepcopy(self) - else: - return self.__class__(self) - - # Subclasses should define __deepcopy__ to ensure that the new - # copy will not be frozen. - def __deepcopy__(self, memo): - raise NotImplementedError() # Implemented by subclasses. - - ##//////////////////////////////////////////////////////////// - # { Structural Information - ##//////////////////////////////////////////////////////////// - - def cyclic(self): - """ - Return True if this feature structure contains itself. - """ - return self._find_reentrances({})[id(self)] - - def walk(self): - """ - Return an iterator that generates this feature structure, and - each feature structure it contains. Each feature structure will - be generated exactly once. - """ - return self._walk(set()) - - def _walk(self, visited): - """ - Return an iterator that generates this feature structure, and - each feature structure it contains. - - :param visited: A set containing the ids of all feature - structures we've already visited while freezing. - """ - raise NotImplementedError() # Implemented by subclasses. - - def _walk(self, visited): - if id(self) in visited: - return - visited.add(id(self)) - yield self - for fval in self._values(): - if isinstance(fval, FeatStruct): - yield from fval._walk(visited) - - # Walk through the feature tree. The first time we see a feature - # value, map it to False (not reentrant). If we see a feature - # value more than once, then map it to True (reentrant). - def _find_reentrances(self, reentrances): - """ - Return a dictionary that maps from the ``id`` of each feature - structure contained in ``self`` (including ``self``) to a - boolean value, indicating whether it is reentrant or not. - """ - if id(self) in reentrances: - # We've seen it more than once. - reentrances[id(self)] = True - else: - # This is the first time we've seen it. - reentrances[id(self)] = False - - # Recurse to contained feature structures. 
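# Sketch (editor's addition) of the structural helpers above (copy, walk,
# cyclic); outputs in the comments are indicative.
from nltk.featstruct import FeatStruct

fs = FeatStruct("[a=[b=1]]")
shallow = fs.copy(deep=False)    # shares the nested [b=1] structure
deep = fs.copy(deep=True)        # fully independent copy
print(shallow["a"] is fs["a"], deep["a"] is fs["a"])   # True False
print(fs.cyclic())                                     # False: no path back to fs
print(len(list(fs.walk())))                            # 2: fs itself and fs['a']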
- for fval in self._values(): - if isinstance(fval, FeatStruct): - fval._find_reentrances(reentrances) - - return reentrances - - ##//////////////////////////////////////////////////////////// - # { Variables & Bindings - ##//////////////////////////////////////////////////////////// - - def substitute_bindings(self, bindings): - """:see: ``nltk.featstruct.substitute_bindings()``""" - return substitute_bindings(self, bindings) - - def retract_bindings(self, bindings): - """:see: ``nltk.featstruct.retract_bindings()``""" - return retract_bindings(self, bindings) - - def variables(self): - """:see: ``nltk.featstruct.find_variables()``""" - return find_variables(self) - - def rename_variables(self, vars=None, used_vars=(), new_vars=None): - """:see: ``nltk.featstruct.rename_variables()``""" - return rename_variables(self, vars, used_vars, new_vars) - - def remove_variables(self): - """ - Return the feature structure that is obtained by deleting - any feature whose value is a ``Variable``. - - :rtype: FeatStruct - """ - return remove_variables(self) - - ##//////////////////////////////////////////////////////////// - # { Unification - ##//////////////////////////////////////////////////////////// - - def unify(self, other, bindings=None, trace=False, fail=None, rename_vars=True): - return unify(self, other, bindings, trace, fail, rename_vars) - - def subsumes(self, other): - """ - Return True if ``self`` subsumes ``other``. I.e., return true - If unifying ``self`` with ``other`` would result in a feature - structure equal to ``other``. - """ - return subsumes(self, other) - - ##//////////////////////////////////////////////////////////// - # { String Representations - ##//////////////////////////////////////////////////////////// - - def __repr__(self): - """ - Display a single-line representation of this feature structure, - suitable for embedding in other representations. - """ - return self._repr(self._find_reentrances({}), {}) - - def _repr(self, reentrances, reentrance_ids): - """ - Return a string representation of this feature structure. - - :param reentrances: A dictionary that maps from the ``id`` of - each feature value in self, indicating whether that value - is reentrant or not. - :param reentrance_ids: A dictionary mapping from each ``id`` - of a feature value to a unique identifier. This is modified - by ``repr``: the first time a reentrant feature value is - displayed, an identifier is added to ``reentrance_ids`` for it. - """ - raise NotImplementedError() - - -# Mutation: disable if frozen. -_FROZEN_ERROR = "Frozen FeatStructs may not be modified." -_FROZEN_NOTICE = "\n%sIf self is frozen, raise ValueError." - - -def _check_frozen(method, indent=""): - """ - Given a method function, return a new method function that first - checks if ``self._frozen`` is true; and if so, raises ``ValueError`` - with an appropriate message. Otherwise, call the method and return - its result. - """ - - def wrapped(self, *args, **kwargs): - if self._frozen: - raise ValueError(_FROZEN_ERROR) - else: - return method(self, *args, **kwargs) - - wrapped.__name__ = method.__name__ - wrapped.__doc__ = (method.__doc__ or "") + (_FROZEN_NOTICE % indent) - return wrapped - - -###################################################################### -# Feature Dictionary -###################################################################### - - -class FeatDict(FeatStruct, dict): - """ - A feature structure that acts like a Python dictionary. 
I.e., a - mapping from feature identifiers to feature values, where a feature - identifier can be a string or a ``Feature``; and where a feature value - can be either a basic value (such as a string or an integer), or a nested - feature structure. A feature identifiers for a ``FeatDict`` is - sometimes called a "feature name". - - Two feature dicts are considered equal if they assign the same - values to all features, and have the same reentrances. - - :see: ``FeatStruct`` for information about feature paths, reentrance, - cyclic feature structures, mutability, freezing, and hashing. - """ - - def __init__(self, features=None, **morefeatures): - """ - Create a new feature dictionary, with the specified features. - - :param features: The initial value for this feature - dictionary. If ``features`` is a ``FeatStruct``, then its - features are copied (shallow copy). If ``features`` is a - dict, then a feature is created for each item, mapping its - key to its value. If ``features`` is a string, then it is - processed using ``FeatStructReader``. If ``features`` is a list of - tuples ``(name, val)``, then a feature is created for each tuple. - :param morefeatures: Additional features for the new feature - dictionary. If a feature is listed under both ``features`` and - ``morefeatures``, then the value from ``morefeatures`` will be - used. - """ - if isinstance(features, str): - FeatStructReader().fromstring(features, self) - self.update(**morefeatures) - else: - # update() checks the types of features. - self.update(features, **morefeatures) - - # //////////////////////////////////////////////////////////// - # { Dict methods - # //////////////////////////////////////////////////////////// - _INDEX_ERROR = "Expected feature name or path. Got %r." - - def __getitem__(self, name_or_path): - """If the feature with the given name or path exists, return - its value; otherwise, raise ``KeyError``.""" - if isinstance(name_or_path, (str, Feature)): - return dict.__getitem__(self, name_or_path) - elif isinstance(name_or_path, tuple): - try: - val = self - for fid in name_or_path: - if not isinstance(val, FeatStruct): - raise KeyError # path contains base value - val = val[fid] - return val - except (KeyError, IndexError) as e: - raise KeyError(name_or_path) from e - else: - raise TypeError(self._INDEX_ERROR % name_or_path) - - def get(self, name_or_path, default=None): - """If the feature with the given name or path exists, return its - value; otherwise, return ``default``.""" - try: - return self[name_or_path] - except KeyError: - return default - - def __contains__(self, name_or_path): - """Return true if a feature with the given name or path exists.""" - try: - self[name_or_path] - return True - except KeyError: - return False - - def has_key(self, name_or_path): - """Return true if a feature with the given name or path exists.""" - return name_or_path in self - - def __delitem__(self, name_or_path): - """If the feature with the given name or path exists, delete - its value; otherwise, raise ``KeyError``.""" - if self._frozen: - raise ValueError(_FROZEN_ERROR) - if isinstance(name_or_path, (str, Feature)): - return dict.__delitem__(self, name_or_path) - elif isinstance(name_or_path, tuple): - if len(name_or_path) == 0: - raise ValueError("The path () can not be set") - else: - parent = self[name_or_path[:-1]] - if not isinstance(parent, FeatStruct): - raise KeyError(name_or_path) # path contains base value - del parent[name_or_path[-1]] - else: - raise TypeError(self._INDEX_ERROR % name_or_path) - - 
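# Sketch (editor's addition) of feature-path indexing on a FeatDict, per the dict
# methods above: a tuple of identifiers is applied left to right.
from nltk.featstruct import FeatStruct

fd = FeatStruct("[agr=[num=sg, per=3], head=noun]")
print(fd[("agr", "num")])                          # sg, same as fd["agr"]["num"]
print(fd.get(("agr", "gender"), "unspecified"))    # default for a missing path
print(("agr", "per") in fd)                        # True
del fd[("agr", "per")]                             # delete through a path
fd[("agr", "case")] = "nom"                        # assignment too (see __setitem__ below)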
def __setitem__(self, name_or_path, value): - """Set the value for the feature with the given name or path - to ``value``. If ``name_or_path`` is an invalid path, raise - ``KeyError``.""" - if self._frozen: - raise ValueError(_FROZEN_ERROR) - if isinstance(name_or_path, (str, Feature)): - return dict.__setitem__(self, name_or_path, value) - elif isinstance(name_or_path, tuple): - if len(name_or_path) == 0: - raise ValueError("The path () can not be set") - else: - parent = self[name_or_path[:-1]] - if not isinstance(parent, FeatStruct): - raise KeyError(name_or_path) # path contains base value - parent[name_or_path[-1]] = value - else: - raise TypeError(self._INDEX_ERROR % name_or_path) - - clear = _check_frozen(dict.clear) - pop = _check_frozen(dict.pop) - popitem = _check_frozen(dict.popitem) - setdefault = _check_frozen(dict.setdefault) - - def update(self, features=None, **morefeatures): - if self._frozen: - raise ValueError(_FROZEN_ERROR) - if features is None: - items = () - elif hasattr(features, "items") and callable(features.items): - items = features.items() - elif hasattr(features, "__iter__"): - items = features - else: - raise ValueError("Expected mapping or list of tuples") - - for key, val in items: - if not isinstance(key, (str, Feature)): - raise TypeError("Feature names must be strings") - self[key] = val - for key, val in morefeatures.items(): - if not isinstance(key, (str, Feature)): - raise TypeError("Feature names must be strings") - self[key] = val - - ##//////////////////////////////////////////////////////////// - # { Copying - ##//////////////////////////////////////////////////////////// - - def __deepcopy__(self, memo): - memo[id(self)] = selfcopy = self.__class__() - for (key, val) in self._items(): - selfcopy[copy.deepcopy(key, memo)] = copy.deepcopy(val, memo) - return selfcopy - - ##//////////////////////////////////////////////////////////// - # { Uniform Accessor Methods - ##//////////////////////////////////////////////////////////// - - def _keys(self): - return self.keys() - - def _values(self): - return self.values() - - def _items(self): - return self.items() - - ##//////////////////////////////////////////////////////////// - # { String Representations - ##//////////////////////////////////////////////////////////// - - def __str__(self): - """ - Display a multi-line representation of this feature dictionary - as an FVM (feature value matrix). - """ - return "\n".join(self._str(self._find_reentrances({}), {})) - - def _repr(self, reentrances, reentrance_ids): - segments = [] - prefix = "" - suffix = "" - - # If this is the first time we've seen a reentrant structure, - # then assign it a unique identifier. - if reentrances[id(self)]: - assert id(self) not in reentrance_ids - reentrance_ids[id(self)] = repr(len(reentrance_ids) + 1) - - # sorting note: keys are unique strings, so we'll never fall - # through to comparing values. 
- for (fname, fval) in sorted(self.items()): - display = getattr(fname, "display", None) - if id(fval) in reentrance_ids: - segments.append(f"{fname}->({reentrance_ids[id(fval)]})") - elif ( - display == "prefix" and not prefix and isinstance(fval, (Variable, str)) - ): - prefix = "%s" % fval - elif display == "slash" and not suffix: - if isinstance(fval, Variable): - suffix = "/%s" % fval.name - else: - suffix = "/%s" % repr(fval) - elif isinstance(fval, Variable): - segments.append(f"{fname}={fval.name}") - elif fval is True: - segments.append("+%s" % fname) - elif fval is False: - segments.append("-%s" % fname) - elif isinstance(fval, Expression): - segments.append(f"{fname}=<{fval}>") - elif not isinstance(fval, FeatStruct): - segments.append(f"{fname}={repr(fval)}") - else: - fval_repr = fval._repr(reentrances, reentrance_ids) - segments.append(f"{fname}={fval_repr}") - # If it's reentrant, then add on an identifier tag. - if reentrances[id(self)]: - prefix = f"({reentrance_ids[id(self)]}){prefix}" - return "{}[{}]{}".format(prefix, ", ".join(segments), suffix) - - def _str(self, reentrances, reentrance_ids): - """ - :return: A list of lines composing a string representation of - this feature dictionary. - :param reentrances: A dictionary that maps from the ``id`` of - each feature value in self, indicating whether that value - is reentrant or not. - :param reentrance_ids: A dictionary mapping from each ``id`` - of a feature value to a unique identifier. This is modified - by ``repr``: the first time a reentrant feature value is - displayed, an identifier is added to ``reentrance_ids`` for - it. - """ - # If this is the first time we've seen a reentrant structure, - # then tack on an id string. - if reentrances[id(self)]: - assert id(self) not in reentrance_ids - reentrance_ids[id(self)] = repr(len(reentrance_ids) + 1) - - # Special case: empty feature dict. - if len(self) == 0: - if reentrances[id(self)]: - return ["(%s) []" % reentrance_ids[id(self)]] - else: - return ["[]"] - - # What's the longest feature name? Use this to align names. - maxfnamelen = max(len("%s" % k) for k in self.keys()) - - lines = [] - # sorting note: keys are unique strings, so we'll never fall - # through to comparing values. - for (fname, fval) in sorted(self.items()): - fname = ("%s" % fname).ljust(maxfnamelen) - if isinstance(fval, Variable): - lines.append(f"{fname} = {fval.name}") - - elif isinstance(fval, Expression): - lines.append(f"{fname} = <{fval}>") - - elif isinstance(fval, FeatList): - fval_repr = fval._repr(reentrances, reentrance_ids) - lines.append(f"{fname} = {repr(fval_repr)}") - - elif not isinstance(fval, FeatDict): - # It's not a nested feature structure -- just print it. - lines.append(f"{fname} = {repr(fval)}") - - elif id(fval) in reentrance_ids: - # It's a feature structure we've seen before -- print - # the reentrance id. - lines.append(f"{fname} -> ({reentrance_ids[id(fval)]})") - - else: - # It's a new feature structure. Separate it from - # other values by a blank line. - if lines and lines[-1] != "": - lines.append("") - - # Recursively print the feature's value (fval). - fval_lines = fval._str(reentrances, reentrance_ids) - - # Indent each line to make room for fname. - fval_lines = [(" " * (maxfnamelen + 3)) + l for l in fval_lines] - - # Pick which line we'll display fname on, & splice it in. - nameline = (len(fval_lines) - 1) // 2 - fval_lines[nameline] = ( - fname + " =" + fval_lines[nameline][maxfnamelen + 2 :] - ) - - # Add the feature structure to the output. 
- lines += fval_lines - - # Separate FeatStructs by a blank line. - lines.append("") - - # Get rid of any excess blank lines. - if lines[-1] == "": - lines.pop() - - # Add brackets around everything. - maxlen = max(len(line) for line in lines) - lines = ["[ {}{} ]".format(line, " " * (maxlen - len(line))) for line in lines] - - # If it's reentrant, then add on an identifier tag. - if reentrances[id(self)]: - idstr = "(%s) " % reentrance_ids[id(self)] - lines = [(" " * len(idstr)) + l for l in lines] - idline = (len(lines) - 1) // 2 - lines[idline] = idstr + lines[idline][len(idstr) :] - - return lines - - -###################################################################### -# Feature List -###################################################################### - - -class FeatList(FeatStruct, list): - """ - A list of feature values, where each feature value is either a - basic value (such as a string or an integer), or a nested feature - structure. - - Feature lists may contain reentrant feature values. A "reentrant - feature value" is a single feature value that can be accessed via - multiple feature paths. Feature lists may also be cyclic. - - Two feature lists are considered equal if they assign the same - values to all features, and have the same reentrances. - - :see: ``FeatStruct`` for information about feature paths, reentrance, - cyclic feature structures, mutability, freezing, and hashing. - """ - - def __init__(self, features=()): - """ - Create a new feature list, with the specified features. - - :param features: The initial list of features for this feature - list. If ``features`` is a string, then it is paresd using - ``FeatStructReader``. Otherwise, it should be a sequence - of basic values and nested feature structures. - """ - if isinstance(features, str): - FeatStructReader().fromstring(features, self) - else: - list.__init__(self, features) - - # //////////////////////////////////////////////////////////// - # { List methods - # //////////////////////////////////////////////////////////// - _INDEX_ERROR = "Expected int or feature path. Got %r." - - def __getitem__(self, name_or_path): - if isinstance(name_or_path, int): - return list.__getitem__(self, name_or_path) - elif isinstance(name_or_path, tuple): - try: - val = self - for fid in name_or_path: - if not isinstance(val, FeatStruct): - raise KeyError # path contains base value - val = val[fid] - return val - except (KeyError, IndexError) as e: - raise KeyError(name_or_path) from e - else: - raise TypeError(self._INDEX_ERROR % name_or_path) - - def __delitem__(self, name_or_path): - """If the feature with the given name or path exists, delete - its value; otherwise, raise ``KeyError``.""" - if self._frozen: - raise ValueError(_FROZEN_ERROR) - if isinstance(name_or_path, (int, slice)): - return list.__delitem__(self, name_or_path) - elif isinstance(name_or_path, tuple): - if len(name_or_path) == 0: - raise ValueError("The path () can not be set") - else: - parent = self[name_or_path[:-1]] - if not isinstance(parent, FeatStruct): - raise KeyError(name_or_path) # path contains base value - del parent[name_or_path[-1]] - else: - raise TypeError(self._INDEX_ERROR % name_or_path) - - def __setitem__(self, name_or_path, value): - """Set the value for the feature with the given name or path - to ``value``. 
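# Sketch (editor's addition) of FeatList behaviour: integer indices and feature
# paths both work, and the frozen check (wrapped mutators just below) rejects
# mutation after freeze().
from nltk.featstruct import FeatList, FeatStruct

fl = FeatList([1, 2, FeatStruct("[x=3]")])
print(fl[0], fl[(2, "x")])      # 1 3 -- plain index or feature path
fl.append(4)                    # fine while the list is still mutable
fl.freeze()
try:
    fl.append(5)
except ValueError:
    print("frozen FeatLists may not be modified")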
If ``name_or_path`` is an invalid path, raise - ``KeyError``.""" - if self._frozen: - raise ValueError(_FROZEN_ERROR) - if isinstance(name_or_path, (int, slice)): - return list.__setitem__(self, name_or_path, value) - elif isinstance(name_or_path, tuple): - if len(name_or_path) == 0: - raise ValueError("The path () can not be set") - else: - parent = self[name_or_path[:-1]] - if not isinstance(parent, FeatStruct): - raise KeyError(name_or_path) # path contains base value - parent[name_or_path[-1]] = value - else: - raise TypeError(self._INDEX_ERROR % name_or_path) - - # __delslice__ = _check_frozen(list.__delslice__, ' ') - # __setslice__ = _check_frozen(list.__setslice__, ' ') - __iadd__ = _check_frozen(list.__iadd__) - __imul__ = _check_frozen(list.__imul__) - append = _check_frozen(list.append) - extend = _check_frozen(list.extend) - insert = _check_frozen(list.insert) - pop = _check_frozen(list.pop) - remove = _check_frozen(list.remove) - reverse = _check_frozen(list.reverse) - sort = _check_frozen(list.sort) - - ##//////////////////////////////////////////////////////////// - # { Copying - ##//////////////////////////////////////////////////////////// - - def __deepcopy__(self, memo): - memo[id(self)] = selfcopy = self.__class__() - selfcopy.extend(copy.deepcopy(fval, memo) for fval in self) - return selfcopy - - ##//////////////////////////////////////////////////////////// - # { Uniform Accessor Methods - ##//////////////////////////////////////////////////////////// - - def _keys(self): - return list(range(len(self))) - - def _values(self): - return self - - def _items(self): - return enumerate(self) - - ##//////////////////////////////////////////////////////////// - # { String Representations - ##//////////////////////////////////////////////////////////// - - # Special handling for: reentrances, variables, expressions. - def _repr(self, reentrances, reentrance_ids): - # If this is the first time we've seen a reentrant structure, - # then assign it a unique identifier. - if reentrances[id(self)]: - assert id(self) not in reentrance_ids - reentrance_ids[id(self)] = repr(len(reentrance_ids) + 1) - prefix = "(%s)" % reentrance_ids[id(self)] - else: - prefix = "" - - segments = [] - for fval in self: - if id(fval) in reentrance_ids: - segments.append("->(%s)" % reentrance_ids[id(fval)]) - elif isinstance(fval, Variable): - segments.append(fval.name) - elif isinstance(fval, Expression): - segments.append("%s" % fval) - elif isinstance(fval, FeatStruct): - segments.append(fval._repr(reentrances, reentrance_ids)) - else: - segments.append("%s" % repr(fval)) - - return "{}[{}]".format(prefix, ", ".join(segments)) - - -###################################################################### -# Variables & Bindings -###################################################################### - - -def substitute_bindings(fstruct, bindings, fs_class="default"): - """ - Return the feature structure that is obtained by replacing each - variable bound by ``bindings`` with its binding. If a variable is - aliased to a bound variable, then it will be replaced by that - variable's value. If a variable is aliased to an unbound - variable, then it will be replaced by that variable. - - :type bindings: dict(Variable -> any) - :param bindings: A dictionary mapping from variables to values. 
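# Sketch (editor's addition) of substitute_bindings(): bound variables are
# replaced by their values on a copy, unbound ones are left in place.  The
# output shown in the comment is indicative.
from nltk.featstruct import FeatStruct, substitute_bindings
from nltk.sem.logic import Variable

fs = FeatStruct("[agr=[gen=?g, num=?n]]")
bindings = {Variable("?n"): "sg"}                  # ?g stays unbound
print(repr(substitute_bindings(fs, bindings)))     # [agr=[gen=?g, num='sg']]
print(repr(fs))                                    # the original is not modified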
- """ - if fs_class == "default": - fs_class = _default_fs_class(fstruct) - fstruct = copy.deepcopy(fstruct) - _substitute_bindings(fstruct, bindings, fs_class, set()) - return fstruct - - -def _substitute_bindings(fstruct, bindings, fs_class, visited): - # Visit each node only once: - if id(fstruct) in visited: - return - visited.add(id(fstruct)) - - if _is_mapping(fstruct): - items = fstruct.items() - elif _is_sequence(fstruct): - items = enumerate(fstruct) - else: - raise ValueError("Expected mapping or sequence") - for (fname, fval) in items: - while isinstance(fval, Variable) and fval in bindings: - fval = fstruct[fname] = bindings[fval] - if isinstance(fval, fs_class): - _substitute_bindings(fval, bindings, fs_class, visited) - elif isinstance(fval, SubstituteBindingsI): - fstruct[fname] = fval.substitute_bindings(bindings) - - -def retract_bindings(fstruct, bindings, fs_class="default"): - """ - Return the feature structure that is obtained by replacing each - feature structure value that is bound by ``bindings`` with the - variable that binds it. A feature structure value must be - identical to a bound value (i.e., have equal id) to be replaced. - - ``bindings`` is modified to point to this new feature structure, - rather than the original feature structure. Feature structure - values in ``bindings`` may be modified if they are contained in - ``fstruct``. - """ - if fs_class == "default": - fs_class = _default_fs_class(fstruct) - (fstruct, new_bindings) = copy.deepcopy((fstruct, bindings)) - bindings.update(new_bindings) - inv_bindings = {id(val): var for (var, val) in bindings.items()} - _retract_bindings(fstruct, inv_bindings, fs_class, set()) - return fstruct - - -def _retract_bindings(fstruct, inv_bindings, fs_class, visited): - # Visit each node only once: - if id(fstruct) in visited: - return - visited.add(id(fstruct)) - - if _is_mapping(fstruct): - items = fstruct.items() - elif _is_sequence(fstruct): - items = enumerate(fstruct) - else: - raise ValueError("Expected mapping or sequence") - for (fname, fval) in items: - if isinstance(fval, fs_class): - if id(fval) in inv_bindings: - fstruct[fname] = inv_bindings[id(fval)] - _retract_bindings(fval, inv_bindings, fs_class, visited) - - -def find_variables(fstruct, fs_class="default"): - """ - :return: The set of variables used by this feature structure. - :rtype: set(Variable) - """ - if fs_class == "default": - fs_class = _default_fs_class(fstruct) - return _variables(fstruct, set(), fs_class, set()) - - -def _variables(fstruct, vars, fs_class, visited): - # Visit each node only once: - if id(fstruct) in visited: - return - visited.add(id(fstruct)) - if _is_mapping(fstruct): - items = fstruct.items() - elif _is_sequence(fstruct): - items = enumerate(fstruct) - else: - raise ValueError("Expected mapping or sequence") - for (fname, fval) in items: - if isinstance(fval, Variable): - vars.add(fval) - elif isinstance(fval, fs_class): - _variables(fval, vars, fs_class, visited) - elif isinstance(fval, SubstituteBindingsI): - vars.update(fval.variables()) - return vars - - -def rename_variables( - fstruct, vars=None, used_vars=(), new_vars=None, fs_class="default" -): - """ - Return the feature structure that is obtained by replacing - any of this feature structure's variables that are in ``vars`` - with new variables. The names for these new variables will be - names that are not used by any variable in ``vars``, or in - ``used_vars``, or in this feature structure. 
- - :type vars: set - :param vars: The set of variables that should be renamed. - If not specified, ``find_variables(fstruct)`` is used; i.e., all - variables will be given new names. - :type used_vars: set - :param used_vars: A set of variables whose names should not be - used by the new variables. - :type new_vars: dict(Variable -> Variable) - :param new_vars: A dictionary that is used to hold the mapping - from old variables to new variables. For each variable *v* - in this feature structure: - - - If ``new_vars`` maps *v* to *v'*, then *v* will be - replaced by *v'*. - - If ``new_vars`` does not contain *v*, but ``vars`` - does contain *v*, then a new entry will be added to - ``new_vars``, mapping *v* to the new variable that is used - to replace it. - - To consistently rename the variables in a set of feature - structures, simply apply rename_variables to each one, using - the same dictionary: - - >>> from nltk.featstruct import FeatStruct - >>> fstruct1 = FeatStruct('[subj=[agr=[gender=?y]], obj=[agr=[gender=?y]]]') - >>> fstruct2 = FeatStruct('[subj=[agr=[number=?z,gender=?y]], obj=[agr=[number=?z,gender=?y]]]') - >>> new_vars = {} # Maps old vars to alpha-renamed vars - >>> fstruct1.rename_variables(new_vars=new_vars) - [obj=[agr=[gender=?y2]], subj=[agr=[gender=?y2]]] - >>> fstruct2.rename_variables(new_vars=new_vars) - [obj=[agr=[gender=?y2, number=?z2]], subj=[agr=[gender=?y2, number=?z2]]] - - If new_vars is not specified, then an empty dictionary is used. - """ - if fs_class == "default": - fs_class = _default_fs_class(fstruct) - - # Default values: - if new_vars is None: - new_vars = {} - if vars is None: - vars = find_variables(fstruct, fs_class) - else: - vars = set(vars) - - # Add our own variables to used_vars. - used_vars = find_variables(fstruct, fs_class).union(used_vars) - - # Copy ourselves, and rename variables in the copy. - return _rename_variables( - copy.deepcopy(fstruct), vars, used_vars, new_vars, fs_class, set() - ) - - -def _rename_variables(fstruct, vars, used_vars, new_vars, fs_class, visited): - if id(fstruct) in visited: - return - visited.add(id(fstruct)) - if _is_mapping(fstruct): - items = fstruct.items() - elif _is_sequence(fstruct): - items = enumerate(fstruct) - else: - raise ValueError("Expected mapping or sequence") - for (fname, fval) in items: - if isinstance(fval, Variable): - # If it's in new_vars, then rebind it. - if fval in new_vars: - fstruct[fname] = new_vars[fval] - # If it's in vars, pick a new name for it. - elif fval in vars: - new_vars[fval] = _rename_variable(fval, used_vars) - fstruct[fname] = new_vars[fval] - used_vars.add(new_vars[fval]) - elif isinstance(fval, fs_class): - _rename_variables(fval, vars, used_vars, new_vars, fs_class, visited) - elif isinstance(fval, SubstituteBindingsI): - # Pick new names for any variables in `vars` - for var in fval.variables(): - if var in vars and var not in new_vars: - new_vars[var] = _rename_variable(var, used_vars) - used_vars.add(new_vars[var]) - # Replace all variables in `new_vars`. - fstruct[fname] = fval.substitute_bindings(new_vars) - return fstruct - - -def _rename_variable(var, used_vars): - name, n = re.sub(r"\d+$", "", var.name), 2 - if not name: - name = "?" - while Variable(f"{name}{n}") in used_vars: - n += 1 - return Variable(f"{name}{n}") - - -def remove_variables(fstruct, fs_class="default"): - """ - :rtype: FeatStruct - :return: The feature structure that is obtained by deleting - all features whose values are ``Variables``. 
- """ - if fs_class == "default": - fs_class = _default_fs_class(fstruct) - return _remove_variables(copy.deepcopy(fstruct), fs_class, set()) - - -def _remove_variables(fstruct, fs_class, visited): - if id(fstruct) in visited: - return - visited.add(id(fstruct)) - - if _is_mapping(fstruct): - items = list(fstruct.items()) - elif _is_sequence(fstruct): - items = list(enumerate(fstruct)) - else: - raise ValueError("Expected mapping or sequence") - - for (fname, fval) in items: - if isinstance(fval, Variable): - del fstruct[fname] - elif isinstance(fval, fs_class): - _remove_variables(fval, fs_class, visited) - return fstruct - - -###################################################################### -# Unification -###################################################################### - - -class _UnificationFailure: - def __repr__(self): - return "nltk.featstruct.UnificationFailure" - - -UnificationFailure = _UnificationFailure() -"""A unique value used to indicate unification failure. It can be - returned by ``Feature.unify_base_values()`` or by custom ``fail()`` - functions to indicate that unificaiton should fail.""" - - -# The basic unification algorithm: -# 1. Make copies of self and other (preserving reentrance) -# 2. Destructively unify self and other -# 3. Apply forward pointers, to preserve reentrance. -# 4. Replace bound variables with their values. -def unify( - fstruct1, - fstruct2, - bindings=None, - trace=False, - fail=None, - rename_vars=True, - fs_class="default", -): - """ - Unify ``fstruct1`` with ``fstruct2``, and return the resulting feature - structure. This unified feature structure is the minimal - feature structure that contains all feature value assignments from both - ``fstruct1`` and ``fstruct2``, and that preserves all reentrancies. - - If no such feature structure exists (because ``fstruct1`` and - ``fstruct2`` specify incompatible values for some feature), then - unification fails, and ``unify`` returns None. - - Bound variables are replaced by their values. Aliased - variables are replaced by their representative variable - (if unbound) or the value of their representative variable - (if bound). I.e., if variable *v* is in ``bindings``, - then *v* is replaced by ``bindings[v]``. This will - be repeated until the variable is replaced by an unbound - variable or a non-variable value. - - Unbound variables are bound when they are unified with - values; and aliased when they are unified with variables. - I.e., if variable *v* is not in ``bindings``, and is - unified with a variable or value *x*, then - ``bindings[v]`` is set to *x*. - - If ``bindings`` is unspecified, then all variables are - assumed to be unbound. I.e., ``bindings`` defaults to an - empty dict. - - >>> from nltk.featstruct import FeatStruct - >>> FeatStruct('[a=?x]').unify(FeatStruct('[b=?x]')) - [a=?x, b=?x2] - - :type bindings: dict(Variable -> any) - :param bindings: A set of variable bindings to be used and - updated during unification. - :type trace: bool - :param trace: If true, generate trace output. - :type rename_vars: bool - :param rename_vars: If True, then rename any variables in - ``fstruct2`` that are also used in ``fstruct1``, in order to - avoid collisions on variable names. - """ - # Decide which class(es) will be treated as feature structures, - # for the purposes of unification. 
- if fs_class == "default": - fs_class = _default_fs_class(fstruct1) - if _default_fs_class(fstruct2) != fs_class: - raise ValueError( - "Mixing FeatStruct objects with Python " - "dicts and lists is not supported." - ) - assert isinstance(fstruct1, fs_class) - assert isinstance(fstruct2, fs_class) - - # If bindings are unspecified, use an empty set of bindings. - user_bindings = bindings is not None - if bindings is None: - bindings = {} - - # Make copies of fstruct1 and fstruct2 (since the unification - # algorithm is destructive). Do it all at once, to preserve - # reentrance links between fstruct1 and fstruct2. Copy bindings - # as well, in case there are any bound vars that contain parts - # of fstruct1 or fstruct2. - (fstruct1copy, fstruct2copy, bindings_copy) = copy.deepcopy( - (fstruct1, fstruct2, bindings) - ) - - # Copy the bindings back to the original bindings dict. - bindings.update(bindings_copy) - - if rename_vars: - vars1 = find_variables(fstruct1copy, fs_class) - vars2 = find_variables(fstruct2copy, fs_class) - _rename_variables(fstruct2copy, vars1, vars2, {}, fs_class, set()) - - # Do the actual unification. If it fails, return None. - forward = {} - if trace: - _trace_unify_start((), fstruct1copy, fstruct2copy) - try: - result = _destructively_unify( - fstruct1copy, fstruct2copy, bindings, forward, trace, fail, fs_class, () - ) - except _UnificationFailureError: - return None - - # _destructively_unify might return UnificationFailure, e.g. if we - # tried to unify a mapping with a sequence. - if result is UnificationFailure: - if fail is None: - return None - else: - return fail(fstruct1copy, fstruct2copy, ()) - - # Replace any feature structure that has a forward pointer - # with the target of its forward pointer. - result = _apply_forwards(result, forward, fs_class, set()) - if user_bindings: - _apply_forwards_to_bindings(forward, bindings) - - # Replace bound vars with values. - _resolve_aliases(bindings) - _substitute_bindings(result, bindings, fs_class, set()) - - # Return the result. - if trace: - _trace_unify_succeed((), result) - if trace: - _trace_bindings((), bindings) - return result - - -class _UnificationFailureError(Exception): - """An exception that is used by ``_destructively_unify`` to abort - unification when a failure is encountered.""" - - -def _destructively_unify( - fstruct1, fstruct2, bindings, forward, trace, fail, fs_class, path -): - """ - Attempt to unify ``fstruct1`` and ``fstruct2`` by modifying them - in-place. If the unification succeeds, then ``fstruct1`` will - contain the unified value, the value of ``fstruct2`` is undefined, - and forward[id(fstruct2)] is set to fstruct1. If the unification - fails, then a _UnificationFailureError is raised, and the - values of ``fstruct1`` and ``fstruct2`` are undefined. - - :param bindings: A dictionary mapping variables to values. - :param forward: A dictionary mapping feature structures ids - to replacement structures. When two feature structures - are merged, a mapping from one to the other will be added - to the forward dictionary; and changes will be made only - to the target of the forward dictionary. - ``_destructively_unify`` will always 'follow' any links - in the forward dictionary for fstruct1 and fstruct2 before - actually unifying them. - :param trace: If true, generate trace output - :param path: The feature path that led us to this unification - step. Used for trace output. - """ - # If fstruct1 is already identical to fstruct2, we're done. 
- # Note: this, together with the forward pointers, ensures - # that unification will terminate even for cyclic structures. - if fstruct1 is fstruct2: - if trace: - _trace_unify_identity(path, fstruct1) - return fstruct1 - - # Set fstruct2's forward pointer to point to fstruct1; this makes - # fstruct1 the canonical copy for fstruct2. Note that we need to - # do this before we recurse into any child structures, in case - # they're cyclic. - forward[id(fstruct2)] = fstruct1 - - # Unifying two mappings: - if _is_mapping(fstruct1) and _is_mapping(fstruct2): - for fname in fstruct1: - if getattr(fname, "default", None) is not None: - fstruct2.setdefault(fname, fname.default) - for fname in fstruct2: - if getattr(fname, "default", None) is not None: - fstruct1.setdefault(fname, fname.default) - - # Unify any values that are defined in both fstruct1 and - # fstruct2. Copy any values that are defined in fstruct2 but - # not in fstruct1 to fstruct1. Note: sorting fstruct2's - # features isn't actually necessary; but we do it to give - # deterministic behavior, e.g. for tracing. - for fname, fval2 in sorted(fstruct2.items()): - if fname in fstruct1: - fstruct1[fname] = _unify_feature_values( - fname, - fstruct1[fname], - fval2, - bindings, - forward, - trace, - fail, - fs_class, - path + (fname,), - ) - else: - fstruct1[fname] = fval2 - - return fstruct1 # Contains the unified value. - - # Unifying two sequences: - elif _is_sequence(fstruct1) and _is_sequence(fstruct2): - # If the lengths don't match, fail. - if len(fstruct1) != len(fstruct2): - return UnificationFailure - - # Unify corresponding values in fstruct1 and fstruct2. - for findex in range(len(fstruct1)): - fstruct1[findex] = _unify_feature_values( - findex, - fstruct1[findex], - fstruct2[findex], - bindings, - forward, - trace, - fail, - fs_class, - path + (findex,), - ) - - return fstruct1 # Contains the unified value. - - # Unifying sequence & mapping: fail. The failure function - # doesn't get a chance to recover in this case. - elif (_is_sequence(fstruct1) or _is_mapping(fstruct1)) and ( - _is_sequence(fstruct2) or _is_mapping(fstruct2) - ): - return UnificationFailure - - # Unifying anything else: not allowed! - raise TypeError("Expected mappings or sequences") - - -def _unify_feature_values( - fname, fval1, fval2, bindings, forward, trace, fail, fs_class, fpath -): - """ - Attempt to unify ``fval1`` and and ``fval2``, and return the - resulting unified value. The method of unification will depend on - the types of ``fval1`` and ``fval2``: - - 1. If they're both feature structures, then destructively - unify them (see ``_destructively_unify()``. - 2. If they're both unbound variables, then alias one variable - to the other (by setting bindings[v2]=v1). - 3. If one is an unbound variable, and the other is a value, - then bind the unbound variable to the value. - 4. If one is a feature structure, and the other is a base value, - then fail. - 5. If they're both base values, then unify them. By default, - this will succeed if they are equal, and fail otherwise. - """ - if trace: - _trace_unify_start(fpath, fval1, fval2) - - # Look up the "canonical" copy of fval1 and fval2 - while id(fval1) in forward: - fval1 = forward[id(fval1)] - while id(fval2) in forward: - fval2 = forward[id(fval2)] - - # If fval1 or fval2 is a bound variable, then - # replace it by the variable's bound value. This - # includes aliased variables, which are encoded as - # variables bound to other variables. 
- fvar1 = fvar2 = None - while isinstance(fval1, Variable) and fval1 in bindings: - fvar1 = fval1 - fval1 = bindings[fval1] - while isinstance(fval2, Variable) and fval2 in bindings: - fvar2 = fval2 - fval2 = bindings[fval2] - - # Case 1: Two feature structures (recursive case) - if isinstance(fval1, fs_class) and isinstance(fval2, fs_class): - result = _destructively_unify( - fval1, fval2, bindings, forward, trace, fail, fs_class, fpath - ) - - # Case 2: Two unbound variables (create alias) - elif isinstance(fval1, Variable) and isinstance(fval2, Variable): - if fval1 != fval2: - bindings[fval2] = fval1 - result = fval1 - - # Case 3: An unbound variable and a value (bind) - elif isinstance(fval1, Variable): - bindings[fval1] = fval2 - result = fval1 - elif isinstance(fval2, Variable): - bindings[fval2] = fval1 - result = fval2 - - # Case 4: A feature structure & a base value (fail) - elif isinstance(fval1, fs_class) or isinstance(fval2, fs_class): - result = UnificationFailure - - # Case 5: Two base values - else: - # Case 5a: Feature defines a custom unification method for base values - if isinstance(fname, Feature): - result = fname.unify_base_values(fval1, fval2, bindings) - # Case 5b: Feature value defines custom unification method - elif isinstance(fval1, CustomFeatureValue): - result = fval1.unify(fval2) - # Sanity check: unify value should be symmetric - if isinstance(fval2, CustomFeatureValue) and result != fval2.unify(fval1): - raise AssertionError( - "CustomFeatureValue objects %r and %r disagree " - "about unification value: %r vs. %r" - % (fval1, fval2, result, fval2.unify(fval1)) - ) - elif isinstance(fval2, CustomFeatureValue): - result = fval2.unify(fval1) - # Case 5c: Simple values -- check if they're equal. - else: - if fval1 == fval2: - result = fval1 - else: - result = UnificationFailure - - # If either value was a bound variable, then update the - # bindings. (This is really only necessary if fname is a - # Feature or if either value is a CustomFeatureValue.) - if result is not UnificationFailure: - if fvar1 is not None: - bindings[fvar1] = result - result = fvar1 - if fvar2 is not None and fvar2 != fvar1: - bindings[fvar2] = result - result = fvar2 - - # If we unification failed, call the failure function; it - # might decide to continue anyway. - if result is UnificationFailure: - if fail is not None: - result = fail(fval1, fval2, fpath) - if trace: - _trace_unify_fail(fpath[:-1], result) - if result is UnificationFailure: - raise _UnificationFailureError - - # Normalize the result. - if isinstance(result, fs_class): - result = _apply_forwards(result, forward, fs_class, set()) - - if trace: - _trace_unify_succeed(fpath, result) - if trace and isinstance(result, fs_class): - _trace_bindings(fpath, bindings) - - return result - - -def _apply_forwards_to_bindings(forward, bindings): - """ - Replace any feature structure that has a forward pointer with - the target of its forward pointer (to preserve reentrancy). - """ - for (var, value) in bindings.items(): - while id(value) in forward: - value = forward[id(value)] - bindings[var] = value - - -def _apply_forwards(fstruct, forward, fs_class, visited): - """ - Replace any feature structure that has a forward pointer with - the target of its forward pointer (to preserve reentrancy). 
- """ - # Follow our own forwards pointers (if any) - while id(fstruct) in forward: - fstruct = forward[id(fstruct)] - - # Visit each node only once: - if id(fstruct) in visited: - return - visited.add(id(fstruct)) - - if _is_mapping(fstruct): - items = fstruct.items() - elif _is_sequence(fstruct): - items = enumerate(fstruct) - else: - raise ValueError("Expected mapping or sequence") - for fname, fval in items: - if isinstance(fval, fs_class): - # Replace w/ forwarded value. - while id(fval) in forward: - fval = forward[id(fval)] - fstruct[fname] = fval - # Recurse to child. - _apply_forwards(fval, forward, fs_class, visited) - - return fstruct - - -def _resolve_aliases(bindings): - """ - Replace any bound aliased vars with their binding; and replace - any unbound aliased vars with their representative var. - """ - for (var, value) in bindings.items(): - while isinstance(value, Variable) and value in bindings: - value = bindings[var] = bindings[value] - - -def _trace_unify_start(path, fval1, fval2): - if path == (): - print("\nUnification trace:") - else: - fullname = ".".join("%s" % n for n in path) - print(" " + "| " * (len(path) - 1) + "|") - print(" " + "| " * (len(path) - 1) + "| Unify feature: %s" % fullname) - print(" " + "| " * len(path) + " / " + _trace_valrepr(fval1)) - print(" " + "| " * len(path) + "|\\ " + _trace_valrepr(fval2)) - - -def _trace_unify_identity(path, fval1): - print(" " + "| " * len(path) + "|") - print(" " + "| " * len(path) + "| (identical objects)") - print(" " + "| " * len(path) + "|") - print(" " + "| " * len(path) + "+-->" + repr(fval1)) - - -def _trace_unify_fail(path, result): - if result is UnificationFailure: - resume = "" - else: - resume = " (nonfatal)" - print(" " + "| " * len(path) + "| |") - print(" " + "X " * len(path) + "X X <-- FAIL" + resume) - - -def _trace_unify_succeed(path, fval1): - # Print the result. - print(" " + "| " * len(path) + "|") - print(" " + "| " * len(path) + "+-->" + repr(fval1)) - - -def _trace_bindings(path, bindings): - # Print the bindings (if any). - if len(bindings) > 0: - binditems = sorted(bindings.items(), key=lambda v: v[0].name) - bindstr = "{%s}" % ", ".join( - f"{var}: {_trace_valrepr(val)}" for (var, val) in binditems - ) - print(" " + "| " * len(path) + " Bindings: " + bindstr) - - -def _trace_valrepr(val): - if isinstance(val, Variable): - return "%s" % val - else: - return "%s" % repr(val) - - -def subsumes(fstruct1, fstruct2): - """ - Return True if ``fstruct1`` subsumes ``fstruct2``. I.e., return - true if unifying ``fstruct1`` with ``fstruct2`` would result in a - feature structure equal to ``fstruct2.`` - - :rtype: bool - """ - return fstruct2 == unify(fstruct1, fstruct2) - - -def conflicts(fstruct1, fstruct2, trace=0): - """ - Return a list of the feature paths of all features which are - assigned incompatible values by ``fstruct1`` and ``fstruct2``. 
- - :rtype: list(tuple) - """ - conflict_list = [] - - def add_conflict(fval1, fval2, path): - conflict_list.append(path) - return fval1 - - unify(fstruct1, fstruct2, fail=add_conflict, trace=trace) - return conflict_list - - -###################################################################### -# Helper Functions -###################################################################### - - -def _is_mapping(v): - return hasattr(v, "__contains__") and hasattr(v, "keys") - - -def _is_sequence(v): - return hasattr(v, "__iter__") and hasattr(v, "__len__") and not isinstance(v, str) - - -def _default_fs_class(obj): - if isinstance(obj, FeatStruct): - return FeatStruct - if isinstance(obj, (dict, list)): - return (dict, list) - else: - raise ValueError( - "To unify objects of type %s, you must specify " - "fs_class explicitly." % obj.__class__.__name__ - ) - - -###################################################################### -# FeatureValueSet & FeatureValueTuple -###################################################################### - - -class SubstituteBindingsSequence(SubstituteBindingsI): - """ - A mixin class for sequence classes that distributes variables() and - substitute_bindings() over the object's elements. - """ - - def variables(self): - return [elt for elt in self if isinstance(elt, Variable)] + sum( - ( - list(elt.variables()) - for elt in self - if isinstance(elt, SubstituteBindingsI) - ), - [], - ) - - def substitute_bindings(self, bindings): - return self.__class__([self.subst(v, bindings) for v in self]) - - def subst(self, v, bindings): - if isinstance(v, SubstituteBindingsI): - return v.substitute_bindings(bindings) - else: - return bindings.get(v, v) - - -class FeatureValueTuple(SubstituteBindingsSequence, tuple): - """ - A base feature value that is a tuple of other base feature values. - FeatureValueTuple implements ``SubstituteBindingsI``, so it any - variable substitutions will be propagated to the elements - contained by the set. A ``FeatureValueTuple`` is immutable. - """ - - def __repr__(self): # [xx] really use %s here? - if len(self) == 0: - return "()" - return "(%s)" % ", ".join(f"{b}" for b in self) - - -class FeatureValueSet(SubstituteBindingsSequence, frozenset): - """ - A base feature value that is a set of other base feature values. - FeatureValueSet implements ``SubstituteBindingsI``, so it any - variable substitutions will be propagated to the elements - contained by the set. A ``FeatureValueSet`` is immutable. - """ - - def __repr__(self): # [xx] really use %s here? - if len(self) == 0: - return "{/}" # distinguish from dict. - # n.b., we sort the string reprs of our elements, to ensure - # that our own repr is deterministic. - return "{%s}" % ", ".join(sorted(f"{b}" for b in self)) - - __str__ = __repr__ - - -class FeatureValueUnion(SubstituteBindingsSequence, frozenset): - """ - A base feature value that represents the union of two or more - ``FeatureValueSet`` or ``Variable``. - """ - - def __new__(cls, values): - # If values contains FeatureValueUnions, then collapse them. - values = _flatten(values, FeatureValueUnion) - - # If the resulting list contains no variables, then - # use a simple FeatureValueSet instead. - if sum(isinstance(v, Variable) for v in values) == 0: - values = _flatten(values, FeatureValueSet) - return FeatureValueSet(values) - - # If we contain a single variable, return that variable. - if len(values) == 1: - return list(values)[0] - - # Otherwise, build the FeatureValueUnion. 
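The `subsumes()` and `conflicts()` helpers above are thin wrappers around `unify()`; a short illustrative check with toy structures, again using the upstream `nltk` package:

```python
from nltk.featstruct import FeatStruct, subsumes, conflicts

general  = FeatStruct("[agr=[number=sing]]")
specific = FeatStruct("[agr=[number=sing, person=3]]")

print(subsumes(general, specific))   # True: unifying them just reproduces `specific`
print(subsumes(specific, general))   # False

# conflicts() reports the feature paths that carry incompatible values.
clash = FeatStruct("[agr=[number=pl]]")
print(conflicts(general, clash))     # e.g. [('agr', 'number')]
```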
- return frozenset.__new__(cls, values) - - def __repr__(self): - # n.b., we sort the string reprs of our elements, to ensure - # that our own repr is deterministic. also, note that len(self) - # is guaranteed to be 2 or more. - return "{%s}" % "+".join(sorted(f"{b}" for b in self)) - - -class FeatureValueConcat(SubstituteBindingsSequence, tuple): - """ - A base feature value that represents the concatenation of two or - more ``FeatureValueTuple`` or ``Variable``. - """ - - def __new__(cls, values): - # If values contains FeatureValueConcats, then collapse them. - values = _flatten(values, FeatureValueConcat) - - # If the resulting list contains no variables, then - # use a simple FeatureValueTuple instead. - if sum(isinstance(v, Variable) for v in values) == 0: - values = _flatten(values, FeatureValueTuple) - return FeatureValueTuple(values) - - # If we contain a single variable, return that variable. - if len(values) == 1: - return list(values)[0] - - # Otherwise, build the FeatureValueConcat. - return tuple.__new__(cls, values) - - def __repr__(self): - # n.b.: len(self) is guaranteed to be 2 or more. - return "(%s)" % "+".join(f"{b}" for b in self) - - -def _flatten(lst, cls): - """ - Helper function -- return a copy of list, with all elements of - type ``cls`` spliced in rather than appended in. - """ - result = [] - for elt in lst: - if isinstance(elt, cls): - result.extend(elt) - else: - result.append(elt) - return result - - -###################################################################### -# Specialized Features -###################################################################### - - -@total_ordering -class Feature: - """ - A feature identifier that's specialized to put additional - constraints, default values, etc. - """ - - def __init__(self, name, default=None, display=None): - assert display in (None, "prefix", "slash") - - self._name = name # [xx] rename to .identifier? - self._default = default # [xx] not implemented yet. - self._display = display - - if self._display == "prefix": - self._sortkey = (-1, self._name) - elif self._display == "slash": - self._sortkey = (1, self._name) - else: - self._sortkey = (0, self._name) - - @property - def name(self): - """The name of this feature.""" - return self._name - - @property - def default(self): - """Default value for this feature.""" - return self._default - - @property - def display(self): - """Custom display location: can be prefix, or slash.""" - return self._display - - def __repr__(self): - return "*%s*" % self.name - - def __lt__(self, other): - if isinstance(other, str): - return True - if not isinstance(other, Feature): - raise_unorderable_types("<", self, other) - return self._sortkey < other._sortkey - - def __eq__(self, other): - return type(self) == type(other) and self._name == other._name - - def __ne__(self, other): - return not self == other - - def __hash__(self): - return hash(self._name) - - # //////////////////////////////////////////////////////////// - # These can be overridden by subclasses: - # //////////////////////////////////////////////////////////// - - def read_value(self, s, position, reentrances, parser): - return parser.read_value(s, position, reentrances) - - def unify_base_values(self, fval1, fval2, bindings): - """ - If possible, return a single value.. If not, return - the value ``UnificationFailure``. 
- """ - if fval1 == fval2: - return fval1 - else: - return UnificationFailure - - -class SlashFeature(Feature): - def read_value(self, s, position, reentrances, parser): - return parser.read_partial(s, position, reentrances) - - -class RangeFeature(Feature): - RANGE_RE = re.compile(r"(-?\d+):(-?\d+)") - - def read_value(self, s, position, reentrances, parser): - m = self.RANGE_RE.match(s, position) - if not m: - raise ValueError("range", position) - return (int(m.group(1)), int(m.group(2))), m.end() - - def unify_base_values(self, fval1, fval2, bindings): - if fval1 is None: - return fval2 - if fval2 is None: - return fval1 - rng = max(fval1[0], fval2[0]), min(fval1[1], fval2[1]) - if rng[1] < rng[0]: - return UnificationFailure - return rng - - -SLASH = SlashFeature("slash", default=False, display="slash") -TYPE = Feature("type", display="prefix") - - -###################################################################### -# Specialized Feature Values -###################################################################### - - -@total_ordering -class CustomFeatureValue: - """ - An abstract base class for base values that define a custom - unification method. The custom unification method of - ``CustomFeatureValue`` will be used during unification if: - - - The ``CustomFeatureValue`` is unified with another base value. - - The ``CustomFeatureValue`` is not the value of a customized - ``Feature`` (which defines its own unification method). - - If two ``CustomFeatureValue`` objects are unified with one another - during feature structure unification, then the unified base values - they return *must* be equal; otherwise, an ``AssertionError`` will - be raised. - - Subclasses must define ``unify()``, ``__eq__()`` and ``__lt__()``. - Subclasses may also wish to define ``__hash__()``. - """ - - def unify(self, other): - """ - If this base value unifies with ``other``, then return the - unified value. Otherwise, return ``UnificationFailure``. - """ - raise NotImplementedError("abstract base class") - - def __eq__(self, other): - return NotImplemented - - def __ne__(self, other): - return not self == other - - def __lt__(self, other): - return NotImplemented - - def __hash__(self): - raise TypeError("%s objects or unhashable" % self.__class__.__name__) - - -###################################################################### -# Feature Structure Reader -###################################################################### - - -class FeatStructReader: - def __init__( - self, - features=(SLASH, TYPE), - fdict_class=FeatStruct, - flist_class=FeatList, - logic_parser=None, - ): - self._features = {f.name: f for f in features} - self._fdict_class = fdict_class - self._flist_class = flist_class - self._prefix_feature = None - self._slash_feature = None - for feature in features: - if feature.display == "slash": - if self._slash_feature: - raise ValueError("Multiple features w/ display=slash") - self._slash_feature = feature - if feature.display == "prefix": - if self._prefix_feature: - raise ValueError("Multiple features w/ display=prefix") - self._prefix_feature = feature - self._features_with_defaults = [ - feature for feature in features if feature.default is not None - ] - if logic_parser is None: - logic_parser = LogicParser() - self._logic_parser = logic_parser - - def fromstring(self, s, fstruct=None): - """ - Convert a string representation of a feature structure (as - displayed by repr) into a ``FeatStruct``. 
This process - imposes the following restrictions on the string - representation: - - - Feature names cannot contain any of the following: - whitespace, parentheses, quote marks, equals signs, - dashes, commas, and square brackets. Feature names may - not begin with plus signs or minus signs. - - Only the following basic feature value are supported: - strings, integers, variables, None, and unquoted - alphanumeric strings. - - For reentrant values, the first mention must specify - a reentrance identifier and a value; and any subsequent - mentions must use arrows (``'->'``) to reference the - reentrance identifier. - """ - s = s.strip() - value, position = self.read_partial(s, 0, {}, fstruct) - if position != len(s): - self._error(s, "end of string", position) - return value - - _START_FSTRUCT_RE = re.compile(r"\s*(?:\((\d+)\)\s*)?(\??[\w-]+)?(\[)") - _END_FSTRUCT_RE = re.compile(r"\s*]\s*") - _SLASH_RE = re.compile(r"/") - _FEATURE_NAME_RE = re.compile(r'\s*([+-]?)([^\s\(\)<>"\'\-=\[\],]+)\s*') - _REENTRANCE_RE = re.compile(r"\s*->\s*") - _TARGET_RE = re.compile(r"\s*\((\d+)\)\s*") - _ASSIGN_RE = re.compile(r"\s*=\s*") - _COMMA_RE = re.compile(r"\s*,\s*") - _BARE_PREFIX_RE = re.compile(r"\s*(?:\((\d+)\)\s*)?(\??[\w-]+\s*)()") - # This one is used to distinguish fdicts from flists: - _START_FDICT_RE = re.compile( - r"(%s)|(%s\s*(%s\s*(=|->)|[+-]%s|\]))" - % ( - _BARE_PREFIX_RE.pattern, - _START_FSTRUCT_RE.pattern, - _FEATURE_NAME_RE.pattern, - _FEATURE_NAME_RE.pattern, - ) - ) - - def read_partial(self, s, position=0, reentrances=None, fstruct=None): - """ - Helper function that reads in a feature structure. - - :param s: The string to read. - :param position: The position in the string to start parsing. - :param reentrances: A dictionary from reentrance ids to values. - Defaults to an empty dictionary. - :return: A tuple (val, pos) of the feature structure created by - parsing and the position where the parsed feature structure ends. - :rtype: bool - """ - if reentrances is None: - reentrances = {} - try: - return self._read_partial(s, position, reentrances, fstruct) - except ValueError as e: - if len(e.args) != 2: - raise - self._error(s, *e.args) - - def _read_partial(self, s, position, reentrances, fstruct=None): - # Create the new feature structure - if fstruct is None: - if self._START_FDICT_RE.match(s, position): - fstruct = self._fdict_class() - else: - fstruct = self._flist_class() - - # Read up to the open bracket. - match = self._START_FSTRUCT_RE.match(s, position) - if not match: - match = self._BARE_PREFIX_RE.match(s, position) - if not match: - raise ValueError("open bracket or identifier", position) - position = match.end() - - # If there as an identifier, record it. - if match.group(1): - identifier = match.group(1) - if identifier in reentrances: - raise ValueError("new identifier", match.start(1)) - reentrances[identifier] = fstruct - - if isinstance(fstruct, FeatDict): - fstruct.clear() - return self._read_partial_featdict(s, position, match, reentrances, fstruct) - else: - del fstruct[:] - return self._read_partial_featlist(s, position, match, reentrances, fstruct) - - def _read_partial_featlist(self, s, position, match, reentrances, fstruct): - # Prefix features are not allowed: - if match.group(2): - raise ValueError("open bracket") - # Bare prefixes are not allowed: - if not match.group(3): - raise ValueError("open bracket") - - # Build a list of the features defined by the structure. - while position < len(s): - # Check for the close bracket. 
- match = self._END_FSTRUCT_RE.match(s, position) - if match is not None: - return fstruct, match.end() - - # Reentances have the form "-> (target)" - match = self._REENTRANCE_RE.match(s, position) - if match: - position = match.end() - match = self._TARGET_RE.match(s, position) - if not match: - raise ValueError("identifier", position) - target = match.group(1) - if target not in reentrances: - raise ValueError("bound identifier", position) - position = match.end() - fstruct.append(reentrances[target]) - - # Anything else is a value. - else: - value, position = self._read_value(0, s, position, reentrances) - fstruct.append(value) - - # If there's a close bracket, handle it at the top of the loop. - if self._END_FSTRUCT_RE.match(s, position): - continue - - # Otherwise, there should be a comma - match = self._COMMA_RE.match(s, position) - if match is None: - raise ValueError("comma", position) - position = match.end() - - # We never saw a close bracket. - raise ValueError("close bracket", position) - - def _read_partial_featdict(self, s, position, match, reentrances, fstruct): - # If there was a prefix feature, record it. - if match.group(2): - if self._prefix_feature is None: - raise ValueError("open bracket or identifier", match.start(2)) - prefixval = match.group(2).strip() - if prefixval.startswith("?"): - prefixval = Variable(prefixval) - fstruct[self._prefix_feature] = prefixval - - # If group 3 is empty, then we just have a bare prefix, so - # we're done. - if not match.group(3): - return self._finalize(s, match.end(), reentrances, fstruct) - - # Build a list of the features defined by the structure. - # Each feature has one of the three following forms: - # name = value - # name -> (target) - # +name - # -name - while position < len(s): - # Use these variables to hold info about each feature: - name = value = None - - # Check for the close bracket. - match = self._END_FSTRUCT_RE.match(s, position) - if match is not None: - return self._finalize(s, match.end(), reentrances, fstruct) - - # Get the feature name's name - match = self._FEATURE_NAME_RE.match(s, position) - if match is None: - raise ValueError("feature name", position) - name = match.group(2) - position = match.end() - - # Check if it's a special feature. - if name[0] == "*" and name[-1] == "*": - name = self._features.get(name[1:-1]) - if name is None: - raise ValueError("known special feature", match.start(2)) - - # Check if this feature has a value already. - if name in fstruct: - raise ValueError("new name", match.start(2)) - - # Boolean value ("+name" or "-name") - if match.group(1) == "+": - value = True - if match.group(1) == "-": - value = False - - # Reentrance link ("-> (target)") - if value is None: - match = self._REENTRANCE_RE.match(s, position) - if match is not None: - position = match.end() - match = self._TARGET_RE.match(s, position) - if not match: - raise ValueError("identifier", position) - target = match.group(1) - if target not in reentrances: - raise ValueError("bound identifier", position) - position = match.end() - value = reentrances[target] - - # Assignment ("= value"). - if value is None: - match = self._ASSIGN_RE.match(s, position) - if match: - position = match.end() - value, position = self._read_value(name, s, position, reentrances) - # None of the above: error. - else: - raise ValueError("equals sign", position) - - # Store the value. - fstruct[name] = value - - # If there's a close bracket, handle it at the top of the loop. 
- if self._END_FSTRUCT_RE.match(s, position): - continue - - # Otherwise, there should be a comma - match = self._COMMA_RE.match(s, position) - if match is None: - raise ValueError("comma", position) - position = match.end() - - # We never saw a close bracket. - raise ValueError("close bracket", position) - - def _finalize(self, s, pos, reentrances, fstruct): - """ - Called when we see the close brace -- checks for a slash feature, - and adds in default values. - """ - # Add the slash feature (if any) - match = self._SLASH_RE.match(s, pos) - if match: - name = self._slash_feature - v, pos = self._read_value(name, s, match.end(), reentrances) - fstruct[name] = v - ## Add any default features. -- handle in unficiation instead? - # for feature in self._features_with_defaults: - # fstruct.setdefault(feature, feature.default) - # Return the value. - return fstruct, pos - - def _read_value(self, name, s, position, reentrances): - if isinstance(name, Feature): - return name.read_value(s, position, reentrances, self) - else: - return self.read_value(s, position, reentrances) - - def read_value(self, s, position, reentrances): - for (handler, regexp) in self.VALUE_HANDLERS: - match = regexp.match(s, position) - if match: - handler_func = getattr(self, handler) - return handler_func(s, position, reentrances, match) - raise ValueError("value", position) - - def _error(self, s, expected, position): - lines = s.split("\n") - while position > len(lines[0]): - position -= len(lines.pop(0)) + 1 # +1 for the newline. - estr = ( - "Error parsing feature structure\n " - + lines[0] - + "\n " - + " " * position - + "^ " - + "Expected %s" % expected - ) - raise ValueError(estr) - - # //////////////////////////////////////////////////////////// - # { Value Readers - # //////////////////////////////////////////////////////////// - - #: A table indicating how feature values should be processed. Each - #: entry in the table is a pair (handler, regexp). The first entry - #: with a matching regexp will have its handler called. Handlers - #: should have the following signature:: - #: - #: def handler(s, position, reentrances, match): ... - #: - #: and should return a tuple (value, position), where position is - #: the string position where the value ended. (n.b.: order is - #: important here!) - VALUE_HANDLERS = [ - ("read_fstruct_value", _START_FSTRUCT_RE), - ("read_var_value", re.compile(r"\?[a-zA-Z_][a-zA-Z0-9_]*")), - ("read_str_value", re.compile("[uU]?[rR]?(['\"])")), - ("read_int_value", re.compile(r"-?\d+")), - ("read_sym_value", re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*")), - ( - "read_app_value", - re.compile(r"<(app)\((\?[a-z][a-z]*)\s*," r"\s*(\?[a-z][a-z]*)\)>"), - ), - # ('read_logic_value', re.compile(r'<([^>]*)>')), - # lazily match any character after '<' until we hit a '>' not preceded by '-' - ("read_logic_value", re.compile(r"<(.*?)(?")), - ("read_set_value", re.compile(r"{")), - ("read_tuple_value", re.compile(r"\(")), - ] - - def read_fstruct_value(self, s, position, reentrances, match): - return self.read_partial(s, position, reentrances) - - def read_str_value(self, s, position, reentrances, match): - return read_str(s, position) - - def read_int_value(self, s, position, reentrances, match): - return int(match.group()), match.end() - - # Note: the '?' is included in the variable name. 
- def read_var_value(self, s, position, reentrances, match): - return Variable(match.group()), match.end() - - _SYM_CONSTS = {"None": None, "True": True, "False": False} - - def read_sym_value(self, s, position, reentrances, match): - val, end = match.group(), match.end() - return self._SYM_CONSTS.get(val, val), end - - def read_app_value(self, s, position, reentrances, match): - """Mainly included for backwards compat.""" - return self._logic_parser.parse("%s(%s)" % match.group(2, 3)), match.end() - - def read_logic_value(self, s, position, reentrances, match): - try: - try: - expr = self._logic_parser.parse(match.group(1)) - except LogicalExpressionException as e: - raise ValueError from e - return expr, match.end() - except ValueError as e: - raise ValueError("logic expression", match.start(1)) from e - - def read_tuple_value(self, s, position, reentrances, match): - return self._read_seq_value( - s, position, reentrances, match, ")", FeatureValueTuple, FeatureValueConcat - ) - - def read_set_value(self, s, position, reentrances, match): - return self._read_seq_value( - s, position, reentrances, match, "}", FeatureValueSet, FeatureValueUnion - ) - - def _read_seq_value( - self, s, position, reentrances, match, close_paren, seq_class, plus_class - ): - """ - Helper function used by read_tuple_value and read_set_value. - """ - cp = re.escape(close_paren) - position = match.end() - # Special syntax of empty tuples: - m = re.compile(r"\s*/?\s*%s" % cp).match(s, position) - if m: - return seq_class(), m.end() - # Read values: - values = [] - seen_plus = False - while True: - # Close paren: return value. - m = re.compile(r"\s*%s" % cp).match(s, position) - if m: - if seen_plus: - return plus_class(values), m.end() - else: - return seq_class(values), m.end() - - # Read the next value. - val, position = self.read_value(s, position, reentrances) - values.append(val) - - # Comma or looking at close paren - m = re.compile(r"\s*(,|\+|(?=%s))\s*" % cp).match(s, position) - if not m: - raise ValueError("',' or '+' or '%s'" % cp, position) - if m.group(1) == "+": - seen_plus = True - position = m.end() - - -###################################################################### -# { Demo -###################################################################### - - -def display_unification(fs1, fs2, indent=" "): - # Print the two input feature structures, side by side. 
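`FeatStructReader` is the machinery behind `FeatStruct(...)` string parsing; a small sketch of the reentrance (`->`) and boolean (`+f` / `-f`) syntax handled above, with made-up feature names and the upstream `nltk` package:

```python
from nltk.featstruct import FeatStructReader

reader = FeatStructReader()

# '(1)[...]' introduces a reentrance id; '->(1)' makes another feature share that value.
fs = reader.fromstring("[cat=NP, agr=(1)[number=sing], head=[agr->(1)]]")
print(fs["agr"] is fs["head"]["agr"])        # True: both paths point at one shared object

# '+name' / '-name' parse as boolean True / False.
print(reader.fromstring("[+plural, -definite]"))
```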
- fs1_lines = ("%s" % fs1).split("\n") - fs2_lines = ("%s" % fs2).split("\n") - if len(fs1_lines) > len(fs2_lines): - blankline = "[" + " " * (len(fs2_lines[0]) - 2) + "]" - fs2_lines += [blankline] * len(fs1_lines) - else: - blankline = "[" + " " * (len(fs1_lines[0]) - 2) + "]" - fs1_lines += [blankline] * len(fs2_lines) - for (fs1_line, fs2_line) in zip(fs1_lines, fs2_lines): - print(indent + fs1_line + " " + fs2_line) - print(indent + "-" * len(fs1_lines[0]) + " " + "-" * len(fs2_lines[0])) - - linelen = len(fs1_lines[0]) * 2 + 3 - print(indent + "| |".center(linelen)) - print(indent + "+-----UNIFY-----+".center(linelen)) - print(indent + "|".center(linelen)) - print(indent + "V".center(linelen)) - - bindings = {} - - result = fs1.unify(fs2, bindings) - if result is None: - print(indent + "(FAILED)".center(linelen)) - else: - print( - "\n".join(indent + l.center(linelen) for l in ("%s" % result).split("\n")) - ) - if bindings and len(bindings.bound_variables()) > 0: - print(repr(bindings).center(linelen)) - return result - - -def interactive_demo(trace=False): - import random - import sys - - HELP = """ - 1-%d: Select the corresponding feature structure - q: Quit - t: Turn tracing on or off - l: List all feature structures - ?: Help - """ - - print( - """ - This demo will repeatedly present you with a list of feature - structures, and ask you to choose two for unification. Whenever a - new feature structure is generated, it is added to the list of - choices that you can pick from. However, since this can be a - large number of feature structures, the demo will only print out a - random subset for you to choose between at a given time. If you - want to see the complete lists, type "l". For a list of valid - commands, type "?". - """ - ) - print('Press "Enter" to continue...') - sys.stdin.readline() - - fstruct_strings = [ - "[agr=[number=sing, gender=masc]]", - "[agr=[gender=masc, person=3]]", - "[agr=[gender=fem, person=3]]", - "[subj=[agr=(1)[]], agr->(1)]", - "[obj=?x]", - "[subj=?x]", - "[/=None]", - "[/=NP]", - "[cat=NP]", - "[cat=VP]", - "[cat=PP]", - "[subj=[agr=[gender=?y]], obj=[agr=[gender=?y]]]", - "[gender=masc, agr=?C]", - "[gender=?S, agr=[gender=?S,person=3]]", - ] - - all_fstructs = [ - (i, FeatStruct(fstruct_strings[i])) for i in range(len(fstruct_strings)) - ] - - def list_fstructs(fstructs): - for i, fstruct in fstructs: - print() - lines = ("%s" % fstruct).split("\n") - print("%3d: %s" % (i + 1, lines[0])) - for line in lines[1:]: - print(" " + line) - print() - - while True: - # Pick 5 feature structures at random from the master list. 
- MAX_CHOICES = 5 - if len(all_fstructs) > MAX_CHOICES: - fstructs = sorted(random.sample(all_fstructs, MAX_CHOICES)) - else: - fstructs = all_fstructs - - print("_" * 75) - - print("Choose two feature structures to unify:") - list_fstructs(fstructs) - - selected = [None, None] - for (nth, i) in (("First", 0), ("Second", 1)): - while selected[i] is None: - print( - ( - "%s feature structure (1-%d,q,t,l,?): " - % (nth, len(all_fstructs)) - ), - end=" ", - ) - try: - input = sys.stdin.readline().strip() - if input in ("q", "Q", "x", "X"): - return - if input in ("t", "T"): - trace = not trace - print(" Trace = %s" % trace) - continue - if input in ("h", "H", "?"): - print(HELP % len(fstructs)) - continue - if input in ("l", "L"): - list_fstructs(all_fstructs) - continue - num = int(input) - 1 - selected[i] = all_fstructs[num][1] - print() - except: - print("Bad sentence number") - continue - - if trace: - result = selected[0].unify(selected[1], trace=1) - else: - result = display_unification(selected[0], selected[1]) - if result is not None: - for i, fstruct in all_fstructs: - if repr(result) == repr(fstruct): - break - else: - all_fstructs.append((len(all_fstructs), result)) - - print('\nType "Enter" to continue unifying; or "q" to quit.') - input = sys.stdin.readline().strip() - if input in ("q", "Q", "x", "X"): - return - - -def demo(trace=False): - """ - Just for testing - """ - # import random - - # processor breaks with values like '3rd' - fstruct_strings = [ - "[agr=[number=sing, gender=masc]]", - "[agr=[gender=masc, person=3]]", - "[agr=[gender=fem, person=3]]", - "[subj=[agr=(1)[]], agr->(1)]", - "[obj=?x]", - "[subj=?x]", - "[/=None]", - "[/=NP]", - "[cat=NP]", - "[cat=VP]", - "[cat=PP]", - "[subj=[agr=[gender=?y]], obj=[agr=[gender=?y]]]", - "[gender=masc, agr=?C]", - "[gender=?S, agr=[gender=?S,person=3]]", - ] - all_fstructs = [FeatStruct(fss) for fss in fstruct_strings] - # MAX_CHOICES = 5 - # if len(all_fstructs) > MAX_CHOICES: - # fstructs = random.sample(all_fstructs, MAX_CHOICES) - # fstructs.sort() - # else: - # fstructs = all_fstructs - - for fs1 in all_fstructs: - for fs2 in all_fstructs: - print( - "\n*******************\nfs1 is:\n%s\n\nfs2 is:\n%s\n\nresult is:\n%s" - % (fs1, fs2, unify(fs1, fs2)) - ) - - -if __name__ == "__main__": - demo() - -__all__ = [ - "FeatStruct", - "FeatDict", - "FeatList", - "unify", - "subsumes", - "conflicts", - "Feature", - "SlashFeature", - "RangeFeature", - "SLASH", - "TYPE", - "FeatStructReader", -] diff --git a/pipeline/nltk/grammar.py b/pipeline/nltk/grammar.py deleted file mode 100644 index c0f1fe736a4a84e0982780e514108a6812f6876b..0000000000000000000000000000000000000000 --- a/pipeline/nltk/grammar.py +++ /dev/null @@ -1,1708 +0,0 @@ -# Natural Language Toolkit: Context Free Grammars -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# Edward Loper -# Jason Narad -# Peter Ljunglöf -# Tom Aarsen <> -# URL: -# For license information, see LICENSE.TXT -# - -""" -Basic data classes for representing context free grammars. A -"grammar" specifies which trees can represent the structure of a -given text. Each of these trees is called a "parse tree" for the -text (or simply a "parse"). In a "context free" grammar, the set of -parse trees for any piece of a text can depend only on that piece, and -not on the rest of the text (i.e., the piece's context). Context free -grammars are often used to find possible syntactic structures for -sentences. 
In this context, the leaves of a parse tree are word -tokens; and the node values are phrasal categories, such as ``NP`` -and ``VP``. - -The ``CFG`` class is used to encode context free grammars. Each -``CFG`` consists of a start symbol and a set of productions. -The "start symbol" specifies the root node value for parse trees. For example, -the start symbol for syntactic parsing is usually ``S``. Start -symbols are encoded using the ``Nonterminal`` class, which is discussed -below. - -A Grammar's "productions" specify what parent-child relationships a parse -tree can contain. Each production specifies that a particular -node can be the parent of a particular set of children. For example, -the production `` -> `` specifies that an ``S`` node can -be the parent of an ``NP`` node and a ``VP`` node. - -Grammar productions are implemented by the ``Production`` class. -Each ``Production`` consists of a left hand side and a right hand -side. The "left hand side" is a ``Nonterminal`` that specifies the -node type for a potential parent; and the "right hand side" is a list -that specifies allowable children for that parent. This lists -consists of ``Nonterminals`` and text types: each ``Nonterminal`` -indicates that the corresponding child may be a ``TreeToken`` with the -specified node type; and each text type indicates that the -corresponding child may be a ``Token`` with the with that type. - -The ``Nonterminal`` class is used to distinguish node values from leaf -values. This prevents the grammar from accidentally using a leaf -value (such as the English word "A") as the node of a subtree. Within -a ``CFG``, all node values are wrapped in the ``Nonterminal`` -class. Note, however, that the trees that are specified by the grammar do -*not* include these ``Nonterminal`` wrappers. - -Grammars can also be given a more procedural interpretation. According to -this interpretation, a Grammar specifies any tree structure *tree* that -can be produced by the following procedure: - -| Set tree to the start symbol -| Repeat until tree contains no more nonterminal leaves: -| Choose a production prod with whose left hand side -| lhs is a nonterminal leaf of tree. -| Replace the nonterminal leaf with a subtree, whose node -| value is the value wrapped by the nonterminal lhs, and -| whose children are the right hand side of prod. - -The operation of replacing the left hand side (*lhs*) of a production -with the right hand side (*rhs*) in a tree (*tree*) is known as -"expanding" *lhs* to *rhs* in *tree*. -""" -import re -from functools import total_ordering - -from nltk.featstruct import SLASH, TYPE, FeatDict, FeatStruct, FeatStructReader -from nltk.internals import raise_unorderable_types -from nltk.probability import ImmutableProbabilisticMixIn -from nltk.util import invert_graph, transitive_closure - -################################################################# -# Nonterminal -################################################################# - - -@total_ordering -class Nonterminal: - """ - A non-terminal symbol for a context free grammar. ``Nonterminal`` - is a wrapper class for node values; it is used by ``Production`` - objects to distinguish node values from leaf values. - The node value that is wrapped by a ``Nonterminal`` is known as its - "symbol". Symbols are typically strings representing phrasal - categories (such as ``"NP"`` or ``"VP"``). However, more complex - symbol types are sometimes used (e.g., for lexicalized grammars). 
- Since symbols are node values, they must be immutable and - hashable. Two ``Nonterminals`` are considered equal if their - symbols are equal. - - :see: ``CFG``, ``Production`` - :type _symbol: any - :ivar _symbol: The node value corresponding to this - ``Nonterminal``. This value must be immutable and hashable. - """ - - def __init__(self, symbol): - """ - Construct a new non-terminal from the given symbol. - - :type symbol: any - :param symbol: The node value corresponding to this - ``Nonterminal``. This value must be immutable and - hashable. - """ - self._symbol = symbol - - def symbol(self): - """ - Return the node value corresponding to this ``Nonterminal``. - - :rtype: (any) - """ - return self._symbol - - def __eq__(self, other): - """ - Return True if this non-terminal is equal to ``other``. In - particular, return True if ``other`` is a ``Nonterminal`` - and this non-terminal's symbol is equal to ``other`` 's symbol. - - :rtype: bool - """ - return type(self) == type(other) and self._symbol == other._symbol - - def __ne__(self, other): - return not self == other - - def __lt__(self, other): - if not isinstance(other, Nonterminal): - raise_unorderable_types("<", self, other) - return self._symbol < other._symbol - - def __hash__(self): - return hash(self._symbol) - - def __repr__(self): - """ - Return a string representation for this ``Nonterminal``. - - :rtype: str - """ - if isinstance(self._symbol, str): - return "%s" % self._symbol - else: - return "%s" % repr(self._symbol) - - def __str__(self): - """ - Return a string representation for this ``Nonterminal``. - - :rtype: str - """ - if isinstance(self._symbol, str): - return "%s" % self._symbol - else: - return "%s" % repr(self._symbol) - - def __div__(self, rhs): - """ - Return a new nonterminal whose symbol is ``A/B``, where ``A`` is - the symbol for this nonterminal, and ``B`` is the symbol for rhs. - - :param rhs: The nonterminal used to form the right hand side - of the new nonterminal. - :type rhs: Nonterminal - :rtype: Nonterminal - """ - return Nonterminal(f"{self._symbol}/{rhs._symbol}") - - def __truediv__(self, rhs): - """ - Return a new nonterminal whose symbol is ``A/B``, where ``A`` is - the symbol for this nonterminal, and ``B`` is the symbol for rhs. - This function allows use of the slash ``/`` operator with - the future import of division. - - :param rhs: The nonterminal used to form the right hand side - of the new nonterminal. - :type rhs: Nonterminal - :rtype: Nonterminal - """ - return self.__div__(rhs) - - -def nonterminals(symbols): - """ - Given a string containing a list of symbol names, return a list of - ``Nonterminals`` constructed from those symbols. - - :param symbols: The symbol name string. This string can be - delimited by either spaces or commas. - :type symbols: str - :return: A list of ``Nonterminals`` constructed from the symbol - names given in ``symbols``. The ``Nonterminals`` are sorted - in the same order as the symbols names. - :rtype: list(Nonterminal) - """ - if "," in symbols: - symbol_list = symbols.split(",") - else: - symbol_list = symbols.split() - return [Nonterminal(s.strip()) for s in symbol_list] - - -class FeatStructNonterminal(FeatDict, Nonterminal): - """A feature structure that's also a nonterminal. 
It acts as its - own symbol, and automatically freezes itself when hashed.""" - - def __hash__(self): - self.freeze() - return FeatStruct.__hash__(self) - - def symbol(self): - return self - - -def is_nonterminal(item): - """ - :return: True if the item is a ``Nonterminal``. - :rtype: bool - """ - return isinstance(item, Nonterminal) - - -################################################################# -# Terminals -################################################################# - - -def is_terminal(item): - """ - Return True if the item is a terminal, which currently is - if it is hashable and not a ``Nonterminal``. - - :rtype: bool - """ - return hasattr(item, "__hash__") and not isinstance(item, Nonterminal) - - -################################################################# -# Productions -################################################################# - - -@total_ordering -class Production: - """ - A grammar production. Each production maps a single symbol - on the "left-hand side" to a sequence of symbols on the - "right-hand side". (In the case of context-free productions, - the left-hand side must be a ``Nonterminal``, and the right-hand - side is a sequence of terminals and ``Nonterminals``.) - "terminals" can be any immutable hashable object that is - not a ``Nonterminal``. Typically, terminals are strings - representing words, such as ``"dog"`` or ``"under"``. - - :see: ``CFG`` - :see: ``DependencyGrammar`` - :see: ``Nonterminal`` - :type _lhs: Nonterminal - :ivar _lhs: The left-hand side of the production. - :type _rhs: tuple(Nonterminal, terminal) - :ivar _rhs: The right-hand side of the production. - """ - - def __init__(self, lhs, rhs): - """ - Construct a new ``Production``. - - :param lhs: The left-hand side of the new ``Production``. - :type lhs: Nonterminal - :param rhs: The right-hand side of the new ``Production``. - :type rhs: sequence(Nonterminal and terminal) - """ - if isinstance(rhs, str): - raise TypeError( - "production right hand side should be a list, " "not a string" - ) - self._lhs = lhs - self._rhs = tuple(rhs) - - def lhs(self): - """ - Return the left-hand side of this ``Production``. - - :rtype: Nonterminal - """ - return self._lhs - - def rhs(self): - """ - Return the right-hand side of this ``Production``. - - :rtype: sequence(Nonterminal and terminal) - """ - return self._rhs - - def __len__(self): - """ - Return the length of the right-hand side. - - :rtype: int - """ - return len(self._rhs) - - def is_nonlexical(self): - """ - Return True if the right-hand side only contains ``Nonterminals`` - - :rtype: bool - """ - return all(is_nonterminal(n) for n in self._rhs) - - def is_lexical(self): - """ - Return True if the right-hand contain at least one terminal token. - - :rtype: bool - """ - return not self.is_nonlexical() - - def __str__(self): - """ - Return a verbose string representation of the ``Production``. - - :rtype: str - """ - result = "%s -> " % repr(self._lhs) - result += " ".join(repr(el) for el in self._rhs) - return result - - def __repr__(self): - """ - Return a concise string representation of the ``Production``. - - :rtype: str - """ - return "%s" % self - - def __eq__(self, other): - """ - Return True if this ``Production`` is equal to ``other``. 
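A minimal sketch of the `Nonterminal` / `Production` API described above (upstream `nltk.grammar`; the toy symbols are made up):

```python
from nltk.grammar import Nonterminal, Production, nonterminals

S, NP, VP = nonterminals("S, NP, VP")        # symbol names, comma- or space-delimited
prod = Production(S, [NP, VP])

print(prod)                                  # S -> NP VP
print(prod.lhs(), prod.rhs(), len(prod))     # S (NP, VP) 2
print(prod.is_nonlexical())                  # True: RHS contains only Nonterminals
print(Production(NP, ["dog"]).is_lexical())  # True: RHS contains a terminal token
```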
- - :rtype: bool - """ - return ( - type(self) == type(other) - and self._lhs == other._lhs - and self._rhs == other._rhs - ) - - def __ne__(self, other): - return not self == other - - def __lt__(self, other): - if not isinstance(other, Production): - raise_unorderable_types("<", self, other) - return (self._lhs, self._rhs) < (other._lhs, other._rhs) - - def __hash__(self): - """ - Return a hash value for the ``Production``. - - :rtype: int - """ - return hash((self._lhs, self._rhs)) - - -class DependencyProduction(Production): - """ - A dependency grammar production. Each production maps a single - head word to an unordered list of one or more modifier words. - """ - - def __str__(self): - """ - Return a verbose string representation of the ``DependencyProduction``. - - :rtype: str - """ - result = f"'{self._lhs}' ->" - for elt in self._rhs: - result += f" '{elt}'" - return result - - -class ProbabilisticProduction(Production, ImmutableProbabilisticMixIn): - """ - A probabilistic context free grammar production. - A PCFG ``ProbabilisticProduction`` is essentially just a ``Production`` that - has an associated probability, which represents how likely it is that - this production will be used. In particular, the probability of a - ``ProbabilisticProduction`` records the likelihood that its right-hand side is - the correct instantiation for any given occurrence of its left-hand side. - - :see: ``Production`` - """ - - def __init__(self, lhs, rhs, **prob): - """ - Construct a new ``ProbabilisticProduction``. - - :param lhs: The left-hand side of the new ``ProbabilisticProduction``. - :type lhs: Nonterminal - :param rhs: The right-hand side of the new ``ProbabilisticProduction``. - :type rhs: sequence(Nonterminal and terminal) - :param prob: Probability parameters of the new ``ProbabilisticProduction``. - """ - ImmutableProbabilisticMixIn.__init__(self, **prob) - Production.__init__(self, lhs, rhs) - - def __str__(self): - return super().__str__() + ( - " [1.0]" if (self.prob() == 1.0) else " [%g]" % self.prob() - ) - - def __eq__(self, other): - return ( - type(self) == type(other) - and self._lhs == other._lhs - and self._rhs == other._rhs - and self.prob() == other.prob() - ) - - def __ne__(self, other): - return not self == other - - def __hash__(self): - return hash((self._lhs, self._rhs, self.prob())) - - -################################################################# -# Grammars -################################################################# - - -class CFG: - """ - A context-free grammar. A grammar consists of a start state and - a set of productions. The set of terminals and nonterminals is - implicitly specified by the productions. - - If you need efficient key-based access to productions, you - can use a subclass to implement it. - """ - - def __init__(self, start, productions, calculate_leftcorners=True): - """ - Create a new context-free grammar, from the given start state - and set of ``Production`` instances. - - :param start: The start symbol - :type start: Nonterminal - :param productions: The list of productions that defines the grammar - :type productions: list(Production) - :param calculate_leftcorners: False if we don't want to calculate the - leftcorner relation. In that case, some optimized chart parsers won't work. 
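[Editor's note] A short sketch of building ``Production`` and ``ProbabilisticProduction`` objects by hand, again assuming the upstream ``nltk`` package::

    from nltk.grammar import Nonterminal, Production, ProbabilisticProduction

    S, NP, VP = Nonterminal("S"), Nonterminal("NP"), Nonterminal("VP")
    p1 = Production(S, [NP, VP])           # S -> NP VP
    p2 = Production(NP, ["John"])          # NP -> 'John'  (a lexical production)
    print(p1.is_nonlexical(), p2.is_lexical())    # True True
    pp = ProbabilisticProduction(S, [NP, VP], prob=1.0)
    print(pp)                              # S -> NP VP [1.0]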
- :type calculate_leftcorners: bool - """ - if not is_nonterminal(start): - raise TypeError( - "start should be a Nonterminal object," - " not a %s" % type(start).__name__ - ) - - self._start = start - self._productions = productions - self._categories = {prod.lhs() for prod in productions} - self._calculate_indexes() - self._calculate_grammar_forms() - if calculate_leftcorners: - self._calculate_leftcorners() - - def _calculate_indexes(self): - self._lhs_index = {} - self._rhs_index = {} - self._empty_index = {} - self._lexical_index = {} - for prod in self._productions: - # Left hand side. - lhs = prod._lhs - if lhs not in self._lhs_index: - self._lhs_index[lhs] = [] - self._lhs_index[lhs].append(prod) - if prod._rhs: - # First item in right hand side. - rhs0 = prod._rhs[0] - if rhs0 not in self._rhs_index: - self._rhs_index[rhs0] = [] - self._rhs_index[rhs0].append(prod) - else: - # The right hand side is empty. - self._empty_index[prod.lhs()] = prod - # Lexical tokens in the right hand side. - for token in prod._rhs: - if is_terminal(token): - self._lexical_index.setdefault(token, set()).add(prod) - - def _calculate_leftcorners(self): - # Calculate leftcorner relations, for use in optimized parsing. - self._immediate_leftcorner_categories = {cat: {cat} for cat in self._categories} - self._immediate_leftcorner_words = {cat: set() for cat in self._categories} - for prod in self.productions(): - if len(prod) > 0: - cat, left = prod.lhs(), prod.rhs()[0] - if is_nonterminal(left): - self._immediate_leftcorner_categories[cat].add(left) - else: - self._immediate_leftcorner_words[cat].add(left) - - lc = transitive_closure(self._immediate_leftcorner_categories, reflexive=True) - self._leftcorners = lc - self._leftcorner_parents = invert_graph(lc) - - nr_leftcorner_categories = sum( - map(len, self._immediate_leftcorner_categories.values()) - ) - nr_leftcorner_words = sum(map(len, self._immediate_leftcorner_words.values())) - if nr_leftcorner_words > nr_leftcorner_categories > 10000: - # If the grammar is big, the leftcorner-word dictionary will be too large. - # In that case it is better to calculate the relation on demand. - self._leftcorner_words = None - return - - self._leftcorner_words = {} - for cat in self._leftcorners: - lefts = self._leftcorners[cat] - lc = self._leftcorner_words[cat] = set() - for left in lefts: - lc.update(self._immediate_leftcorner_words.get(left, set())) - - @classmethod - def fromstring(cls, input, encoding=None): - """ - Return the grammar instance corresponding to the input string(s). - - :param input: a grammar, either in the form of a string or as a list of strings. - """ - start, productions = read_grammar( - input, standard_nonterm_parser, encoding=encoding - ) - return cls(start, productions) - - def start(self): - """ - Return the start symbol of the grammar - - :rtype: Nonterminal - """ - return self._start - - # tricky to balance readability and efficiency here! - # can't use set operations as they don't preserve ordering - def productions(self, lhs=None, rhs=None, empty=False): - """ - Return the grammar productions, filtered by the left-hand side - or the first item in the right-hand side. - - :param lhs: Only return productions with the given left-hand side. - :param rhs: Only return productions with the given first item - in the right-hand side. - :param empty: Only return productions with an empty right-hand side. - :return: A list of productions matching the given constraints. 
- :rtype: list(Production) - """ - if rhs and empty: - raise ValueError( - "You cannot select empty and non-empty " "productions at the same time." - ) - - # no constraints so return everything - if not lhs and not rhs: - if not empty: - return self._productions - else: - return self._empty_index.values() - - # only lhs specified so look up its index - elif lhs and not rhs: - if not empty: - return self._lhs_index.get(lhs, []) - elif lhs in self._empty_index: - return [self._empty_index[lhs]] - else: - return [] - - # only rhs specified so look up its index - elif rhs and not lhs: - return self._rhs_index.get(rhs, []) - - # intersect - else: - return [ - prod - for prod in self._lhs_index.get(lhs, []) - if prod in self._rhs_index.get(rhs, []) - ] - - def leftcorners(self, cat): - """ - Return the set of all nonterminals that the given nonterminal - can start with, including itself. - - This is the reflexive, transitive closure of the immediate - leftcorner relation: (A > B) iff (A -> B beta) - - :param cat: the parent of the leftcorners - :type cat: Nonterminal - :return: the set of all leftcorners - :rtype: set(Nonterminal) - """ - return self._leftcorners.get(cat, {cat}) - - def is_leftcorner(self, cat, left): - """ - True if left is a leftcorner of cat, where left can be a - terminal or a nonterminal. - - :param cat: the parent of the leftcorner - :type cat: Nonterminal - :param left: the suggested leftcorner - :type left: Terminal or Nonterminal - :rtype: bool - """ - if is_nonterminal(left): - return left in self.leftcorners(cat) - elif self._leftcorner_words: - return left in self._leftcorner_words.get(cat, set()) - else: - return any( - left in self._immediate_leftcorner_words.get(parent, set()) - for parent in self.leftcorners(cat) - ) - - def leftcorner_parents(self, cat): - """ - Return the set of all nonterminals for which the given category - is a left corner. This is the inverse of the leftcorner relation. - - :param cat: the suggested leftcorner - :type cat: Nonterminal - :return: the set of all parents to the leftcorner - :rtype: set(Nonterminal) - """ - return self._leftcorner_parents.get(cat, {cat}) - - def check_coverage(self, tokens): - """ - Check whether the grammar rules cover the given list of tokens. - If not, then raise an exception. - - :type tokens: list(str) - """ - missing = [tok for tok in tokens if not self._lexical_index.get(tok)] - if missing: - missing = ", ".join(f"{w!r}" for w in missing) - raise ValueError( - "Grammar does not cover some of the " "input words: %r." % missing - ) - - def _calculate_grammar_forms(self): - """ - Pre-calculate of which form(s) the grammar is. - """ - prods = self._productions - self._is_lexical = all(p.is_lexical() for p in prods) - self._is_nonlexical = all(p.is_nonlexical() for p in prods if len(p) != 1) - self._min_len = min(len(p) for p in prods) - self._max_len = max(len(p) for p in prods) - self._all_unary_are_lexical = all(p.is_lexical() for p in prods if len(p) == 1) - - def is_lexical(self): - """ - Return True if all productions are lexicalised. - """ - return self._is_lexical - - def is_nonlexical(self): - """ - Return True if all lexical rules are "preterminals", that is, - unary rules which can be separated in a preprocessing step. - - This means that all productions are of the forms - A -> B1 ... Bn (n>=0), or A -> "s". - - Note: is_lexical() and is_nonlexical() are not opposites. - There are grammars which are neither, and grammars which are both. 
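[Editor's note] The ``CFG`` indexing and leftcorner machinery above can be exercised on a toy grammar; a sketch assuming the upstream ``nltk`` package::

    from nltk import CFG
    from nltk.grammar import Nonterminal

    grammar = CFG.fromstring("""
        S -> NP VP
        NP -> Det N | 'John'
        VP -> V NP
        Det -> 'the'
        N -> 'dog'
        V -> 'saw'
    """)
    NP, S = Nonterminal("NP"), Nonterminal("S")
    print(grammar.productions(lhs=NP))        # only the NP rules
    print(grammar.leftcorners(S))             # the set {S, NP, Det}
    print(grammar.is_leftcorner(S, "the"))    # True: 'the' can start an S
    grammar.check_coverage(["John", "saw", "the", "dog"])   # silent if all tokens are covered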
- """ - return self._is_nonlexical - - def min_len(self): - """ - Return the right-hand side length of the shortest grammar production. - """ - return self._min_len - - def max_len(self): - """ - Return the right-hand side length of the longest grammar production. - """ - return self._max_len - - def is_nonempty(self): - """ - Return True if there are no empty productions. - """ - return self._min_len > 0 - - def is_binarised(self): - """ - Return True if all productions are at most binary. - Note that there can still be empty and unary productions. - """ - return self._max_len <= 2 - - def is_flexible_chomsky_normal_form(self): - """ - Return True if all productions are of the forms - A -> B C, A -> B, or A -> "s". - """ - return self.is_nonempty() and self.is_nonlexical() and self.is_binarised() - - def is_chomsky_normal_form(self): - """ - Return True if the grammar is of Chomsky Normal Form, i.e. all productions - are of the form A -> B C, or A -> "s". - """ - return self.is_flexible_chomsky_normal_form() and self._all_unary_are_lexical - - def chomsky_normal_form(self, new_token_padding="@$@", flexible=False): - """ - Returns a new Grammar that is in chomsky normal - - :param: new_token_padding - Customise new rule formation during binarisation - """ - if self.is_chomsky_normal_form(): - return self - if self.productions(empty=True): - raise ValueError( - "Grammar has Empty rules. " "Cannot deal with them at the moment" - ) - - # check for mixed rules - for rule in self.productions(): - if rule.is_lexical() and len(rule.rhs()) > 1: - raise ValueError( - f"Cannot handled mixed rule {rule.lhs()} => {rule.rhs()}" - ) - - step1 = CFG.eliminate_start(self) - step2 = CFG.binarize(step1, new_token_padding) - if flexible: - return step2 - step3 = CFG.remove_unitary_rules(step2) - step4 = CFG(step3.start(), list(set(step3.productions()))) - return step4 - - @classmethod - def remove_unitary_rules(cls, grammar): - """ - Remove nonlexical unitary rules and convert them to - lexical - """ - result = [] - unitary = [] - for rule in grammar.productions(): - if len(rule) == 1 and rule.is_nonlexical(): - unitary.append(rule) - else: - result.append(rule) - - while unitary: - rule = unitary.pop(0) - for item in grammar.productions(lhs=rule.rhs()[0]): - new_rule = Production(rule.lhs(), item.rhs()) - if len(new_rule) != 1 or new_rule.is_lexical(): - result.append(new_rule) - else: - unitary.append(new_rule) - - n_grammar = CFG(grammar.start(), result) - return n_grammar - - @classmethod - def binarize(cls, grammar, padding="@$@"): - """ - Convert all non-binary rules into binary by introducing - new tokens. 
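[Editor's note] A sketch of the Chomsky-normal-form conversion described above, on a grammar with one ternary rule (assuming the upstream ``nltk`` package)::

    from nltk import CFG

    g = CFG.fromstring("""
        S -> NP VP PP
        NP -> 'dogs'
        VP -> 'sleep'
        PP -> 'outside'
    """)
    print(g.is_chomsky_normal_form())    # False: S has a ternary right-hand side
    cnf = g.chomsky_normal_form()        # binarises S -> NP VP PP via a new S@$@NP symbol
    print(cnf.is_chomsky_normal_form())  # True
    for prod in cnf.productions():
        print(prod)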
- Example:: - - Original: - A => B C D - After Conversion: - A => B A@$@B - A@$@B => C D - """ - result = [] - - for rule in grammar.productions(): - if len(rule.rhs()) > 2: - # this rule needs to be broken down - left_side = rule.lhs() - for k in range(0, len(rule.rhs()) - 2): - tsym = rule.rhs()[k] - new_sym = Nonterminal(left_side.symbol() + padding + tsym.symbol()) - new_production = Production(left_side, (tsym, new_sym)) - left_side = new_sym - result.append(new_production) - last_prd = Production(left_side, rule.rhs()[-2:]) - result.append(last_prd) - else: - result.append(rule) - - n_grammar = CFG(grammar.start(), result) - return n_grammar - - @classmethod - def eliminate_start(cls, grammar): - """ - Eliminate start rule in case it appears on RHS - Example: S -> S0 S1 and S0 -> S1 S - Then another rule S0_Sigma -> S is added - """ - start = grammar.start() - result = [] - need_to_add = None - for rule in grammar.productions(): - if start in rule.rhs(): - need_to_add = True - result.append(rule) - if need_to_add: - start = Nonterminal("S0_SIGMA") - result.append(Production(start, [grammar.start()])) - n_grammar = CFG(start, result) - return n_grammar - return grammar - - def __repr__(self): - return "" % len(self._productions) - - def __str__(self): - result = "Grammar with %d productions" % len(self._productions) - result += " (start state = %r)" % self._start - for production in self._productions: - result += "\n %s" % production - return result - - -class FeatureGrammar(CFG): - """ - A feature-based grammar. This is equivalent to a - ``CFG`` whose nonterminals are all - ``FeatStructNonterminal``. - - A grammar consists of a start state and a set of - productions. The set of terminals and nonterminals - is implicitly specified by the productions. - """ - - def __init__(self, start, productions): - """ - Create a new feature-based grammar, from the given start - state and set of ``Productions``. - - :param start: The start symbol - :type start: FeatStructNonterminal - :param productions: The list of productions that defines the grammar - :type productions: list(Production) - """ - CFG.__init__(self, start, productions) - - # The difference with CFG is that the productions are - # indexed on the TYPE feature of the nonterminals. - # This is calculated by the method _get_type_if_possible(). - - def _calculate_indexes(self): - self._lhs_index = {} - self._rhs_index = {} - self._empty_index = {} - self._empty_productions = [] - self._lexical_index = {} - for prod in self._productions: - # Left hand side. - lhs = self._get_type_if_possible(prod._lhs) - if lhs not in self._lhs_index: - self._lhs_index[lhs] = [] - self._lhs_index[lhs].append(prod) - if prod._rhs: - # First item in right hand side. - rhs0 = self._get_type_if_possible(prod._rhs[0]) - if rhs0 not in self._rhs_index: - self._rhs_index[rhs0] = [] - self._rhs_index[rhs0].append(prod) - else: - # The right hand side is empty. - if lhs not in self._empty_index: - self._empty_index[lhs] = [] - self._empty_index[lhs].append(prod) - self._empty_productions.append(prod) - # Lexical tokens in the right hand side. - for token in prod._rhs: - if is_terminal(token): - self._lexical_index.setdefault(token, set()).add(prod) - - @classmethod - def fromstring( - cls, input, features=None, logic_parser=None, fstruct_reader=None, encoding=None - ): - """ - Return a feature structure based grammar. - - :param input: a grammar, either in the form of a string or else - as a list of strings. 
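[Editor's note] The ``binarize`` and ``eliminate_start`` class methods can also be called directly; a sketch on a grammar whose start symbol recurs on a right-hand side (assuming the upstream ``nltk`` package)::

    from nltk import CFG

    g = CFG.fromstring("""
        S -> A S | 'x'
        A -> 'a'
    """)
    g2 = CFG.eliminate_start(g)   # S occurs on a RHS, so a fresh S0_SIGMA -> S rule is added
    print(g2.start())             # S0_SIGMA
    g3 = CFG.binarize(g2)         # no rule is longer than two symbols, so nothing changes
    print(len(g3.productions()) == len(g2.productions()))   # True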
- :param features: a tuple of features (default: SLASH, TYPE) - :param logic_parser: a parser for lambda-expressions, - by default, ``LogicParser()`` - :param fstruct_reader: a feature structure parser - (only if features and logic_parser is None) - """ - if features is None: - features = (SLASH, TYPE) - - if fstruct_reader is None: - fstruct_reader = FeatStructReader( - features, FeatStructNonterminal, logic_parser=logic_parser - ) - elif logic_parser is not None: - raise Exception( - "'logic_parser' and 'fstruct_reader' must " "not both be set" - ) - - start, productions = read_grammar( - input, fstruct_reader.read_partial, encoding=encoding - ) - return cls(start, productions) - - def productions(self, lhs=None, rhs=None, empty=False): - """ - Return the grammar productions, filtered by the left-hand side - or the first item in the right-hand side. - - :param lhs: Only return productions with the given left-hand side. - :param rhs: Only return productions with the given first item - in the right-hand side. - :param empty: Only return productions with an empty right-hand side. - :rtype: list(Production) - """ - if rhs and empty: - raise ValueError( - "You cannot select empty and non-empty " "productions at the same time." - ) - - # no constraints so return everything - if not lhs and not rhs: - if empty: - return self._empty_productions - else: - return self._productions - - # only lhs specified so look up its index - elif lhs and not rhs: - if empty: - return self._empty_index.get(self._get_type_if_possible(lhs), []) - else: - return self._lhs_index.get(self._get_type_if_possible(lhs), []) - - # only rhs specified so look up its index - elif rhs and not lhs: - return self._rhs_index.get(self._get_type_if_possible(rhs), []) - - # intersect - else: - return [ - prod - for prod in self._lhs_index.get(self._get_type_if_possible(lhs), []) - if prod in self._rhs_index.get(self._get_type_if_possible(rhs), []) - ] - - def leftcorners(self, cat): - """ - Return the set of all words that the given category can start with. - Also called the "first set" in compiler construction. - """ - raise NotImplementedError("Not implemented yet") - - def leftcorner_parents(self, cat): - """ - Return the set of all categories for which the given category - is a left corner. - """ - raise NotImplementedError("Not implemented yet") - - def _get_type_if_possible(self, item): - """ - Helper function which returns the ``TYPE`` feature of the ``item``, - if it exists, otherwise it returns the ``item`` itself - """ - if isinstance(item, dict) and TYPE in item: - return FeatureValueType(item[TYPE]) - else: - return item - - -@total_ordering -class FeatureValueType: - """ - A helper class for ``FeatureGrammars``, designed to be different - from ordinary strings. This is to stop the ``FeatStruct`` - ``FOO[]`` from being compare equal to the terminal "FOO". - """ - - def __init__(self, value): - self._value = value - - def __repr__(self): - return "<%s>" % self._value - - def __eq__(self, other): - return type(self) == type(other) and self._value == other._value - - def __ne__(self, other): - return not self == other - - def __lt__(self, other): - if not isinstance(other, FeatureValueType): - raise_unorderable_types("<", self, other) - return self._value < other._value - - def __hash__(self): - return hash(self._value) - - -class DependencyGrammar: - """ - A dependency grammar. A DependencyGrammar consists of a set of - productions. Each production specifies a head/modifier relationship - between a pair of words. 
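[Editor's note] A sketch of ``FeatureGrammar.fromstring`` with a tiny agreement grammar; the feature names here are illustrative, and the upstream ``nltk`` package is assumed::

    from nltk.grammar import FeatureGrammar

    fg = FeatureGrammar.fromstring("""
        % start S
        S -> NP[NUM=?n] VP[NUM=?n]
        NP[NUM=sg] -> 'John'
        NP[NUM=pl] -> 'children'
        VP[NUM=sg] -> 'walks'
        VP[NUM=pl] -> 'walk'
    """)
    print(fg.start())                        # S[]
    print(fg.productions(lhs=fg.start()))    # lookup is indexed on the TYPE feature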
- """ - - def __init__(self, productions): - """ - Create a new dependency grammar, from the set of ``Productions``. - - :param productions: The list of productions that defines the grammar - :type productions: list(Production) - """ - self._productions = productions - - @classmethod - def fromstring(cls, input): - productions = [] - for linenum, line in enumerate(input.split("\n")): - line = line.strip() - if line.startswith("#") or line == "": - continue - try: - productions += _read_dependency_production(line) - except ValueError as e: - raise ValueError(f"Unable to parse line {linenum}: {line}") from e - if len(productions) == 0: - raise ValueError("No productions found!") - return cls(productions) - - def contains(self, head, mod): - """ - :param head: A head word. - :type head: str - :param mod: A mod word, to test as a modifier of 'head'. - :type mod: str - - :return: true if this ``DependencyGrammar`` contains a - ``DependencyProduction`` mapping 'head' to 'mod'. - :rtype: bool - """ - for production in self._productions: - for possibleMod in production._rhs: - if production._lhs == head and possibleMod == mod: - return True - return False - - def __contains__(self, head_mod): - """ - Return True if this ``DependencyGrammar`` contains a - ``DependencyProduction`` mapping 'head' to 'mod'. - - :param head_mod: A tuple of a head word and a mod word, - to test as a modifier of 'head'. - :type head: Tuple[str, str] - :rtype: bool - """ - try: - head, mod = head_mod - except ValueError as e: - raise ValueError( - "Must use a tuple of strings, e.g. `('price', 'of') in grammar`" - ) from e - return self.contains(head, mod) - - # # should be rewritten, the set comp won't work in all comparisons - # def contains_exactly(self, head, modlist): - # for production in self._productions: - # if(len(production._rhs) == len(modlist)): - # if(production._lhs == head): - # set1 = Set(production._rhs) - # set2 = Set(modlist) - # if(set1 == set2): - # return True - # return False - - def __str__(self): - """ - Return a verbose string representation of the ``DependencyGrammar`` - - :rtype: str - """ - str = "Dependency grammar with %d productions" % len(self._productions) - for production in self._productions: - str += "\n %s" % production - return str - - def __repr__(self): - """ - Return a concise string representation of the ``DependencyGrammar`` - """ - return "Dependency grammar with %d productions" % len(self._productions) - - -class ProbabilisticDependencyGrammar: - """ """ - - def __init__(self, productions, events, tags): - self._productions = productions - self._events = events - self._tags = tags - - def contains(self, head, mod): - """ - Return True if this ``DependencyGrammar`` contains a - ``DependencyProduction`` mapping 'head' to 'mod'. - - :param head: A head word. - :type head: str - :param mod: A mod word, to test as a modifier of 'head'. 
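[Editor's note] A sketch of ``DependencyGrammar.fromstring`` and the membership tests defined above (assuming the upstream ``nltk`` package)::

    from nltk.grammar import DependencyGrammar

    dg = DependencyGrammar.fromstring("""
        'taught' -> 'play' | 'man'
        'man' -> 'the'
        'play' -> 'golf' | 'dog' | 'to'
    """)
    print(dg.contains('taught', 'man'))   # True
    print(('man', 'the') in dg)           # True, via __contains__
    print(dg)                             # verbose listing of all productions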
- :type mod: str - :rtype: bool - """ - for production in self._productions: - for possibleMod in production._rhs: - if production._lhs == head and possibleMod == mod: - return True - return False - - def __str__(self): - """ - Return a verbose string representation of the ``ProbabilisticDependencyGrammar`` - - :rtype: str - """ - str = "Statistical dependency grammar with %d productions" % len( - self._productions - ) - for production in self._productions: - str += "\n %s" % production - str += "\nEvents:" - for event in self._events: - str += "\n %d:%s" % (self._events[event], event) - str += "\nTags:" - for tag_word in self._tags: - str += f"\n {tag_word}:\t({self._tags[tag_word]})" - return str - - def __repr__(self): - """ - Return a concise string representation of the ``ProbabilisticDependencyGrammar`` - """ - return "Statistical Dependency grammar with %d productions" % len( - self._productions - ) - - -class PCFG(CFG): - """ - A probabilistic context-free grammar. A PCFG consists of a - start state and a set of productions with probabilities. The set of - terminals and nonterminals is implicitly specified by the productions. - - PCFG productions use the ``ProbabilisticProduction`` class. - ``PCFGs`` impose the constraint that the set of productions with - any given left-hand-side must have probabilities that sum to 1 - (allowing for a small margin of error). - - If you need efficient key-based access to productions, you can use - a subclass to implement it. - - :type EPSILON: float - :cvar EPSILON: The acceptable margin of error for checking that - productions with a given left-hand side have probabilities - that sum to 1. - """ - - EPSILON = 0.01 - - def __init__(self, start, productions, calculate_leftcorners=True): - """ - Create a new context-free grammar, from the given start state - and set of ``ProbabilisticProductions``. - - :param start: The start symbol - :type start: Nonterminal - :param productions: The list of productions that defines the grammar - :type productions: list(Production) - :raise ValueError: if the set of productions with any left-hand-side - do not have probabilities that sum to a value within - EPSILON of 1. - :param calculate_leftcorners: False if we don't want to calculate the - leftcorner relation. In that case, some optimized chart parsers won't work. - :type calculate_leftcorners: bool - """ - CFG.__init__(self, start, productions, calculate_leftcorners) - - # Make sure that the probabilities sum to one. - probs = {} - for production in productions: - probs[production.lhs()] = probs.get(production.lhs(), 0) + production.prob() - for (lhs, p) in probs.items(): - if not ((1 - PCFG.EPSILON) < p < (1 + PCFG.EPSILON)): - raise ValueError("Productions for %r do not sum to 1" % lhs) - - @classmethod - def fromstring(cls, input, encoding=None): - """ - Return a probabilistic context-free grammar corresponding to the - input string(s). - - :param input: a grammar, either in the form of a string or else - as a list of strings. - """ - start, productions = read_grammar( - input, standard_nonterm_parser, probabilistic=True, encoding=encoding - ) - return cls(start, productions) - - -################################################################# -# Inducing Grammars -################################################################# - -# Contributed by Nathan Bodenstab - - -def induce_pcfg(start, productions): - r""" - Induce a PCFG grammar from a list of productions. 
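[Editor's note] A sketch of the ``PCFG`` probability constraint described above (assuming the upstream ``nltk`` package)::

    from nltk import PCFG

    toy = PCFG.fromstring("""
        S -> NP VP [1.0]
        NP -> 'John' [0.6] | 'Mary' [0.4]
        VP -> 'walks' [0.7] | 'sleeps' [0.3]
    """)
    print(toy.productions()[1].prob())    # 0.6

    # Probabilities for each left-hand side must sum to 1 within PCFG.EPSILON,
    # otherwise the constructor raises ValueError:
    try:
        PCFG.fromstring("S -> 'a' [0.5] | 'b' [0.4]")
    except ValueError as err:
        print(err)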
- - The probability of a production A -> B C in a PCFG is: - - | count(A -> B C) - | P(B, C | A) = --------------- where \* is any right hand side - | count(A -> \*) - - :param start: The start symbol - :type start: Nonterminal - :param productions: The list of productions that defines the grammar - :type productions: list(Production) - """ - # Production count: the number of times a given production occurs - pcount = {} - - # LHS-count: counts the number of times a given lhs occurs - lcount = {} - - for prod in productions: - lcount[prod.lhs()] = lcount.get(prod.lhs(), 0) + 1 - pcount[prod] = pcount.get(prod, 0) + 1 - - prods = [ - ProbabilisticProduction(p.lhs(), p.rhs(), prob=pcount[p] / lcount[p.lhs()]) - for p in pcount - ] - return PCFG(start, prods) - - -################################################################# -# Helper functions for reading productions -################################################################# - - -def _read_cfg_production(input): - """ - Return a list of context-free ``Productions``. - """ - return _read_production(input, standard_nonterm_parser) - - -def _read_pcfg_production(input): - """ - Return a list of PCFG ``ProbabilisticProductions``. - """ - return _read_production(input, standard_nonterm_parser, probabilistic=True) - - -def _read_fcfg_production(input, fstruct_reader): - """ - Return a list of feature-based ``Productions``. - """ - return _read_production(input, fstruct_reader) - - -# Parsing generic grammars - -_ARROW_RE = re.compile(r"\s* -> \s*", re.VERBOSE) -_PROBABILITY_RE = re.compile(r"( \[ [\d\.]+ \] ) \s*", re.VERBOSE) -_TERMINAL_RE = re.compile(r'( "[^"]*" | \'[^\']*\' ) \s*', re.VERBOSE) -_DISJUNCTION_RE = re.compile(r"\| \s*", re.VERBOSE) - - -def _read_production(line, nonterm_parser, probabilistic=False): - """ - Parse a grammar rule, given as a string, and return - a list of productions. - """ - pos = 0 - - # Parse the left-hand side. - lhs, pos = nonterm_parser(line, pos) - - # Skip over the arrow. - m = _ARROW_RE.match(line, pos) - if not m: - raise ValueError("Expected an arrow") - pos = m.end() - - # Parse the right hand side. - probabilities = [0.0] - rhsides = [[]] - while pos < len(line): - # Probability. - m = _PROBABILITY_RE.match(line, pos) - if probabilistic and m: - pos = m.end() - probabilities[-1] = float(m.group(1)[1:-1]) - if probabilities[-1] > 1.0: - raise ValueError( - "Production probability %f, " - "should not be greater than 1.0" % (probabilities[-1],) - ) - - # String -- add terminal. - elif line[pos] in "'\"": - m = _TERMINAL_RE.match(line, pos) - if not m: - raise ValueError("Unterminated string") - rhsides[-1].append(m.group(1)[1:-1]) - pos = m.end() - - # Vertical bar -- start new rhside. - elif line[pos] == "|": - m = _DISJUNCTION_RE.match(line, pos) - probabilities.append(0.0) - rhsides.append([]) - pos = m.end() - - # Anything else -- nonterminal. - else: - nonterm, pos = nonterm_parser(line, pos) - rhsides[-1].append(nonterm) - - if probabilistic: - return [ - ProbabilisticProduction(lhs, rhs, prob=probability) - for (rhs, probability) in zip(rhsides, probabilities) - ] - else: - return [Production(lhs, rhs) for rhs in rhsides] - - -################################################################# -# Reading Phrase Structure Grammars -################################################################# - - -def read_grammar(input, nonterm_parser, probabilistic=False, encoding=None): - """ - Return a pair consisting of a starting category and a list of - ``Productions``. 
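[Editor's note] A small worked example of the relative-frequency estimate used by ``induce_pcfg``: with count(S -> NP VP) = 2 and count(S -> *) = 3, the induced probability is 2/3 (assuming the upstream ``nltk`` package)::

    from nltk import Nonterminal, Production, induce_pcfg

    S, NP, VP = Nonterminal("S"), Nonterminal("NP"), Nonterminal("VP")
    prods = [
        Production(S, [NP, VP]),
        Production(S, [NP, VP]),
        Production(S, [VP]),       # count(S -> *) = 3
        Production(VP, ["runs"]),
    ]
    for p in induce_pcfg(S, prods).productions():
        print(p)   # S -> NP VP [0.666667], S -> VP [0.333333], VP -> 'runs' [1.0]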
- - :param input: a grammar, either in the form of a string or else - as a list of strings. - :param nonterm_parser: a function for parsing nonterminals. - It should take a ``(string, position)`` as argument and - return a ``(nonterminal, position)`` as result. - :param probabilistic: are the grammar rules probabilistic? - :type probabilistic: bool - :param encoding: the encoding of the grammar, if it is a binary string - :type encoding: str - """ - if encoding is not None: - input = input.decode(encoding) - if isinstance(input, str): - lines = input.split("\n") - else: - lines = input - - start = None - productions = [] - continue_line = "" - for linenum, line in enumerate(lines): - line = continue_line + line.strip() - if line.startswith("#") or line == "": - continue - if line.endswith("\\"): - continue_line = line[:-1].rstrip() + " " - continue - continue_line = "" - try: - if line[0] == "%": - directive, args = line[1:].split(None, 1) - if directive == "start": - start, pos = nonterm_parser(args, 0) - if pos != len(args): - raise ValueError("Bad argument to start directive") - else: - raise ValueError("Bad directive") - else: - # expand out the disjunctions on the RHS - productions += _read_production(line, nonterm_parser, probabilistic) - except ValueError as e: - raise ValueError(f"Unable to parse line {linenum + 1}: {line}\n{e}") from e - - if not productions: - raise ValueError("No productions found!") - if not start: - start = productions[0].lhs() - return (start, productions) - - -_STANDARD_NONTERM_RE = re.compile(r"( [\w/][\w/^<>-]* ) \s*", re.VERBOSE) - - -def standard_nonterm_parser(string, pos): - m = _STANDARD_NONTERM_RE.match(string, pos) - if not m: - raise ValueError("Expected a nonterminal, found: " + string[pos:]) - return (Nonterminal(m.group(1)), m.end()) - - -################################################################# -# Reading Dependency Grammars -################################################################# - -_READ_DG_RE = re.compile( - r"""^\s* # leading whitespace - ('[^']+')\s* # single-quoted lhs - (?:[-=]+>)\s* # arrow - (?:( # rhs: - "[^"]+" # doubled-quoted terminal - | '[^']+' # single-quoted terminal - | \| # disjunction - ) - \s*) # trailing space - *$""", # zero or more copies - re.VERBOSE, -) -_SPLIT_DG_RE = re.compile(r"""('[^']'|[-=]+>|"[^"]+"|'[^']+'|\|)""") - - -def _read_dependency_production(s): - if not _READ_DG_RE.match(s): - raise ValueError("Bad production string") - pieces = _SPLIT_DG_RE.split(s) - pieces = [p for i, p in enumerate(pieces) if i % 2 == 1] - lhside = pieces[0].strip("'\"") - rhsides = [[]] - for piece in pieces[2:]: - if piece == "|": - rhsides.append([]) - else: - rhsides[-1].append(piece.strip("'\"")) - return [DependencyProduction(lhside, rhside) for rhside in rhsides] - - -################################################################# -# Demonstration -################################################################# - - -def cfg_demo(): - """ - A demonstration showing how ``CFGs`` can be created and used. 
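[Editor's note] A sketch of the grammar-string conventions handled by ``read_grammar`` above: ``%start`` directives, ``#`` comments, ``|`` disjunctions and backslash line continuation (assuming the upstream ``nltk`` package)::

    from nltk import CFG

    g = CFG.fromstring(r"""
        % start S
        # comments and blank lines are skipped
        S -> NP VP
        NP -> 'John' | 'Mary' | \
              'Bob'
        VP -> 'walks'
    """)
    print(g.start())              # S
    print(len(g.productions()))   # 5: the | alternatives are expanded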
- """ - - from nltk import CFG, Production, nonterminals - - # Create some nonterminals - S, NP, VP, PP = nonterminals("S, NP, VP, PP") - N, V, P, Det = nonterminals("N, V, P, Det") - VP_slash_NP = VP / NP - - print("Some nonterminals:", [S, NP, VP, PP, N, V, P, Det, VP / NP]) - print(" S.symbol() =>", repr(S.symbol())) - print() - - print(Production(S, [NP])) - - # Create some Grammar Productions - grammar = CFG.fromstring( - """ - S -> NP VP - PP -> P NP - NP -> Det N | NP PP - VP -> V NP | VP PP - Det -> 'a' | 'the' - N -> 'dog' | 'cat' - V -> 'chased' | 'sat' - P -> 'on' | 'in' - """ - ) - - print("A Grammar:", repr(grammar)) - print(" grammar.start() =>", repr(grammar.start())) - print(" grammar.productions() =>", end=" ") - # Use string.replace(...) is to line-wrap the output. - print(repr(grammar.productions()).replace(",", ",\n" + " " * 25)) - print() - - -def pcfg_demo(): - """ - A demonstration showing how a ``PCFG`` can be created and used. - """ - - from nltk import induce_pcfg, treetransforms - from nltk.corpus import treebank - from nltk.parse import pchart - - toy_pcfg1 = PCFG.fromstring( - """ - S -> NP VP [1.0] - NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15] - Det -> 'the' [0.8] | 'my' [0.2] - N -> 'man' [0.5] | 'telescope' [0.5] - VP -> VP PP [0.1] | V NP [0.7] | V [0.2] - V -> 'ate' [0.35] | 'saw' [0.65] - PP -> P NP [1.0] - P -> 'with' [0.61] | 'under' [0.39] - """ - ) - - toy_pcfg2 = PCFG.fromstring( - """ - S -> NP VP [1.0] - VP -> V NP [.59] - VP -> V [.40] - VP -> VP PP [.01] - NP -> Det N [.41] - NP -> Name [.28] - NP -> NP PP [.31] - PP -> P NP [1.0] - V -> 'saw' [.21] - V -> 'ate' [.51] - V -> 'ran' [.28] - N -> 'boy' [.11] - N -> 'cookie' [.12] - N -> 'table' [.13] - N -> 'telescope' [.14] - N -> 'hill' [.5] - Name -> 'Jack' [.52] - Name -> 'Bob' [.48] - P -> 'with' [.61] - P -> 'under' [.39] - Det -> 'the' [.41] - Det -> 'a' [.31] - Det -> 'my' [.28] - """ - ) - - pcfg_prods = toy_pcfg1.productions() - - pcfg_prod = pcfg_prods[2] - print("A PCFG production:", repr(pcfg_prod)) - print(" pcfg_prod.lhs() =>", repr(pcfg_prod.lhs())) - print(" pcfg_prod.rhs() =>", repr(pcfg_prod.rhs())) - print(" pcfg_prod.prob() =>", repr(pcfg_prod.prob())) - print() - - grammar = toy_pcfg2 - print("A PCFG grammar:", repr(grammar)) - print(" grammar.start() =>", repr(grammar.start())) - print(" grammar.productions() =>", end=" ") - # Use .replace(...) is to line-wrap the output. - print(repr(grammar.productions()).replace(",", ",\n" + " " * 26)) - print() - - # extract productions from three trees and induce the PCFG - print("Induce PCFG grammar from treebank data:") - - productions = [] - item = treebank._fileids[0] - for tree in treebank.parsed_sents(item)[:3]: - # perform optional tree transformations, e.g.: - tree.collapse_unary(collapsePOS=False) - tree.chomsky_normal_form(horzMarkov=2) - - productions += tree.productions() - - S = Nonterminal("S") - grammar = induce_pcfg(S, productions) - print(grammar) - print() - - print("Parse sentence using induced grammar:") - - parser = pchart.InsideChartParser(grammar) - parser.trace(3) - - # doesn't work as tokens are different: - # sent = treebank.tokenized('wsj_0001.mrg')[0] - - sent = treebank.parsed_sents(item)[0].leaves() - print(sent) - for parse in parser.parse(sent): - print(parse) - - -def fcfg_demo(): - import nltk.data - - g = nltk.data.load("grammars/book_grammars/feat0.fcfg") - print(g) - print() - - -def dg_demo(): - """ - A demonstration showing the creation and inspection of a - ``DependencyGrammar``. 
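[Editor's note] Beyond the bundled demos, a hand-written or induced PCFG can be handed to any of NLTK's probabilistic parsers. A sketch using ``ViterbiParser`` (my choice of parser, not the one in the demo above, which uses ``pchart.InsideChartParser``)::

    from nltk import PCFG
    from nltk.parse import ViterbiParser

    grammar = PCFG.fromstring("""
        S -> NP VP [1.0]
        NP -> 'John' [0.5] | 'Mary' [0.5]
        VP -> V NP [1.0]
        V -> 'sees' [1.0]
    """)
    for tree in ViterbiParser(grammar).parse(['John', 'sees', 'Mary']):
        print(tree)   # (S (NP John) (VP (V sees) (NP Mary))), probability 0.25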
- """ - grammar = DependencyGrammar.fromstring( - """ - 'scratch' -> 'cats' | 'walls' - 'walls' -> 'the' - 'cats' -> 'the' - """ - ) - print(grammar) - - -def sdg_demo(): - """ - A demonstration of how to read a string representation of - a CoNLL format dependency tree. - """ - from nltk.parse import DependencyGraph - - dg = DependencyGraph( - """ - 1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _ - 2 had heb V V trans|ovt|1of2of3|ev 0 ROOT _ _ - 3 met met Prep Prep voor 8 mod _ _ - 4 haar haar Pron Pron bez|3|ev|neut|attr 5 det _ _ - 5 moeder moeder N N soort|ev|neut 3 obj1 _ _ - 6 kunnen kan V V hulp|ott|1of2of3|mv 2 vc _ _ - 7 gaan ga V V hulp|inf 6 vc _ _ - 8 winkelen winkel V V intrans|inf 11 cnj _ _ - 9 , , Punc Punc komma 8 punct _ _ - 10 zwemmen zwem V V intrans|inf 11 cnj _ _ - 11 of of Conj Conj neven 7 vc _ _ - 12 terrassen terras N N soort|mv|neut 11 cnj _ _ - 13 . . Punc Punc punt 12 punct _ _ - """ - ) - tree = dg.tree() - print(tree.pprint()) - - -def demo(): - cfg_demo() - pcfg_demo() - fcfg_demo() - dg_demo() - sdg_demo() - - -if __name__ == "__main__": - demo() - -__all__ = [ - "Nonterminal", - "nonterminals", - "CFG", - "Production", - "PCFG", - "ProbabilisticProduction", - "DependencyGrammar", - "DependencyProduction", - "ProbabilisticDependencyGrammar", - "induce_pcfg", - "read_grammar", -] diff --git a/pipeline/nltk/help.py b/pipeline/nltk/help.py deleted file mode 100644 index e0b5f7b876cda304a7fdaeecfbab9278113058df..0000000000000000000000000000000000000000 --- a/pipeline/nltk/help.py +++ /dev/null @@ -1,64 +0,0 @@ -# Natural Language Toolkit (NLTK) Help -# -# Copyright (C) 2001-2023 NLTK Project -# Authors: Steven Bird -# URL: -# For license information, see LICENSE.TXT - -""" -Provide structured access to documentation. -""" - -import re -from textwrap import wrap - -from nltk.data import load - - -def brown_tagset(tagpattern=None): - _format_tagset("brown_tagset", tagpattern) - - -def claws5_tagset(tagpattern=None): - _format_tagset("claws5_tagset", tagpattern) - - -def upenn_tagset(tagpattern=None): - _format_tagset("upenn_tagset", tagpattern) - - -##################################################################### -# UTILITIES -##################################################################### - - -def _print_entries(tags, tagdict): - for tag in tags: - entry = tagdict[tag] - defn = [tag + ": " + entry[0]] - examples = wrap( - entry[1], width=75, initial_indent=" ", subsequent_indent=" " - ) - print("\n".join(defn + examples)) - - -def _format_tagset(tagset, tagpattern=None): - tagdict = load("help/tagsets/" + tagset + ".pickle") - if not tagpattern: - _print_entries(sorted(tagdict), tagdict) - elif tagpattern in tagdict: - _print_entries([tagpattern], tagdict) - else: - tagpattern = re.compile(tagpattern) - tags = [tag for tag in sorted(tagdict) if tagpattern.match(tag)] - if tags: - _print_entries(tags, tagdict) - else: - print("No matching tags found.") - - -if __name__ == "__main__": - brown_tagset(r"NN.*") - upenn_tagset(r".*\$") - claws5_tagset("UNDEFINED") - brown_tagset(r"NN") diff --git a/pipeline/nltk/inference/__init__.py b/pipeline/nltk/inference/__init__.py deleted file mode 100644 index 754b3d2d78286799b229700bb19bd21cb683b855..0000000000000000000000000000000000000000 --- a/pipeline/nltk/inference/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -# Natural Language Toolkit: Inference -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Dan Garrette -# Ewan Klein -# -# URL: -# For license information, see LICENSE.TXT - -""" -Classes and interfaces for 
theorem proving and model building. -""" - -from nltk.inference.api import ParallelProverBuilder, ParallelProverBuilderCommand -from nltk.inference.discourse import ( - CfgReadingCommand, - DiscourseTester, - DrtGlueReadingCommand, - ReadingCommand, -) -from nltk.inference.mace import Mace, MaceCommand -from nltk.inference.prover9 import Prover9, Prover9Command -from nltk.inference.resolution import ResolutionProver, ResolutionProverCommand -from nltk.inference.tableau import TableauProver, TableauProverCommand diff --git a/pipeline/nltk/inference/__pycache__/__init__.cpython-39.pyc b/pipeline/nltk/inference/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index 254a6b0fe2c6594cdad764723be51ff7a7b6c176..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/inference/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/inference/__pycache__/api.cpython-39.pyc b/pipeline/nltk/inference/__pycache__/api.cpython-39.pyc deleted file mode 100644 index 468c5339243bc820fadbbf41bad48f41415de413..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/inference/__pycache__/api.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/inference/__pycache__/discourse.cpython-39.pyc b/pipeline/nltk/inference/__pycache__/discourse.cpython-39.pyc deleted file mode 100644 index 7498e93ae55ceff545de3b643da20510c78d1db9..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/inference/__pycache__/discourse.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/inference/__pycache__/mace.cpython-39.pyc b/pipeline/nltk/inference/__pycache__/mace.cpython-39.pyc deleted file mode 100644 index ca2411c30c17169245cf25672d2904f2d0c87c2d..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/inference/__pycache__/mace.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/inference/__pycache__/nonmonotonic.cpython-39.pyc b/pipeline/nltk/inference/__pycache__/nonmonotonic.cpython-39.pyc deleted file mode 100644 index b23fb2983e3a5f51195cebab45b694fd213d0dc8..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/inference/__pycache__/nonmonotonic.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/inference/__pycache__/prover9.cpython-39.pyc b/pipeline/nltk/inference/__pycache__/prover9.cpython-39.pyc deleted file mode 100644 index 8c87bf8f82d2987d40cf5ef8244f737fb9e49265..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/inference/__pycache__/prover9.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/inference/__pycache__/resolution.cpython-39.pyc b/pipeline/nltk/inference/__pycache__/resolution.cpython-39.pyc deleted file mode 100644 index 49eb4e99a552d2af8ddee044c2260890487fd419..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/inference/__pycache__/resolution.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/inference/__pycache__/tableau.cpython-39.pyc b/pipeline/nltk/inference/__pycache__/tableau.cpython-39.pyc deleted file mode 100644 index 87908e2f8a373185ecc9dfc16ed370c0fdf45ce2..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/inference/__pycache__/tableau.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/inference/api.py b/pipeline/nltk/inference/api.py deleted file mode 100644 index 12f1c099941280c1a72f40f957330dc5497a1b27..0000000000000000000000000000000000000000 --- a/pipeline/nltk/inference/api.py +++ /dev/null @@ -1,614 +0,0 @@ -# Natural Language Toolkit: 
Classifier Interface -# -# Author: Ewan Klein -# Dan Garrette -# -# URL: -# For license information, see LICENSE.TXT - -""" -Interfaces and base classes for theorem provers and model builders. - -``Prover`` is a standard interface for a theorem prover which tries to prove a goal from a -list of assumptions. - -``ModelBuilder`` is a standard interface for a model builder. Given just a set of assumptions. -the model builder tries to build a model for the assumptions. Given a set of assumptions and a -goal *G*, the model builder tries to find a counter-model, in the sense of a model that will satisfy -the assumptions plus the negation of *G*. -""" - -import threading -import time -from abc import ABCMeta, abstractmethod - - -class Prover(metaclass=ABCMeta): - """ - Interface for trying to prove a goal from assumptions. Both the goal and - the assumptions are constrained to be formulas of ``logic.Expression``. - """ - - def prove(self, goal=None, assumptions=None, verbose=False): - """ - :return: Whether the proof was successful or not. - :rtype: bool - """ - return self._prove(goal, assumptions, verbose)[0] - - @abstractmethod - def _prove(self, goal=None, assumptions=None, verbose=False): - """ - :return: Whether the proof was successful or not, along with the proof - :rtype: tuple: (bool, str) - """ - - -class ModelBuilder(metaclass=ABCMeta): - """ - Interface for trying to build a model of set of formulas. - Open formulas are assumed to be universally quantified. - Both the goal and the assumptions are constrained to be formulas - of ``logic.Expression``. - """ - - def build_model(self, goal=None, assumptions=None, verbose=False): - """ - Perform the actual model building. - :return: Whether a model was generated - :rtype: bool - """ - return self._build_model(goal, assumptions, verbose)[0] - - @abstractmethod - def _build_model(self, goal=None, assumptions=None, verbose=False): - """ - Perform the actual model building. - :return: Whether a model was generated, and the model itself - :rtype: tuple(bool, sem.Valuation) - """ - - -class TheoremToolCommand(metaclass=ABCMeta): - """ - This class holds a goal and a list of assumptions to be used in proving - or model building. - """ - - @abstractmethod - def add_assumptions(self, new_assumptions): - """ - Add new assumptions to the assumption list. - - :param new_assumptions: new assumptions - :type new_assumptions: list(sem.Expression) - """ - - @abstractmethod - def retract_assumptions(self, retracted, debug=False): - """ - Retract assumptions from the assumption list. - - :param debug: If True, give warning when ``retracted`` is not present on - assumptions list. - :type debug: bool - :param retracted: assumptions to be retracted - :type retracted: list(sem.Expression) - """ - - @abstractmethod - def assumptions(self): - """ - List the current assumptions. - - :return: list of ``Expression`` - """ - - @abstractmethod - def goal(self): - """ - Return the goal - - :return: ``Expression`` - """ - - @abstractmethod - def print_assumptions(self): - """ - Print the list of the current assumptions. - """ - - -class ProverCommand(TheoremToolCommand): - """ - This class holds a ``Prover``, a goal, and a list of assumptions. When - prove() is called, the ``Prover`` is executed with the goal and assumptions. - """ - - @abstractmethod - def prove(self, verbose=False): - """ - Perform the actual proof. - """ - - @abstractmethod - def proof(self, simplify=True): - """ - Return the proof string - :param simplify: bool simplify the proof? 
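[Editor's note] A sketch of the ``Prover`` interface defined above, using the pure-Python ``TableauProver`` so that no external binaries are needed (assuming the upstream ``nltk`` package)::

    from nltk.sem import Expression
    from nltk.inference import TableauProver

    read_expr = Expression.fromstring
    goal = read_expr('mortal(socrates)')
    assumptions = [read_expr('man(socrates)'),
                   read_expr('all x.(man(x) -> mortal(x))')]
    print(TableauProver().prove(goal, assumptions))   # True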
- :return: str - """ - - @abstractmethod - def get_prover(self): - """ - Return the prover object - :return: ``Prover`` - """ - - -class ModelBuilderCommand(TheoremToolCommand): - """ - This class holds a ``ModelBuilder``, a goal, and a list of assumptions. - When build_model() is called, the ``ModelBuilder`` is executed with the goal - and assumptions. - """ - - @abstractmethod - def build_model(self, verbose=False): - """ - Perform the actual model building. - :return: A model if one is generated; None otherwise. - :rtype: sem.Valuation - """ - - @abstractmethod - def model(self, format=None): - """ - Return a string representation of the model - - :param simplify: bool simplify the proof? - :return: str - """ - - @abstractmethod - def get_model_builder(self): - """ - Return the model builder object - :return: ``ModelBuilder`` - """ - - -class BaseTheoremToolCommand(TheoremToolCommand): - """ - This class holds a goal and a list of assumptions to be used in proving - or model building. - """ - - def __init__(self, goal=None, assumptions=None): - """ - :param goal: Input expression to prove - :type goal: sem.Expression - :param assumptions: Input expressions to use as assumptions in - the proof. - :type assumptions: list(sem.Expression) - """ - self._goal = goal - - if not assumptions: - self._assumptions = [] - else: - self._assumptions = list(assumptions) - - self._result = None - """A holder for the result, to prevent unnecessary re-proving""" - - def add_assumptions(self, new_assumptions): - """ - Add new assumptions to the assumption list. - - :param new_assumptions: new assumptions - :type new_assumptions: list(sem.Expression) - """ - self._assumptions.extend(new_assumptions) - self._result = None - - def retract_assumptions(self, retracted, debug=False): - """ - Retract assumptions from the assumption list. - - :param debug: If True, give warning when ``retracted`` is not present on - assumptions list. - :type debug: bool - :param retracted: assumptions to be retracted - :type retracted: list(sem.Expression) - """ - retracted = set(retracted) - result_list = list(filter(lambda a: a not in retracted, self._assumptions)) - if debug and result_list == self._assumptions: - print(Warning("Assumptions list has not been changed:")) - self.print_assumptions() - - self._assumptions = result_list - - self._result = None - - def assumptions(self): - """ - List the current assumptions. - - :return: list of ``Expression`` - """ - return self._assumptions - - def goal(self): - """ - Return the goal - - :return: ``Expression`` - """ - return self._goal - - def print_assumptions(self): - """ - Print the list of the current assumptions. - """ - for a in self.assumptions(): - print(a) - - -class BaseProverCommand(BaseTheoremToolCommand, ProverCommand): - """ - This class holds a ``Prover``, a goal, and a list of assumptions. When - prove() is called, the ``Prover`` is executed with the goal and assumptions. - """ - - def __init__(self, prover, goal=None, assumptions=None): - """ - :param prover: The theorem tool to execute with the assumptions - :type prover: Prover - :see: ``BaseTheoremToolCommand`` - """ - self._prover = prover - """The theorem tool to execute with the assumptions""" - - BaseTheoremToolCommand.__init__(self, goal, assumptions) - - self._proof = None - - def prove(self, verbose=False): - """ - Perform the actual proof. Store the result to prevent unnecessary - re-proving. 
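[Editor's note] A sketch of the command pattern above, which caches the result and invalidates it when assumptions change; ``TableauProverCommand`` is the pure-Python concrete command, and the upstream ``nltk`` package is assumed::

    from nltk.sem import Expression
    from nltk.inference import TableauProverCommand

    read_expr = Expression.fromstring
    cmd = TableauProverCommand(read_expr('sleeps(john)'),
                               [read_expr('walks(john) | sleeps(john)'),
                                read_expr('-walks(john)')])
    print(cmd.prove())    # True; the result is cached on the command object
    print(cmd.proof())    # derivation trace recorded by the prover
    cmd.add_assumptions([read_expr('happy(john)')])   # clears the cached result
    print(cmd.prove())    # re-proves against the extended assumption list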
- """ - if self._result is None: - self._result, self._proof = self._prover._prove( - self.goal(), self.assumptions(), verbose - ) - return self._result - - def proof(self, simplify=True): - """ - Return the proof string - :param simplify: bool simplify the proof? - :return: str - """ - if self._result is None: - raise LookupError("You have to call prove() first to get a proof!") - else: - return self.decorate_proof(self._proof, simplify) - - def decorate_proof(self, proof_string, simplify=True): - """ - Modify and return the proof string - :param proof_string: str the proof to decorate - :param simplify: bool simplify the proof? - :return: str - """ - return proof_string - - def get_prover(self): - return self._prover - - -class BaseModelBuilderCommand(BaseTheoremToolCommand, ModelBuilderCommand): - """ - This class holds a ``ModelBuilder``, a goal, and a list of assumptions. When - build_model() is called, the ``ModelBuilder`` is executed with the goal and - assumptions. - """ - - def __init__(self, modelbuilder, goal=None, assumptions=None): - """ - :param modelbuilder: The theorem tool to execute with the assumptions - :type modelbuilder: ModelBuilder - :see: ``BaseTheoremToolCommand`` - """ - self._modelbuilder = modelbuilder - """The theorem tool to execute with the assumptions""" - - BaseTheoremToolCommand.__init__(self, goal, assumptions) - - self._model = None - - def build_model(self, verbose=False): - """ - Attempt to build a model. Store the result to prevent unnecessary - re-building. - """ - if self._result is None: - self._result, self._model = self._modelbuilder._build_model( - self.goal(), self.assumptions(), verbose - ) - return self._result - - def model(self, format=None): - """ - Return a string representation of the model - - :param simplify: bool simplify the proof? - :return: str - """ - if self._result is None: - raise LookupError("You have to call build_model() first to " "get a model!") - else: - return self._decorate_model(self._model, format) - - def _decorate_model(self, valuation_str, format=None): - """ - :param valuation_str: str with the model builder's output - :param format: str indicating the format for displaying - :return: str - """ - return valuation_str - - def get_model_builder(self): - return self._modelbuilder - - -class TheoremToolCommandDecorator(TheoremToolCommand): - """ - A base decorator for the ``ProverCommandDecorator`` and - ``ModelBuilderCommandDecorator`` classes from which decorators can extend. - """ - - def __init__(self, command): - """ - :param command: ``TheoremToolCommand`` to decorate - """ - self._command = command - - # The decorator has its own versions of 'result' different from the - # underlying command - self._result = None - - def assumptions(self): - return self._command.assumptions() - - def goal(self): - return self._command.goal() - - def add_assumptions(self, new_assumptions): - self._command.add_assumptions(new_assumptions) - self._result = None - - def retract_assumptions(self, retracted, debug=False): - self._command.retract_assumptions(retracted, debug) - self._result = None - - def print_assumptions(self): - self._command.print_assumptions() - - -class ProverCommandDecorator(TheoremToolCommandDecorator, ProverCommand): - """ - A base decorator for the ``ProverCommand`` class from which other - prover command decorators can extend. 
- """ - - def __init__(self, proverCommand): - """ - :param proverCommand: ``ProverCommand`` to decorate - """ - TheoremToolCommandDecorator.__init__(self, proverCommand) - - # The decorator has its own versions of 'result' and 'proof' - # because they may be different from the underlying command - self._proof = None - - def prove(self, verbose=False): - if self._result is None: - prover = self.get_prover() - self._result, self._proof = prover._prove( - self.goal(), self.assumptions(), verbose - ) - return self._result - - def proof(self, simplify=True): - """ - Return the proof string - :param simplify: bool simplify the proof? - :return: str - """ - if self._result is None: - raise LookupError("You have to call prove() first to get a proof!") - else: - return self.decorate_proof(self._proof, simplify) - - def decorate_proof(self, proof_string, simplify=True): - """ - Modify and return the proof string - :param proof_string: str the proof to decorate - :param simplify: bool simplify the proof? - :return: str - """ - return self._command.decorate_proof(proof_string, simplify) - - def get_prover(self): - return self._command.get_prover() - - -class ModelBuilderCommandDecorator(TheoremToolCommandDecorator, ModelBuilderCommand): - """ - A base decorator for the ``ModelBuilderCommand`` class from which other - prover command decorators can extend. - """ - - def __init__(self, modelBuilderCommand): - """ - :param modelBuilderCommand: ``ModelBuilderCommand`` to decorate - """ - TheoremToolCommandDecorator.__init__(self, modelBuilderCommand) - - # The decorator has its own versions of 'result' and 'valuation' - # because they may be different from the underlying command - self._model = None - - def build_model(self, verbose=False): - """ - Attempt to build a model. Store the result to prevent unnecessary - re-building. - """ - if self._result is None: - modelbuilder = self.get_model_builder() - self._result, self._model = modelbuilder._build_model( - self.goal(), self.assumptions(), verbose - ) - return self._result - - def model(self, format=None): - """ - Return a string representation of the model - - :param simplify: bool simplify the proof? - :return: str - """ - if self._result is None: - raise LookupError("You have to call build_model() first to " "get a model!") - else: - return self._decorate_model(self._model, format) - - def _decorate_model(self, valuation_str, format=None): - """ - Modify and return the proof string - :param valuation_str: str with the model builder's output - :param format: str indicating the format for displaying - :return: str - """ - return self._command._decorate_model(valuation_str, format) - - def get_model_builder(self): - return self._command.get_prover() - - -class ParallelProverBuilder(Prover, ModelBuilder): - """ - This class stores both a prover and a model builder and when either - prove() or build_model() is called, then both theorem tools are run in - parallel. Whichever finishes first, the prover or the model builder, is the - result that will be used. 
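[Editor's note] A sketch of ``ParallelProverBuilder`` usage. Treat it as hypothetical: it needs the external Prover9 and Mace4 binaries installed and on the path, and without them the call fails at runtime (the upstream ``nltk`` package is assumed)::

    from nltk.sem import Expression
    from nltk.inference.api import ParallelProverBuilder
    from nltk.inference.prover9 import Prover9
    from nltk.inference.mace import Mace

    read_expr = Expression.fromstring
    ppb = ParallelProverBuilder(Prover9(), Mace())
    # Whichever tool finishes first decides the answer: a proof means True,
    # a counter-model means False.
    print(ppb.prove(read_expr('mortal(socrates)'),
                    [read_expr('man(socrates)'),
                     read_expr('all x.(man(x) -> mortal(x))')]))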
- """ - - def __init__(self, prover, modelbuilder): - self._prover = prover - self._modelbuilder = modelbuilder - - def _prove(self, goal=None, assumptions=None, verbose=False): - return self._run(goal, assumptions, verbose), "" - - def _build_model(self, goal=None, assumptions=None, verbose=False): - return not self._run(goal, assumptions, verbose), "" - - def _run(self, goal, assumptions, verbose): - # Set up two thread, Prover and ModelBuilder to run in parallel - tp_thread = TheoremToolThread( - lambda: self._prover.prove(goal, assumptions, verbose), verbose, "TP" - ) - mb_thread = TheoremToolThread( - lambda: self._modelbuilder.build_model(goal, assumptions, verbose), - verbose, - "MB", - ) - - tp_thread.start() - mb_thread.start() - - while tp_thread.is_alive() and mb_thread.is_alive(): - # wait until either the prover or the model builder is done - pass - - if tp_thread.result is not None: - return tp_thread.result - elif mb_thread.result is not None: - return not mb_thread.result - else: - return None - - -class ParallelProverBuilderCommand(BaseProverCommand, BaseModelBuilderCommand): - """ - This command stores both a prover and a model builder and when either - prove() or build_model() is called, then both theorem tools are run in - parallel. Whichever finishes first, the prover or the model builder, is the - result that will be used. - - Because the theorem prover result is the opposite of the model builder - result, we will treat self._result as meaning "proof found/no model found". - """ - - def __init__(self, prover, modelbuilder, goal=None, assumptions=None): - BaseProverCommand.__init__(self, prover, goal, assumptions) - BaseModelBuilderCommand.__init__(self, modelbuilder, goal, assumptions) - - def prove(self, verbose=False): - return self._run(verbose) - - def build_model(self, verbose=False): - return not self._run(verbose) - - def _run(self, verbose): - # Set up two thread, Prover and ModelBuilder to run in parallel - tp_thread = TheoremToolThread( - lambda: BaseProverCommand.prove(self, verbose), verbose, "TP" - ) - mb_thread = TheoremToolThread( - lambda: BaseModelBuilderCommand.build_model(self, verbose), verbose, "MB" - ) - - tp_thread.start() - mb_thread.start() - - while tp_thread.is_alive() and mb_thread.is_alive(): - # wait until either the prover or the model builder is done - pass - - if tp_thread.result is not None: - self._result = tp_thread.result - elif mb_thread.result is not None: - self._result = not mb_thread.result - return self._result - - -class TheoremToolThread(threading.Thread): - def __init__(self, command, verbose, name=None): - threading.Thread.__init__(self) - self._command = command - self._result = None - self._verbose = verbose - self._name = name - - def run(self): - try: - self._result = self._command() - if self._verbose: - print( - "Thread %s finished with result %s at %s" - % (self._name, self._result, time.localtime(time.time())) - ) - except Exception as e: - print(e) - print("Thread %s completed abnormally" % (self._name)) - - @property - def result(self): - return self._result diff --git a/pipeline/nltk/inference/discourse.py b/pipeline/nltk/inference/discourse.py deleted file mode 100644 index 9630234dcf3837d9da2b4213fe26d22491899932..0000000000000000000000000000000000000000 --- a/pipeline/nltk/inference/discourse.py +++ /dev/null @@ -1,651 +0,0 @@ -# Natural Language Toolkit: Discourse Processing -# -# Author: Ewan Klein -# Dan Garrette -# -# URL: -# For license information, see LICENSE.TXT - -r""" -Module for incrementally 
developing simple discourses, and checking for semantic ambiguity, -consistency and informativeness. - -Many of the ideas are based on the CURT family of programs of Blackburn and Bos -(see http://homepages.inf.ed.ac.uk/jbos/comsem/book1.html). - -Consistency checking is carried out by using the ``mace`` module to call the Mace4 model builder. -Informativeness checking is carried out with a call to ``Prover.prove()`` from -the ``inference`` module. - -``DiscourseTester`` is a constructor for discourses. -The basic data structure is a list of sentences, stored as ``self._sentences``. Each sentence in the list -is assigned a "sentence ID" (``sid``) of the form ``s``\ *i*. For example:: - - s0: A boxer walks - s1: Every boxer chases a girl - -Each sentence can be ambiguous between a number of readings, each of which receives a -"reading ID" (``rid``) of the form ``s``\ *i* -``r``\ *j*. For example:: - - s0 readings: - - s0-r1: some x.(boxer(x) & walk(x)) - s0-r0: some x.(boxerdog(x) & walk(x)) - -A "thread" is a list of readings, represented as a list of ``rid``\ s. -Each thread receives a "thread ID" (``tid``) of the form ``d``\ *i*. -For example:: - - d0: ['s0-r0', 's1-r0'] - -The set of all threads for a discourse is the Cartesian product of all the readings of the sequences of sentences. -(This is not intended to scale beyond very short discourses!) The method ``readings(filter=True)`` will only show -those threads which are consistent (taking into account any background assumptions). -""" - -import os -from abc import ABCMeta, abstractmethod -from functools import reduce -from operator import add, and_ - -from nltk.data import show_cfg -from nltk.inference.mace import MaceCommand -from nltk.inference.prover9 import Prover9Command -from nltk.parse import load_parser -from nltk.parse.malt import MaltParser -from nltk.sem.drt import AnaphoraResolutionException, resolve_anaphora -from nltk.sem.glue import DrtGlue -from nltk.sem.logic import Expression -from nltk.tag import RegexpTagger - - -class ReadingCommand(metaclass=ABCMeta): - @abstractmethod - def parse_to_readings(self, sentence): - """ - :param sentence: the sentence to read - :type sentence: str - """ - - def process_thread(self, sentence_readings): - """ - This method should be used to handle dependencies between readings such - as resolving anaphora. - - :param sentence_readings: readings to process - :type sentence_readings: list(Expression) - :return: the list of readings after processing - :rtype: list(Expression) - """ - return sentence_readings - - @abstractmethod - def combine_readings(self, readings): - """ - :param readings: readings to combine - :type readings: list(Expression) - :return: one combined reading - :rtype: Expression - """ - - @abstractmethod - def to_fol(self, expression): - """ - Convert this expression into a First-Order Logic expression. 
- - :param expression: an expression - :type expression: Expression - :return: a FOL version of the input expression - :rtype: Expression - """ - - -class CfgReadingCommand(ReadingCommand): - def __init__(self, gramfile=None): - """ - :param gramfile: name of file where grammar can be loaded - :type gramfile: str - """ - self._gramfile = ( - gramfile if gramfile else "grammars/book_grammars/discourse.fcfg" - ) - self._parser = load_parser(self._gramfile) - - def parse_to_readings(self, sentence): - """:see: ReadingCommand.parse_to_readings()""" - from nltk.sem import root_semrep - - tokens = sentence.split() - trees = self._parser.parse(tokens) - return [root_semrep(tree) for tree in trees] - - def combine_readings(self, readings): - """:see: ReadingCommand.combine_readings()""" - return reduce(and_, readings) - - def to_fol(self, expression): - """:see: ReadingCommand.to_fol()""" - return expression - - -class DrtGlueReadingCommand(ReadingCommand): - def __init__(self, semtype_file=None, remove_duplicates=False, depparser=None): - """ - :param semtype_file: name of file where grammar can be loaded - :param remove_duplicates: should duplicates be removed? - :param depparser: the dependency parser - """ - if semtype_file is None: - semtype_file = os.path.join( - "grammars", "sample_grammars", "drt_glue.semtype" - ) - self._glue = DrtGlue( - semtype_file=semtype_file, - remove_duplicates=remove_duplicates, - depparser=depparser, - ) - - def parse_to_readings(self, sentence): - """:see: ReadingCommand.parse_to_readings()""" - return self._glue.parse_to_meaning(sentence) - - def process_thread(self, sentence_readings): - """:see: ReadingCommand.process_thread()""" - try: - return [self.combine_readings(sentence_readings)] - except AnaphoraResolutionException: - return [] - - def combine_readings(self, readings): - """:see: ReadingCommand.combine_readings()""" - thread_reading = reduce(add, readings) - return resolve_anaphora(thread_reading.simplify()) - - def to_fol(self, expression): - """:see: ReadingCommand.to_fol()""" - return expression.fol() - - -class DiscourseTester: - """ - Check properties of an ongoing discourse. - """ - - def __init__(self, input, reading_command=None, background=None): - """ - Initialize a ``DiscourseTester``. - - :param input: the discourse sentences - :type input: list of str - :param background: Formulas which express background assumptions - :type background: list(Expression) - """ - self._input = input - self._sentences = {"s%s" % i: sent for i, sent in enumerate(input)} - self._models = None - self._readings = {} - self._reading_command = ( - reading_command if reading_command else CfgReadingCommand() - ) - self._threads = {} - self._filtered_threads = {} - if background is not None: - from nltk.sem.logic import Expression - - for e in background: - assert isinstance(e, Expression) - self._background = background - else: - self._background = [] - - ############################### - # Sentences - ############################### - - def sentences(self): - """ - Display the list of sentences in the current discourse. - """ - for id in sorted(self._sentences): - print(f"{id}: {self._sentences[id]}") - - def add_sentence(self, sentence, informchk=False, consistchk=False): - """ - Add a sentence to the current discourse. - - Updates ``self._input`` and ``self._sentences``. - :param sentence: An input sentence - :type sentence: str - :param informchk: if ``True``, check that the result of adding the sentence is thread-informative. Updates ``self._readings``. 
- :param consistchk: if ``True``, check that the result of adding the sentence is thread-consistent. Updates ``self._readings``. - - """ - # check whether the new sentence is informative (i.e. not entailed by the previous discourse) - if informchk: - self.readings(verbose=False) - for tid in sorted(self._threads): - assumptions = [reading for (rid, reading) in self.expand_threads(tid)] - assumptions += self._background - for sent_reading in self._get_readings(sentence): - tp = Prover9Command(goal=sent_reading, assumptions=assumptions) - if tp.prove(): - print( - "Sentence '%s' under reading '%s':" - % (sentence, str(sent_reading)) - ) - print("Not informative relative to thread '%s'" % tid) - - self._input.append(sentence) - self._sentences = {"s%s" % i: sent for i, sent in enumerate(self._input)} - # check whether adding the new sentence to the discourse preserves consistency (i.e. a model can be found for the combined set of - # of assumptions - if consistchk: - self.readings(verbose=False) - self.models(show=False) - - def retract_sentence(self, sentence, verbose=True): - """ - Remove a sentence from the current discourse. - - Updates ``self._input``, ``self._sentences`` and ``self._readings``. - :param sentence: An input sentence - :type sentence: str - :param verbose: If ``True``, report on the updated list of sentences. - """ - try: - self._input.remove(sentence) - except ValueError: - print( - "Retraction failed. The sentence '%s' is not part of the current discourse:" - % sentence - ) - self.sentences() - return None - self._sentences = {"s%s" % i: sent for i, sent in enumerate(self._input)} - self.readings(verbose=False) - if verbose: - print("Current sentences are ") - self.sentences() - - def grammar(self): - """ - Print out the grammar in use for parsing input sentences - """ - show_cfg(self._reading_command._gramfile) - - ############################### - # Readings and Threads - ############################### - - def _get_readings(self, sentence): - """ - Build a list of semantic readings for a sentence. - - :rtype: list(Expression) - """ - return self._reading_command.parse_to_readings(sentence) - - def _construct_readings(self): - """ - Use ``self._sentences`` to construct a value for ``self._readings``. - """ - # re-initialize self._readings in case we have retracted a sentence - self._readings = {} - for sid in sorted(self._sentences): - sentence = self._sentences[sid] - readings = self._get_readings(sentence) - self._readings[sid] = { - f"{sid}-r{rid}": reading.simplify() - for rid, reading in enumerate(sorted(readings, key=str)) - } - - def _construct_threads(self): - """ - Use ``self._readings`` to construct a value for ``self._threads`` - and use the model builder to construct a value for ``self._filtered_threads`` - """ - thread_list = [[]] - for sid in sorted(self._readings): - thread_list = self.multiply(thread_list, sorted(self._readings[sid])) - self._threads = {"d%s" % tid: thread for tid, thread in enumerate(thread_list)} - # re-initialize the filtered threads - self._filtered_threads = {} - # keep the same ids, but only include threads which get models - consistency_checked = self._check_consistency(self._threads) - for (tid, thread) in self._threads.items(): - if (tid, True) in consistency_checked: - self._filtered_threads[tid] = thread - - def _show_readings(self, sentence=None): - """ - Print out the readings for the discourse (or a single sentence). 
- """ - if sentence is not None: - print("The sentence '%s' has these readings:" % sentence) - for r in [str(reading) for reading in (self._get_readings(sentence))]: - print(" %s" % r) - else: - for sid in sorted(self._readings): - print() - print("%s readings:" % sid) - print() #'-' * 30 - for rid in sorted(self._readings[sid]): - lf = self._readings[sid][rid] - print(f"{rid}: {lf.normalize()}") - - def _show_threads(self, filter=False, show_thread_readings=False): - """ - Print out the value of ``self._threads`` or ``self._filtered_hreads`` - """ - threads = self._filtered_threads if filter else self._threads - for tid in sorted(threads): - if show_thread_readings: - readings = [ - self._readings[rid.split("-")[0]][rid] for rid in self._threads[tid] - ] - try: - thread_reading = ( - ": %s" - % self._reading_command.combine_readings(readings).normalize() - ) - except Exception as e: - thread_reading = ": INVALID: %s" % e.__class__.__name__ - else: - thread_reading = "" - - print("%s:" % tid, self._threads[tid], thread_reading) - - def readings( - self, - sentence=None, - threaded=False, - verbose=True, - filter=False, - show_thread_readings=False, - ): - """ - Construct and show the readings of the discourse (or of a single sentence). - - :param sentence: test just this sentence - :type sentence: str - :param threaded: if ``True``, print out each thread ID and the corresponding thread. - :param filter: if ``True``, only print out consistent thread IDs and threads. - """ - self._construct_readings() - self._construct_threads() - - # if we are filtering or showing thread readings, show threads - if filter or show_thread_readings: - threaded = True - - if verbose: - if not threaded: - self._show_readings(sentence=sentence) - else: - self._show_threads( - filter=filter, show_thread_readings=show_thread_readings - ) - - def expand_threads(self, thread_id, threads=None): - """ - Given a thread ID, find the list of ``logic.Expression`` objects corresponding to the reading IDs in that thread. - - :param thread_id: thread ID - :type thread_id: str - :param threads: a mapping from thread IDs to lists of reading IDs - :type threads: dict - :return: A list of pairs ``(rid, reading)`` where reading is the ``logic.Expression`` associated with a reading ID - :rtype: list of tuple - """ - if threads is None: - threads = self._threads - return [ - (rid, self._readings[sid][rid]) - for rid in threads[thread_id] - for sid in rid.split("-")[:1] - ] - - ############################### - # Models and Background - ############################### - - def _check_consistency(self, threads, show=False, verbose=False): - results = [] - for tid in sorted(threads): - assumptions = [ - reading for (rid, reading) in self.expand_threads(tid, threads=threads) - ] - assumptions = list( - map( - self._reading_command.to_fol, - self._reading_command.process_thread(assumptions), - ) - ) - if assumptions: - assumptions += self._background - # if Mace4 finds a model, it always seems to find it quickly - mb = MaceCommand(None, assumptions, max_models=20) - modelfound = mb.build_model() - else: - modelfound = False - results.append((tid, modelfound)) - if show: - spacer(80) - print("Model for Discourse Thread %s" % tid) - spacer(80) - if verbose: - for a in assumptions: - print(a) - spacer(80) - if modelfound: - print(mb.model(format="cooked")) - else: - print("No model found!\n") - return results - - def models(self, thread_id=None, show=True, verbose=False): - """ - Call Mace4 to build a model for each current discourse thread. 
- - :param thread_id: thread ID - :type thread_id: str - :param show: If ``True``, display the model that has been found. - """ - self._construct_readings() - self._construct_threads() - threads = {thread_id: self._threads[thread_id]} if thread_id else self._threads - - for (tid, modelfound) in self._check_consistency( - threads, show=show, verbose=verbose - ): - idlist = [rid for rid in threads[tid]] - - if not modelfound: - print(f"Inconsistent discourse: {tid} {idlist}:") - for rid, reading in self.expand_threads(tid): - print(f" {rid}: {reading.normalize()}") - print() - else: - print(f"Consistent discourse: {tid} {idlist}:") - for rid, reading in self.expand_threads(tid): - print(f" {rid}: {reading.normalize()}") - print() - - def add_background(self, background, verbose=False): - """ - Add a list of background assumptions for reasoning about the discourse. - - When called, this method also updates the discourse model's set of readings and threads. - :param background: Formulas which contain background information - :type background: list(Expression) - """ - from nltk.sem.logic import Expression - - for (count, e) in enumerate(background): - assert isinstance(e, Expression) - if verbose: - print("Adding assumption %s to background" % count) - self._background.append(e) - - # update the state - self._construct_readings() - self._construct_threads() - - def background(self): - """ - Show the current background assumptions. - """ - for e in self._background: - print(str(e)) - - ############################### - # Misc - ############################### - - @staticmethod - def multiply(discourse, readings): - """ - Multiply every thread in ``discourse`` by every reading in ``readings``. - - Given discourse = [['A'], ['B']], readings = ['a', 'b', 'c'] , returns - [['A', 'a'], ['A', 'b'], ['A', 'c'], ['B', 'a'], ['B', 'b'], ['B', 'c']] - - :param discourse: the current list of readings - :type discourse: list of lists - :param readings: an additional list of readings - :type readings: list(Expression) - :rtype: A list of lists - """ - result = [] - for sublist in discourse: - for r in readings: - new = [] - new += sublist - new.append(r) - result.append(new) - return result - - -def load_fol(s): - """ - Temporarily duplicated from ``nltk.sem.util``. - Convert a file of first order formulas into a list of ``Expression`` objects. - - :param s: the contents of the file - :type s: str - :return: a list of parsed formulas. 
- :rtype: list(Expression) - """ - statements = [] - for linenum, line in enumerate(s.splitlines()): - line = line.strip() - if line.startswith("#") or line == "": - continue - try: - statements.append(Expression.fromstring(line)) - except Exception as e: - raise ValueError(f"Unable to parse line {linenum}: {line}") from e - return statements - - -############################### -# Demo -############################### -def discourse_demo(reading_command=None): - """ - Illustrate the various methods of ``DiscourseTester`` - """ - dt = DiscourseTester( - ["A boxer walks", "Every boxer chases a girl"], reading_command - ) - dt.models() - print() - # dt.grammar() - print() - dt.sentences() - print() - dt.readings() - print() - dt.readings(threaded=True) - print() - dt.models("d1") - dt.add_sentence("John is a boxer") - print() - dt.sentences() - print() - dt.readings(threaded=True) - print() - dt = DiscourseTester( - ["A student dances", "Every student is a person"], reading_command - ) - print() - dt.add_sentence("No person dances", consistchk=True) - print() - dt.readings() - print() - dt.retract_sentence("No person dances", verbose=True) - print() - dt.models() - print() - dt.readings("A person dances") - print() - dt.add_sentence("A person dances", informchk=True) - dt = DiscourseTester( - ["Vincent is a boxer", "Fido is a boxer", "Vincent is married", "Fido barks"], - reading_command, - ) - dt.readings(filter=True) - import nltk.data - - background_file = os.path.join("grammars", "book_grammars", "background.fol") - background = nltk.data.load(background_file) - - print() - dt.add_background(background, verbose=False) - dt.background() - print() - dt.readings(filter=True) - print() - dt.models() - - -def drt_discourse_demo(reading_command=None): - """ - Illustrate the various methods of ``DiscourseTester`` - """ - dt = DiscourseTester(["every dog chases a boy", "he runs"], reading_command) - dt.models() - print() - dt.sentences() - print() - dt.readings() - print() - dt.readings(show_thread_readings=True) - print() - dt.readings(filter=True, show_thread_readings=True) - - -def spacer(num=30): - print("-" * num) - - -def demo(): - discourse_demo() - - tagger = RegexpTagger( - [ - ("^(chases|runs)$", "VB"), - ("^(a)$", "ex_quant"), - ("^(every)$", "univ_quant"), - ("^(dog|boy)$", "NN"), - ("^(he)$", "PRP"), - ] - ) - depparser = MaltParser(tagger=tagger) - drt_discourse_demo( - DrtGlueReadingCommand(remove_duplicates=False, depparser=depparser) - ) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/inference/mace.py b/pipeline/nltk/inference/mace.py deleted file mode 100644 index ee4d9e8e38d7db34c4b58f9c37dee330d397e123..0000000000000000000000000000000000000000 --- a/pipeline/nltk/inference/mace.py +++ /dev/null @@ -1,383 +0,0 @@ -# Natural Language Toolkit: Interface to the Mace4 Model Builder -# -# Author: Dan Garrette -# Ewan Klein - -# URL: -# For license information, see LICENSE.TXT - -""" -A model builder that makes use of the external 'Mace4' package. -""" - -import os -import tempfile - -from nltk.inference.api import BaseModelBuilderCommand, ModelBuilder -from nltk.inference.prover9 import Prover9CommandParent, Prover9Parent -from nltk.sem import Expression, Valuation -from nltk.sem.logic import is_indvar - - -class MaceCommand(Prover9CommandParent, BaseModelBuilderCommand): - """ - A ``MaceCommand`` specific to the ``Mace`` model builder. It contains - a print_assumptions() method that is used to print the list - of assumptions in multiple formats. 
- """ - - _interpformat_bin = None - - def __init__(self, goal=None, assumptions=None, max_models=500, model_builder=None): - """ - :param goal: Input expression to prove - :type goal: sem.Expression - :param assumptions: Input expressions to use as assumptions in - the proof. - :type assumptions: list(sem.Expression) - :param max_models: The maximum number of models that Mace will try before - simply returning false. (Use 0 for no maximum.) - :type max_models: int - """ - if model_builder is not None: - assert isinstance(model_builder, Mace) - else: - model_builder = Mace(max_models) - - BaseModelBuilderCommand.__init__(self, model_builder, goal, assumptions) - - @property - def valuation(mbc): - return mbc.model("valuation") - - def _convert2val(self, valuation_str): - """ - Transform the output file into an NLTK-style Valuation. - - :return: A model if one is generated; None otherwise. - :rtype: sem.Valuation - """ - valuation_standard_format = self._transform_output(valuation_str, "standard") - - val = [] - for line in valuation_standard_format.splitlines(False): - l = line.strip() - - if l.startswith("interpretation"): - # find the number of entities in the model - num_entities = int(l[l.index("(") + 1 : l.index(",")].strip()) - - elif l.startswith("function") and l.find("_") == -1: - # replace the integer identifier with a corresponding alphabetic character - name = l[l.index("(") + 1 : l.index(",")].strip() - if is_indvar(name): - name = name.upper() - value = int(l[l.index("[") + 1 : l.index("]")].strip()) - val.append((name, MaceCommand._make_model_var(value))) - - elif l.startswith("relation"): - l = l[l.index("(") + 1 :] - if "(" in l: - # relation is not nullary - name = l[: l.index("(")].strip() - values = [ - int(v.strip()) - for v in l[l.index("[") + 1 : l.index("]")].split(",") - ] - val.append( - (name, MaceCommand._make_relation_set(num_entities, values)) - ) - else: - # relation is nullary - name = l[: l.index(",")].strip() - value = int(l[l.index("[") + 1 : l.index("]")].strip()) - val.append((name, value == 1)) - - return Valuation(val) - - @staticmethod - def _make_relation_set(num_entities, values): - """ - Convert a Mace4-style relation table into a dictionary. - - :param num_entities: the number of entities in the model; determines the row length in the table. - :type num_entities: int - :param values: a list of 1's and 0's that represent whether a relation holds in a Mace4 model. - :type values: list of int - """ - r = set() - for position in [pos for (pos, v) in enumerate(values) if v == 1]: - r.add( - tuple(MaceCommand._make_relation_tuple(position, values, num_entities)) - ) - return r - - @staticmethod - def _make_relation_tuple(position, values, num_entities): - if len(values) == 1: - return [] - else: - sublist_size = len(values) // num_entities - sublist_start = position // sublist_size - sublist_position = int(position % sublist_size) - - sublist = values[ - sublist_start * sublist_size : (sublist_start + 1) * sublist_size - ] - return [ - MaceCommand._make_model_var(sublist_start) - ] + MaceCommand._make_relation_tuple( - sublist_position, sublist, num_entities - ) - - @staticmethod - def _make_model_var(value): - """ - Pick an alphabetic character as identifier for an entity in the model. 
- - :param value: where to index into the list of characters - :type value: int - """ - letter = [ - "a", - "b", - "c", - "d", - "e", - "f", - "g", - "h", - "i", - "j", - "k", - "l", - "m", - "n", - "o", - "p", - "q", - "r", - "s", - "t", - "u", - "v", - "w", - "x", - "y", - "z", - ][value] - num = value // 26 - return letter + str(num) if num > 0 else letter - - def _decorate_model(self, valuation_str, format): - """ - Print out a Mace4 model using any Mace4 ``interpformat`` format. - See https://www.cs.unm.edu/~mccune/mace4/manual/ for details. - - :param valuation_str: str with the model builder's output - :param format: str indicating the format for displaying - models. Defaults to 'standard' format. - :return: str - """ - if not format: - return valuation_str - elif format == "valuation": - return self._convert2val(valuation_str) - else: - return self._transform_output(valuation_str, format) - - def _transform_output(self, valuation_str, format): - """ - Transform the output file into any Mace4 ``interpformat`` format. - - :param format: Output format for displaying models. - :type format: str - """ - if format in [ - "standard", - "standard2", - "portable", - "tabular", - "raw", - "cooked", - "xml", - "tex", - ]: - return self._call_interpformat(valuation_str, [format])[0] - else: - raise LookupError("The specified format does not exist") - - def _call_interpformat(self, input_str, args=[], verbose=False): - """ - Call the ``interpformat`` binary with the given input. - - :param input_str: A string whose contents are used as stdin. - :param args: A list of command-line arguments. - :return: A tuple (stdout, returncode) - :see: ``config_prover9`` - """ - if self._interpformat_bin is None: - self._interpformat_bin = self._modelbuilder._find_binary( - "interpformat", verbose - ) - - return self._modelbuilder._call( - input_str, self._interpformat_bin, args, verbose - ) - - -class Mace(Prover9Parent, ModelBuilder): - _mace4_bin = None - - def __init__(self, end_size=500): - self._end_size = end_size - """The maximum model size that Mace will try before - simply returning false. (Use -1 for no maximum.)""" - - def _build_model(self, goal=None, assumptions=None, verbose=False): - """ - Use Mace4 to build a first order model. - - :return: ``True`` if a model was found (i.e. Mace returns value of 0), - else ``False`` - """ - if not assumptions: - assumptions = [] - - stdout, returncode = self._call_mace4( - self.prover9_input(goal, assumptions), verbose=verbose - ) - return (returncode == 0, stdout) - - def _call_mace4(self, input_str, args=[], verbose=False): - """ - Call the ``mace4`` binary with the given input. - - :param input_str: A string whose contents are used as stdin. - :param args: A list of command-line arguments. - :return: A tuple (stdout, returncode) - :see: ``config_prover9`` - """ - if self._mace4_bin is None: - self._mace4_bin = self._find_binary("mace4", verbose) - - updated_input_str = "" - if self._end_size > 0: - updated_input_str += "assign(end_size, %d).\n\n" % self._end_size - updated_input_str += input_str - - return self._call(updated_input_str, self._mace4_bin, args, verbose) - - -def spacer(num=30): - print("-" * num) - - -def decode_result(found): - """ - Decode the result of model_found() - - :param found: The output of model_found() - :type found: bool - """ - return {True: "Countermodel found", False: "No countermodel found", None: "None"}[ - found - ] - - -def test_model_found(arguments): - """ - Try some proofs and exhibit the results. 
- """ - for (goal, assumptions) in arguments: - g = Expression.fromstring(goal) - alist = [lp.parse(a) for a in assumptions] - m = MaceCommand(g, assumptions=alist, max_models=50) - found = m.build_model() - for a in alist: - print(" %s" % a) - print(f"|- {g}: {decode_result(found)}\n") - - -def test_build_model(arguments): - """ - Try to build a ``nltk.sem.Valuation``. - """ - g = Expression.fromstring("all x.man(x)") - alist = [ - Expression.fromstring(a) - for a in [ - "man(John)", - "man(Socrates)", - "man(Bill)", - "some x.(-(x = John) & man(x) & sees(John,x))", - "some x.(-(x = Bill) & man(x))", - "all x.some y.(man(x) -> gives(Socrates,x,y))", - ] - ] - - m = MaceCommand(g, assumptions=alist) - m.build_model() - spacer() - print("Assumptions and Goal") - spacer() - for a in alist: - print(" %s" % a) - print(f"|- {g}: {decode_result(m.build_model())}\n") - spacer() - # print(m.model('standard')) - # print(m.model('cooked')) - print("Valuation") - spacer() - print(m.valuation, "\n") - - -def test_transform_output(argument_pair): - """ - Transform the model into various Mace4 ``interpformat`` formats. - """ - g = Expression.fromstring(argument_pair[0]) - alist = [lp.parse(a) for a in argument_pair[1]] - m = MaceCommand(g, assumptions=alist) - m.build_model() - for a in alist: - print(" %s" % a) - print(f"|- {g}: {m.build_model()}\n") - for format in ["standard", "portable", "xml", "cooked"]: - spacer() - print("Using '%s' format" % format) - spacer() - print(m.model(format=format)) - - -def test_make_relation_set(): - print( - MaceCommand._make_relation_set(num_entities=3, values=[1, 0, 1]) - == {("c",), ("a",)} - ) - print( - MaceCommand._make_relation_set( - num_entities=3, values=[0, 0, 0, 0, 0, 0, 1, 0, 0] - ) - == {("c", "a")} - ) - print( - MaceCommand._make_relation_set(num_entities=2, values=[0, 0, 1, 0, 0, 0, 1, 0]) - == {("a", "b", "a"), ("b", "b", "a")} - ) - - -arguments = [ - ("mortal(Socrates)", ["all x.(man(x) -> mortal(x))", "man(Socrates)"]), - ("(not mortal(Socrates))", ["all x.(man(x) -> mortal(x))", "man(Socrates)"]), -] - - -def demo(): - test_model_found(arguments) - test_build_model(arguments) - test_transform_output(arguments[1]) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/inference/nonmonotonic.py b/pipeline/nltk/inference/nonmonotonic.py deleted file mode 100644 index 2f7075ed11e7833201ad98c6fc80406d1ef646db..0000000000000000000000000000000000000000 --- a/pipeline/nltk/inference/nonmonotonic.py +++ /dev/null @@ -1,561 +0,0 @@ -# Natural Language Toolkit: Nonmonotonic Reasoning -# -# Author: Daniel H. Garrette -# -# Copyright (C) 2001-2023 NLTK Project -# URL: -# For license information, see LICENSE.TXT - -""" -A module to perform nonmonotonic reasoning. The ideas and demonstrations in -this module are based on "Logical Foundations of Artificial Intelligence" by -Michael R. Genesereth and Nils J. Nilsson. 
-""" - -from collections import defaultdict -from functools import reduce - -from nltk.inference.api import Prover, ProverCommandDecorator -from nltk.inference.prover9 import Prover9, Prover9Command -from nltk.sem.logic import ( - AbstractVariableExpression, - AllExpression, - AndExpression, - ApplicationExpression, - BooleanExpression, - EqualityExpression, - ExistsExpression, - Expression, - ImpExpression, - NegatedExpression, - Variable, - VariableExpression, - operator, - unique_variable, -) - - -class ProverParseError(Exception): - pass - - -def get_domain(goal, assumptions): - if goal is None: - all_expressions = assumptions - else: - all_expressions = assumptions + [-goal] - return reduce(operator.or_, (a.constants() for a in all_expressions), set()) - - -class ClosedDomainProver(ProverCommandDecorator): - """ - This is a prover decorator that adds domain closure assumptions before - proving. - """ - - def assumptions(self): - assumptions = [a for a in self._command.assumptions()] - goal = self._command.goal() - domain = get_domain(goal, assumptions) - return [self.replace_quants(ex, domain) for ex in assumptions] - - def goal(self): - goal = self._command.goal() - domain = get_domain(goal, self._command.assumptions()) - return self.replace_quants(goal, domain) - - def replace_quants(self, ex, domain): - """ - Apply the closed domain assumption to the expression - - - Domain = union([e.free()|e.constants() for e in all_expressions]) - - translate "exists x.P" to "(z=d1 | z=d2 | ... ) & P.replace(x,z)" OR - "P.replace(x, d1) | P.replace(x, d2) | ..." - - translate "all x.P" to "P.replace(x, d1) & P.replace(x, d2) & ..." - - :param ex: ``Expression`` - :param domain: set of {Variable}s - :return: ``Expression`` - """ - if isinstance(ex, AllExpression): - conjuncts = [ - ex.term.replace(ex.variable, VariableExpression(d)) for d in domain - ] - conjuncts = [self.replace_quants(c, domain) for c in conjuncts] - return reduce(lambda x, y: x & y, conjuncts) - elif isinstance(ex, BooleanExpression): - return ex.__class__( - self.replace_quants(ex.first, domain), - self.replace_quants(ex.second, domain), - ) - elif isinstance(ex, NegatedExpression): - return -self.replace_quants(ex.term, domain) - elif isinstance(ex, ExistsExpression): - disjuncts = [ - ex.term.replace(ex.variable, VariableExpression(d)) for d in domain - ] - disjuncts = [self.replace_quants(d, domain) for d in disjuncts] - return reduce(lambda x, y: x | y, disjuncts) - else: - return ex - - -class UniqueNamesProver(ProverCommandDecorator): - """ - This is a prover decorator that adds unique names assumptions before - proving. - """ - - def assumptions(self): - """ - - Domain = union([e.free()|e.constants() for e in all_expressions]) - - if "d1 = d2" cannot be proven from the premises, then add "d1 != d2" - """ - assumptions = self._command.assumptions() - - domain = list(get_domain(self._command.goal(), assumptions)) - - # build a dictionary of obvious equalities - eq_sets = SetHolder() - for a in assumptions: - if isinstance(a, EqualityExpression): - av = a.first.variable - bv = a.second.variable - # put 'a' and 'b' in the same set - eq_sets[av].add(bv) - - new_assumptions = [] - for i, a in enumerate(domain): - for b in domain[i + 1 :]: - # if a and b are not already in the same equality set - if b not in eq_sets[a]: - newEqEx = EqualityExpression( - VariableExpression(a), VariableExpression(b) - ) - if Prover9().prove(newEqEx, assumptions): - # we can prove that the names are the same entity. 
- # remember that they are equal so we don't re-check. - eq_sets[a].add(b) - else: - # we can't prove it, so assume unique names - new_assumptions.append(-newEqEx) - - return assumptions + new_assumptions - - -class SetHolder(list): - """ - A list of sets of Variables. - """ - - def __getitem__(self, item): - """ - :param item: ``Variable`` - :return: the set containing 'item' - """ - assert isinstance(item, Variable) - for s in self: - if item in s: - return s - # item is not found in any existing set. so create a new set - new = {item} - self.append(new) - return new - - -class ClosedWorldProver(ProverCommandDecorator): - """ - This is a prover decorator that completes predicates before proving. - - If the assumptions contain "P(A)", then "all x.(P(x) -> (x=A))" is the completion of "P". - If the assumptions contain "all x.(ostrich(x) -> bird(x))", then "all x.(bird(x) -> ostrich(x))" is the completion of "bird". - If the assumptions don't contain anything that are "P", then "all x.-P(x)" is the completion of "P". - - walk(Socrates) - Socrates != Bill - + all x.(walk(x) -> (x=Socrates)) - ---------------- - -walk(Bill) - - see(Socrates, John) - see(John, Mary) - Socrates != John - John != Mary - + all x.all y.(see(x,y) -> ((x=Socrates & y=John) | (x=John & y=Mary))) - ---------------- - -see(Socrates, Mary) - - all x.(ostrich(x) -> bird(x)) - bird(Tweety) - -ostrich(Sam) - Sam != Tweety - + all x.(bird(x) -> (ostrich(x) | x=Tweety)) - + all x.-ostrich(x) - ------------------- - -bird(Sam) - """ - - def assumptions(self): - assumptions = self._command.assumptions() - - predicates = self._make_predicate_dict(assumptions) - - new_assumptions = [] - for p in predicates: - predHolder = predicates[p] - new_sig = self._make_unique_signature(predHolder) - new_sig_exs = [VariableExpression(v) for v in new_sig] - - disjuncts = [] - - # Turn the signatures into disjuncts - for sig in predHolder.signatures: - equality_exs = [] - for v1, v2 in zip(new_sig_exs, sig): - equality_exs.append(EqualityExpression(v1, v2)) - disjuncts.append(reduce(lambda x, y: x & y, equality_exs)) - - # Turn the properties into disjuncts - for prop in predHolder.properties: - # replace variables from the signature with new sig variables - bindings = {} - for v1, v2 in zip(new_sig_exs, prop[0]): - bindings[v2] = v1 - disjuncts.append(prop[1].substitute_bindings(bindings)) - - # make the assumption - if disjuncts: - # disjuncts exist, so make an implication - antecedent = self._make_antecedent(p, new_sig) - consequent = reduce(lambda x, y: x | y, disjuncts) - accum = ImpExpression(antecedent, consequent) - else: - # nothing has property 'p' - accum = NegatedExpression(self._make_antecedent(p, new_sig)) - - # quantify the implication - for new_sig_var in new_sig[::-1]: - accum = AllExpression(new_sig_var, accum) - new_assumptions.append(accum) - - return assumptions + new_assumptions - - def _make_unique_signature(self, predHolder): - """ - This method figures out how many arguments the predicate takes and - returns a tuple containing that number of unique variables. - """ - return tuple(unique_variable() for i in range(predHolder.signature_len)) - - def _make_antecedent(self, predicate, signature): - """ - Return an application expression with 'predicate' as the predicate - and 'signature' as the list of arguments. 
- """ - antecedent = predicate - for v in signature: - antecedent = antecedent(VariableExpression(v)) - return antecedent - - def _make_predicate_dict(self, assumptions): - """ - Create a dictionary of predicates from the assumptions. - - :param assumptions: a list of ``Expression``s - :return: dict mapping ``AbstractVariableExpression`` to ``PredHolder`` - """ - predicates = defaultdict(PredHolder) - for a in assumptions: - self._map_predicates(a, predicates) - return predicates - - def _map_predicates(self, expression, predDict): - if isinstance(expression, ApplicationExpression): - func, args = expression.uncurry() - if isinstance(func, AbstractVariableExpression): - predDict[func].append_sig(tuple(args)) - elif isinstance(expression, AndExpression): - self._map_predicates(expression.first, predDict) - self._map_predicates(expression.second, predDict) - elif isinstance(expression, AllExpression): - # collect all the universally quantified variables - sig = [expression.variable] - term = expression.term - while isinstance(term, AllExpression): - sig.append(term.variable) - term = term.term - if isinstance(term, ImpExpression): - if isinstance(term.first, ApplicationExpression) and isinstance( - term.second, ApplicationExpression - ): - func1, args1 = term.first.uncurry() - func2, args2 = term.second.uncurry() - if ( - isinstance(func1, AbstractVariableExpression) - and isinstance(func2, AbstractVariableExpression) - and sig == [v.variable for v in args1] - and sig == [v.variable for v in args2] - ): - predDict[func2].append_prop((tuple(sig), term.first)) - predDict[func1].validate_sig_len(sig) - - -class PredHolder: - """ - This class will be used by a dictionary that will store information - about predicates to be used by the ``ClosedWorldProver``. - - The 'signatures' property is a list of tuples defining signatures for - which the predicate is true. For instance, 'see(john, mary)' would be - result in the signature '(john,mary)' for 'see'. - - The second element of the pair is a list of pairs such that the first - element of the pair is a tuple of variables and the second element is an - expression of those variables that makes the predicate true. For instance, - 'all x.all y.(see(x,y) -> know(x,y))' would result in "((x,y),('see(x,y)'))" - for 'know'. 
- """ - - def __init__(self): - self.signatures = [] - self.properties = [] - self.signature_len = None - - def append_sig(self, new_sig): - self.validate_sig_len(new_sig) - self.signatures.append(new_sig) - - def append_prop(self, new_prop): - self.validate_sig_len(new_prop[0]) - self.properties.append(new_prop) - - def validate_sig_len(self, new_sig): - if self.signature_len is None: - self.signature_len = len(new_sig) - elif self.signature_len != len(new_sig): - raise Exception("Signature lengths do not match") - - def __str__(self): - return f"({self.signatures},{self.properties},{self.signature_len})" - - def __repr__(self): - return "%s" % self - - -def closed_domain_demo(): - lexpr = Expression.fromstring - - p1 = lexpr(r"exists x.walk(x)") - p2 = lexpr(r"man(Socrates)") - c = lexpr(r"walk(Socrates)") - prover = Prover9Command(c, [p1, p2]) - print(prover.prove()) - cdp = ClosedDomainProver(prover) - print("assumptions:") - for a in cdp.assumptions(): - print(" ", a) - print("goal:", cdp.goal()) - print(cdp.prove()) - - p1 = lexpr(r"exists x.walk(x)") - p2 = lexpr(r"man(Socrates)") - p3 = lexpr(r"-walk(Bill)") - c = lexpr(r"walk(Socrates)") - prover = Prover9Command(c, [p1, p2, p3]) - print(prover.prove()) - cdp = ClosedDomainProver(prover) - print("assumptions:") - for a in cdp.assumptions(): - print(" ", a) - print("goal:", cdp.goal()) - print(cdp.prove()) - - p1 = lexpr(r"exists x.walk(x)") - p2 = lexpr(r"man(Socrates)") - p3 = lexpr(r"-walk(Bill)") - c = lexpr(r"walk(Socrates)") - prover = Prover9Command(c, [p1, p2, p3]) - print(prover.prove()) - cdp = ClosedDomainProver(prover) - print("assumptions:") - for a in cdp.assumptions(): - print(" ", a) - print("goal:", cdp.goal()) - print(cdp.prove()) - - p1 = lexpr(r"walk(Socrates)") - p2 = lexpr(r"walk(Bill)") - c = lexpr(r"all x.walk(x)") - prover = Prover9Command(c, [p1, p2]) - print(prover.prove()) - cdp = ClosedDomainProver(prover) - print("assumptions:") - for a in cdp.assumptions(): - print(" ", a) - print("goal:", cdp.goal()) - print(cdp.prove()) - - p1 = lexpr(r"girl(mary)") - p2 = lexpr(r"dog(rover)") - p3 = lexpr(r"all x.(girl(x) -> -dog(x))") - p4 = lexpr(r"all x.(dog(x) -> -girl(x))") - p5 = lexpr(r"chase(mary, rover)") - c = lexpr(r"exists y.(dog(y) & all x.(girl(x) -> chase(x,y)))") - prover = Prover9Command(c, [p1, p2, p3, p4, p5]) - print(prover.prove()) - cdp = ClosedDomainProver(prover) - print("assumptions:") - for a in cdp.assumptions(): - print(" ", a) - print("goal:", cdp.goal()) - print(cdp.prove()) - - -def unique_names_demo(): - lexpr = Expression.fromstring - - p1 = lexpr(r"man(Socrates)") - p2 = lexpr(r"man(Bill)") - c = lexpr(r"exists x.exists y.(x != y)") - prover = Prover9Command(c, [p1, p2]) - print(prover.prove()) - unp = UniqueNamesProver(prover) - print("assumptions:") - for a in unp.assumptions(): - print(" ", a) - print("goal:", unp.goal()) - print(unp.prove()) - - p1 = lexpr(r"all x.(walk(x) -> (x = Socrates))") - p2 = lexpr(r"Bill = William") - p3 = lexpr(r"Bill = Billy") - c = lexpr(r"-walk(William)") - prover = Prover9Command(c, [p1, p2, p3]) - print(prover.prove()) - unp = UniqueNamesProver(prover) - print("assumptions:") - for a in unp.assumptions(): - print(" ", a) - print("goal:", unp.goal()) - print(unp.prove()) - - -def closed_world_demo(): - lexpr = Expression.fromstring - - p1 = lexpr(r"walk(Socrates)") - p2 = lexpr(r"(Socrates != Bill)") - c = lexpr(r"-walk(Bill)") - prover = Prover9Command(c, [p1, p2]) - print(prover.prove()) - cwp = ClosedWorldProver(prover) - print("assumptions:") 
- for a in cwp.assumptions(): - print(" ", a) - print("goal:", cwp.goal()) - print(cwp.prove()) - - p1 = lexpr(r"see(Socrates, John)") - p2 = lexpr(r"see(John, Mary)") - p3 = lexpr(r"(Socrates != John)") - p4 = lexpr(r"(John != Mary)") - c = lexpr(r"-see(Socrates, Mary)") - prover = Prover9Command(c, [p1, p2, p3, p4]) - print(prover.prove()) - cwp = ClosedWorldProver(prover) - print("assumptions:") - for a in cwp.assumptions(): - print(" ", a) - print("goal:", cwp.goal()) - print(cwp.prove()) - - p1 = lexpr(r"all x.(ostrich(x) -> bird(x))") - p2 = lexpr(r"bird(Tweety)") - p3 = lexpr(r"-ostrich(Sam)") - p4 = lexpr(r"Sam != Tweety") - c = lexpr(r"-bird(Sam)") - prover = Prover9Command(c, [p1, p2, p3, p4]) - print(prover.prove()) - cwp = ClosedWorldProver(prover) - print("assumptions:") - for a in cwp.assumptions(): - print(" ", a) - print("goal:", cwp.goal()) - print(cwp.prove()) - - -def combination_prover_demo(): - lexpr = Expression.fromstring - - p1 = lexpr(r"see(Socrates, John)") - p2 = lexpr(r"see(John, Mary)") - c = lexpr(r"-see(Socrates, Mary)") - prover = Prover9Command(c, [p1, p2]) - print(prover.prove()) - command = ClosedDomainProver(UniqueNamesProver(ClosedWorldProver(prover))) - for a in command.assumptions(): - print(a) - print(command.prove()) - - -def default_reasoning_demo(): - lexpr = Expression.fromstring - - premises = [] - - # define taxonomy - premises.append(lexpr(r"all x.(elephant(x) -> animal(x))")) - premises.append(lexpr(r"all x.(bird(x) -> animal(x))")) - premises.append(lexpr(r"all x.(dove(x) -> bird(x))")) - premises.append(lexpr(r"all x.(ostrich(x) -> bird(x))")) - premises.append(lexpr(r"all x.(flying_ostrich(x) -> ostrich(x))")) - - # default properties - premises.append( - lexpr(r"all x.((animal(x) & -Ab1(x)) -> -fly(x))") - ) # normal animals don't fly - premises.append( - lexpr(r"all x.((bird(x) & -Ab2(x)) -> fly(x))") - ) # normal birds fly - premises.append( - lexpr(r"all x.((ostrich(x) & -Ab3(x)) -> -fly(x))") - ) # normal ostriches don't fly - - # specify abnormal entities - premises.append(lexpr(r"all x.(bird(x) -> Ab1(x))")) # flight - premises.append(lexpr(r"all x.(ostrich(x) -> Ab2(x))")) # non-flying bird - premises.append(lexpr(r"all x.(flying_ostrich(x) -> Ab3(x))")) # flying ostrich - - # define entities - premises.append(lexpr(r"elephant(E)")) - premises.append(lexpr(r"dove(D)")) - premises.append(lexpr(r"ostrich(O)")) - - # print the assumptions - prover = Prover9Command(None, premises) - command = UniqueNamesProver(ClosedWorldProver(prover)) - for a in command.assumptions(): - print(a) - - print_proof("-fly(E)", premises) - print_proof("fly(D)", premises) - print_proof("-fly(O)", premises) - - -def print_proof(goal, premises): - lexpr = Expression.fromstring - prover = Prover9Command(lexpr(goal), premises) - command = UniqueNamesProver(ClosedWorldProver(prover)) - print(goal, prover.prove(), command.prove()) - - -def demo(): - closed_domain_demo() - unique_names_demo() - closed_world_demo() - combination_prover_demo() - default_reasoning_demo() - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/inference/prover9.py b/pipeline/nltk/inference/prover9.py deleted file mode 100644 index 73345f27473f011a7628c91834606f6e1f532044..0000000000000000000000000000000000000000 --- a/pipeline/nltk/inference/prover9.py +++ /dev/null @@ -1,508 +0,0 @@ -# Natural Language Toolkit: Interface to the Prover9 Theorem Prover -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Dan Garrette -# Ewan Klein -# -# URL: -# For license information, see 
LICENSE.TXT -""" -A theorem prover that makes use of the external 'Prover9' package. -""" - -import os -import subprocess - -import nltk -from nltk.inference.api import BaseProverCommand, Prover -from nltk.sem.logic import ( - AllExpression, - AndExpression, - EqualityExpression, - ExistsExpression, - Expression, - IffExpression, - ImpExpression, - NegatedExpression, - OrExpression, -) - -# -# Following is not yet used. Return code for 2 actually realized as 512. -# -p9_return_codes = { - 0: True, - 1: "(FATAL)", # A fatal error occurred (user's syntax error). - 2: False, # (SOS_EMPTY) Prover9 ran out of things to do - # (sos list exhausted). - 3: "(MAX_MEGS)", # The max_megs (memory limit) parameter was exceeded. - 4: "(MAX_SECONDS)", # The max_seconds parameter was exceeded. - 5: "(MAX_GIVEN)", # The max_given parameter was exceeded. - 6: "(MAX_KEPT)", # The max_kept parameter was exceeded. - 7: "(ACTION)", # A Prover9 action terminated the search. - 101: "(SIGSEGV)", # Prover9 crashed, most probably due to a bug. -} - - -class Prover9CommandParent: - """ - A common base class used by both ``Prover9Command`` and ``MaceCommand``, - which is responsible for maintaining a goal and a set of assumptions, - and generating prover9-style input files from them. - """ - - def print_assumptions(self, output_format="nltk"): - """ - Print the list of the current assumptions. - """ - if output_format.lower() == "nltk": - for a in self.assumptions(): - print(a) - elif output_format.lower() == "prover9": - for a in convert_to_prover9(self.assumptions()): - print(a) - else: - raise NameError( - "Unrecognized value for 'output_format': %s" % output_format - ) - - -class Prover9Command(Prover9CommandParent, BaseProverCommand): - """ - A ``ProverCommand`` specific to the ``Prover9`` prover. It contains - the a print_assumptions() method that is used to print the list - of assumptions in multiple formats. - """ - - def __init__(self, goal=None, assumptions=None, timeout=60, prover=None): - """ - :param goal: Input expression to prove - :type goal: sem.Expression - :param assumptions: Input expressions to use as assumptions in - the proof. - :type assumptions: list(sem.Expression) - :param timeout: number of seconds before timeout; set to 0 for - no timeout. - :type timeout: int - :param prover: a prover. If not set, one will be created. - :type prover: Prover9 - """ - if not assumptions: - assumptions = [] - - if prover is not None: - assert isinstance(prover, Prover9) - else: - prover = Prover9(timeout) - - BaseProverCommand.__init__(self, prover, goal, assumptions) - - def decorate_proof(self, proof_string, simplify=True): - """ - :see BaseProverCommand.decorate_proof() - """ - if simplify: - return self._prover._call_prooftrans(proof_string, ["striplabels"])[ - 0 - ].rstrip() - else: - return proof_string.rstrip() - - -class Prover9Parent: - """ - A common class extended by both ``Prover9`` and ``Mace ``. - It contains the functionality required to convert NLTK-style - expressions into Prover9-style expressions. 
- """ - - _binary_location = None - - def config_prover9(self, binary_location, verbose=False): - if binary_location is None: - self._binary_location = None - self._prover9_bin = None - else: - name = "prover9" - self._prover9_bin = nltk.internals.find_binary( - name, - path_to_bin=binary_location, - env_vars=["PROVER9"], - url="https://www.cs.unm.edu/~mccune/prover9/", - binary_names=[name, name + ".exe"], - verbose=verbose, - ) - self._binary_location = self._prover9_bin.rsplit(os.path.sep, 1) - - def prover9_input(self, goal, assumptions): - """ - :return: The input string that should be provided to the - prover9 binary. This string is formed based on the goal, - assumptions, and timeout value of this object. - """ - s = "" - - if assumptions: - s += "formulas(assumptions).\n" - for p9_assumption in convert_to_prover9(assumptions): - s += " %s.\n" % p9_assumption - s += "end_of_list.\n\n" - - if goal: - s += "formulas(goals).\n" - s += " %s.\n" % convert_to_prover9(goal) - s += "end_of_list.\n\n" - - return s - - def binary_locations(self): - """ - A list of directories that should be searched for the prover9 - executables. This list is used by ``config_prover9`` when searching - for the prover9 executables. - """ - return [ - "/usr/local/bin/prover9", - "/usr/local/bin/prover9/bin", - "/usr/local/bin", - "/usr/bin", - "/usr/local/prover9", - "/usr/local/share/prover9", - ] - - def _find_binary(self, name, verbose=False): - binary_locations = self.binary_locations() - if self._binary_location is not None: - binary_locations += [self._binary_location] - return nltk.internals.find_binary( - name, - searchpath=binary_locations, - env_vars=["PROVER9"], - url="https://www.cs.unm.edu/~mccune/prover9/", - binary_names=[name, name + ".exe"], - verbose=verbose, - ) - - def _call(self, input_str, binary, args=[], verbose=False): - """ - Call the binary with the given input. - - :param input_str: A string whose contents are used as stdin. - :param binary: The location of the binary to call - :param args: A list of command-line arguments. - :return: A tuple (stdout, returncode) - :see: ``config_prover9`` - """ - if verbose: - print("Calling:", binary) - print("Args:", args) - print("Input:\n", input_str, "\n") - - # Call prover9 via a subprocess - cmd = [binary] + args - try: - input_str = input_str.encode("utf8") - except AttributeError: - pass - p = subprocess.Popen( - cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, stdin=subprocess.PIPE - ) - (stdout, stderr) = p.communicate(input=input_str) - - if verbose: - print("Return code:", p.returncode) - if stdout: - print("stdout:\n", stdout, "\n") - if stderr: - print("stderr:\n", stderr, "\n") - - return (stdout.decode("utf-8"), p.returncode) - - -def convert_to_prover9(input): - """ - Convert a ``logic.Expression`` to Prover9 format. - """ - if isinstance(input, list): - result = [] - for s in input: - try: - result.append(_convert_to_prover9(s.simplify())) - except: - print("input %s cannot be converted to Prover9 input syntax" % input) - raise - return result - else: - try: - return _convert_to_prover9(input.simplify()) - except: - print("input %s cannot be converted to Prover9 input syntax" % input) - raise - - -def _convert_to_prover9(expression): - """ - Convert ``logic.Expression`` to Prover9 formatted string. 
- """ - if isinstance(expression, ExistsExpression): - return ( - "exists " - + str(expression.variable) - + " " - + _convert_to_prover9(expression.term) - ) - elif isinstance(expression, AllExpression): - return ( - "all " - + str(expression.variable) - + " " - + _convert_to_prover9(expression.term) - ) - elif isinstance(expression, NegatedExpression): - return "-(" + _convert_to_prover9(expression.term) + ")" - elif isinstance(expression, AndExpression): - return ( - "(" - + _convert_to_prover9(expression.first) - + " & " - + _convert_to_prover9(expression.second) - + ")" - ) - elif isinstance(expression, OrExpression): - return ( - "(" - + _convert_to_prover9(expression.first) - + " | " - + _convert_to_prover9(expression.second) - + ")" - ) - elif isinstance(expression, ImpExpression): - return ( - "(" - + _convert_to_prover9(expression.first) - + " -> " - + _convert_to_prover9(expression.second) - + ")" - ) - elif isinstance(expression, IffExpression): - return ( - "(" - + _convert_to_prover9(expression.first) - + " <-> " - + _convert_to_prover9(expression.second) - + ")" - ) - elif isinstance(expression, EqualityExpression): - return ( - "(" - + _convert_to_prover9(expression.first) - + " = " - + _convert_to_prover9(expression.second) - + ")" - ) - else: - return str(expression) - - -class Prover9(Prover9Parent, Prover): - _prover9_bin = None - _prooftrans_bin = None - - def __init__(self, timeout=60): - self._timeout = timeout - """The timeout value for prover9. If a proof can not be found - in this amount of time, then prover9 will return false. - (Use 0 for no timeout.)""" - - def _prove(self, goal=None, assumptions=None, verbose=False): - """ - Use Prover9 to prove a theorem. - :return: A pair whose first element is a boolean indicating if the - proof was successful (i.e. returns value of 0) and whose second element - is the output of the prover. - """ - if not assumptions: - assumptions = [] - - stdout, returncode = self._call_prover9( - self.prover9_input(goal, assumptions), verbose=verbose - ) - return (returncode == 0, stdout) - - def prover9_input(self, goal, assumptions): - """ - :see: Prover9Parent.prover9_input - """ - s = "clear(auto_denials).\n" # only one proof required - return s + Prover9Parent.prover9_input(self, goal, assumptions) - - def _call_prover9(self, input_str, args=[], verbose=False): - """ - Call the ``prover9`` binary with the given input. - - :param input_str: A string whose contents are used as stdin. - :param args: A list of command-line arguments. - :return: A tuple (stdout, returncode) - :see: ``config_prover9`` - """ - if self._prover9_bin is None: - self._prover9_bin = self._find_binary("prover9", verbose) - - updated_input_str = "" - if self._timeout > 0: - updated_input_str += "assign(max_seconds, %d).\n\n" % self._timeout - updated_input_str += input_str - - stdout, returncode = self._call( - updated_input_str, self._prover9_bin, args, verbose - ) - - if returncode not in [0, 2]: - errormsgprefix = "%%ERROR:" - if errormsgprefix in stdout: - msgstart = stdout.index(errormsgprefix) - errormsg = stdout[msgstart:].strip() - else: - errormsg = None - if returncode in [3, 4, 5, 6]: - raise Prover9LimitExceededException(returncode, errormsg) - else: - raise Prover9FatalException(returncode, errormsg) - - return stdout, returncode - - def _call_prooftrans(self, input_str, args=[], verbose=False): - """ - Call the ``prooftrans`` binary with the given input. - - :param input_str: A string whose contents are used as stdin. 
- :param args: A list of command-line arguments. - :return: A tuple (stdout, returncode) - :see: ``config_prover9`` - """ - if self._prooftrans_bin is None: - self._prooftrans_bin = self._find_binary("prooftrans", verbose) - - return self._call(input_str, self._prooftrans_bin, args, verbose) - - -class Prover9Exception(Exception): - def __init__(self, returncode, message): - msg = p9_return_codes[returncode] - if message: - msg += "\n%s" % message - Exception.__init__(self, msg) - - -class Prover9FatalException(Prover9Exception): - pass - - -class Prover9LimitExceededException(Prover9Exception): - pass - - -###################################################################### -# { Tests and Demos -###################################################################### - - -def test_config(): - - a = Expression.fromstring("(walk(j) & sing(j))") - g = Expression.fromstring("walk(j)") - p = Prover9Command(g, assumptions=[a]) - p._executable_path = None - p.prover9_search = [] - p.prove() - # config_prover9('/usr/local/bin') - print(p.prove()) - print(p.proof()) - - -def test_convert_to_prover9(expr): - """ - Test that parsing works OK. - """ - for t in expr: - e = Expression.fromstring(t) - print(convert_to_prover9(e)) - - -def test_prove(arguments): - """ - Try some proofs and exhibit the results. - """ - for (goal, assumptions) in arguments: - g = Expression.fromstring(goal) - alist = [Expression.fromstring(a) for a in assumptions] - p = Prover9Command(g, assumptions=alist).prove() - for a in alist: - print(" %s" % a) - print(f"|- {g}: {p}\n") - - -arguments = [ - ("(man(x) <-> (not (not man(x))))", []), - ("(not (man(x) & (not man(x))))", []), - ("(man(x) | (not man(x)))", []), - ("(man(x) & (not man(x)))", []), - ("(man(x) -> man(x))", []), - ("(not (man(x) & (not man(x))))", []), - ("(man(x) | (not man(x)))", []), - ("(man(x) -> man(x))", []), - ("(man(x) <-> man(x))", []), - ("(not (man(x) <-> (not man(x))))", []), - ("mortal(Socrates)", ["all x.(man(x) -> mortal(x))", "man(Socrates)"]), - ("((all x.(man(x) -> walks(x)) & man(Socrates)) -> some y.walks(y))", []), - ("(all x.man(x) -> all x.man(x))", []), - ("some x.all y.sees(x,y)", []), - ( - "some e3.(walk(e3) & subj(e3, mary))", - [ - "some e1.(see(e1) & subj(e1, john) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))" - ], - ), - ( - "some x e1.(see(e1) & subj(e1, x) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))", - [ - "some e1.(see(e1) & subj(e1, john) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))" - ], - ), -] - -expressions = [ - r"some x y.sees(x,y)", - r"some x.(man(x) & walks(x))", - r"\x.(man(x) & walks(x))", - r"\x y.sees(x,y)", - r"walks(john)", - r"\x.big(x, \y.mouse(y))", - r"(walks(x) & (runs(x) & (threes(x) & fours(x))))", - r"(walks(x) -> runs(x))", - r"some x.(PRO(x) & sees(John, x))", - r"some x.(man(x) & (not walks(x)))", - r"all x.(man(x) -> walks(x))", -] - - -def spacer(num=45): - print("-" * num) - - -def demo(): - print("Testing configuration") - spacer() - test_config() - print() - print("Testing conversion to Prover9 format") - spacer() - test_convert_to_prover9(expressions) - print() - print("Testing proofs") - spacer() - test_prove(arguments) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/inference/resolution.py b/pipeline/nltk/inference/resolution.py deleted file mode 100644 index 52428eb2c5d2bee410716b058165cbccbbb238a4..0000000000000000000000000000000000000000 --- a/pipeline/nltk/inference/resolution.py +++ /dev/null @@ -1,759 +0,0 @@ -# Natural Language Toolkit: 
First-order Resolution-based Theorem Prover -# -# Author: Dan Garrette -# -# Copyright (C) 2001-2023 NLTK Project -# URL: -# For license information, see LICENSE.TXT - -""" -Module for a resolution-based First Order theorem prover. -""" - -import operator -from collections import defaultdict -from functools import reduce - -from nltk.inference.api import BaseProverCommand, Prover -from nltk.sem import skolemize -from nltk.sem.logic import ( - AndExpression, - ApplicationExpression, - EqualityExpression, - Expression, - IndividualVariableExpression, - NegatedExpression, - OrExpression, - Variable, - VariableExpression, - is_indvar, - unique_variable, -) - - -class ProverParseError(Exception): - pass - - -class ResolutionProver(Prover): - ANSWER_KEY = "ANSWER" - _assume_false = True - - def _prove(self, goal=None, assumptions=None, verbose=False): - """ - :param goal: Input expression to prove - :type goal: sem.Expression - :param assumptions: Input expressions to use as assumptions in the proof - :type assumptions: list(sem.Expression) - """ - if not assumptions: - assumptions = [] - - result = None - try: - clauses = [] - if goal: - clauses.extend(clausify(-goal)) - for a in assumptions: - clauses.extend(clausify(a)) - result, clauses = self._attempt_proof(clauses) - if verbose: - print(ResolutionProverCommand._decorate_clauses(clauses)) - except RuntimeError as e: - if self._assume_false and str(e).startswith( - "maximum recursion depth exceeded" - ): - result = False - clauses = [] - else: - if verbose: - print(e) - else: - raise e - return (result, clauses) - - def _attempt_proof(self, clauses): - # map indices to lists of indices, to store attempted unifications - tried = defaultdict(list) - - i = 0 - while i < len(clauses): - if not clauses[i].is_tautology(): - # since we try clauses in order, we should start after the last - # index tried - if tried[i]: - j = tried[i][-1] + 1 - else: - j = i + 1 # nothing tried yet for 'i', so start with the next - - while j < len(clauses): - # don't: 1) unify a clause with itself, - # 2) use tautologies - if i != j and j and not clauses[j].is_tautology(): - tried[i].append(j) - newclauses = clauses[i].unify(clauses[j]) - if newclauses: - for newclause in newclauses: - newclause._parents = (i + 1, j + 1) - clauses.append(newclause) - if not len(newclause): # if there's an empty clause - return (True, clauses) - i = -1 # since we added a new clause, restart from the top - break - j += 1 - i += 1 - return (False, clauses) - - -class ResolutionProverCommand(BaseProverCommand): - def __init__(self, goal=None, assumptions=None, prover=None): - """ - :param goal: Input expression to prove - :type goal: sem.Expression - :param assumptions: Input expressions to use as assumptions in - the proof. - :type assumptions: list(sem.Expression) - """ - if prover is not None: - assert isinstance(prover, ResolutionProver) - else: - prover = ResolutionProver() - - BaseProverCommand.__init__(self, prover, goal, assumptions) - self._clauses = None - - def prove(self, verbose=False): - """ - Perform the actual proof. Store the result to prevent unnecessary - re-proving. 
- """ - if self._result is None: - self._result, clauses = self._prover._prove( - self.goal(), self.assumptions(), verbose - ) - self._clauses = clauses - self._proof = ResolutionProverCommand._decorate_clauses(clauses) - return self._result - - def find_answers(self, verbose=False): - self.prove(verbose) - - answers = set() - answer_ex = VariableExpression(Variable(ResolutionProver.ANSWER_KEY)) - for clause in self._clauses: - for term in clause: - if ( - isinstance(term, ApplicationExpression) - and term.function == answer_ex - and not isinstance(term.argument, IndividualVariableExpression) - ): - answers.add(term.argument) - return answers - - @staticmethod - def _decorate_clauses(clauses): - """ - Decorate the proof output. - """ - out = "" - max_clause_len = max(len(str(clause)) for clause in clauses) - max_seq_len = len(str(len(clauses))) - for i in range(len(clauses)): - parents = "A" - taut = "" - if clauses[i].is_tautology(): - taut = "Tautology" - if clauses[i]._parents: - parents = str(clauses[i]._parents) - parents = " " * (max_clause_len - len(str(clauses[i])) + 1) + parents - seq = " " * (max_seq_len - len(str(i + 1))) + str(i + 1) - out += f"[{seq}] {clauses[i]} {parents} {taut}\n" - return out - - -class Clause(list): - def __init__(self, data): - list.__init__(self, data) - self._is_tautology = None - self._parents = None - - def unify(self, other, bindings=None, used=None, skipped=None, debug=False): - """ - Attempt to unify this Clause with the other, returning a list of - resulting, unified, Clauses. - - :param other: ``Clause`` with which to unify - :param bindings: ``BindingDict`` containing bindings that should be used - during the unification - :param used: tuple of two lists of atoms. The first lists the - atoms from 'self' that were successfully unified with atoms from - 'other'. The second lists the atoms from 'other' that were successfully - unified with atoms from 'self'. - :param skipped: tuple of two ``Clause`` objects. The first is a list of all - the atoms from the 'self' Clause that have not been unified with - anything on the path. The second is same thing for the 'other' Clause. - :param debug: bool indicating whether debug statements should print - :return: list containing all the resulting ``Clause`` objects that could be - obtained by unification - """ - if bindings is None: - bindings = BindingDict() - if used is None: - used = ([], []) - if skipped is None: - skipped = ([], []) - if isinstance(debug, bool): - debug = DebugObject(debug) - - newclauses = _iterate_first( - self, other, bindings, used, skipped, _complete_unify_path, debug - ) - - # remove subsumed clauses. make a list of all indices of subsumed - # clauses, and then remove them from the list - subsumed = [] - for i, c1 in enumerate(newclauses): - if i not in subsumed: - for j, c2 in enumerate(newclauses): - if i != j and j not in subsumed and c1.subsumes(c2): - subsumed.append(j) - result = [] - for i in range(len(newclauses)): - if i not in subsumed: - result.append(newclauses[i]) - - return result - - def isSubsetOf(self, other): - """ - Return True iff every term in 'self' is a term in 'other'. - - :param other: ``Clause`` - :return: bool - """ - for a in self: - if a not in other: - return False - return True - - def subsumes(self, other): - """ - Return True iff 'self' subsumes 'other', this is, if there is a - substitution such that every term in 'self' can be unified with a term - in 'other'. 
- - :param other: ``Clause`` - :return: bool - """ - negatedother = [] - for atom in other: - if isinstance(atom, NegatedExpression): - negatedother.append(atom.term) - else: - negatedother.append(-atom) - - negatedotherClause = Clause(negatedother) - - bindings = BindingDict() - used = ([], []) - skipped = ([], []) - debug = DebugObject(False) - - return ( - len( - _iterate_first( - self, - negatedotherClause, - bindings, - used, - skipped, - _subsumes_finalize, - debug, - ) - ) - > 0 - ) - - def __getslice__(self, start, end): - return Clause(list.__getslice__(self, start, end)) - - def __sub__(self, other): - return Clause([a for a in self if a not in other]) - - def __add__(self, other): - return Clause(list.__add__(self, other)) - - def is_tautology(self): - """ - Self is a tautology if it contains ground terms P and -P. The ground - term, P, must be an exact match, ie, not using unification. - """ - if self._is_tautology is not None: - return self._is_tautology - for i, a in enumerate(self): - if not isinstance(a, EqualityExpression): - j = len(self) - 1 - while j > i: - b = self[j] - if isinstance(a, NegatedExpression): - if a.term == b: - self._is_tautology = True - return True - elif isinstance(b, NegatedExpression): - if a == b.term: - self._is_tautology = True - return True - j -= 1 - self._is_tautology = False - return False - - def free(self): - return reduce(operator.or_, ((atom.free() | atom.constants()) for atom in self)) - - def replace(self, variable, expression): - """ - Replace every instance of variable with expression across every atom - in the clause - - :param variable: ``Variable`` - :param expression: ``Expression`` - """ - return Clause([atom.replace(variable, expression) for atom in self]) - - def substitute_bindings(self, bindings): - """ - Replace every binding - - :param bindings: A list of tuples mapping Variable Expressions to the - Expressions to which they are bound. - :return: ``Clause`` - """ - return Clause([atom.substitute_bindings(bindings) for atom in self]) - - def __str__(self): - return "{" + ", ".join("%s" % item for item in self) + "}" - - def __repr__(self): - return "%s" % self - - -def _iterate_first(first, second, bindings, used, skipped, finalize_method, debug): - """ - This method facilitates movement through the terms of 'self' - """ - debug.line(f"unify({first},{second}) {bindings}") - - if not len(first) or not len(second): # if no more recursions can be performed - return finalize_method(first, second, bindings, used, skipped, debug) - else: - # explore this 'self' atom - result = _iterate_second( - first, second, bindings, used, skipped, finalize_method, debug + 1 - ) - - # skip this possible 'self' atom - newskipped = (skipped[0] + [first[0]], skipped[1]) - result += _iterate_first( - first[1:], second, bindings, used, newskipped, finalize_method, debug + 1 - ) - - try: - newbindings, newused, unused = _unify_terms( - first[0], second[0], bindings, used - ) - # Unification found, so progress with this line of unification - # put skipped and unused terms back into play for later unification. 
- newfirst = first[1:] + skipped[0] + unused[0] - newsecond = second[1:] + skipped[1] + unused[1] - result += _iterate_first( - newfirst, - newsecond, - newbindings, - newused, - ([], []), - finalize_method, - debug + 1, - ) - except BindingException: - # the atoms could not be unified, - pass - - return result - - -def _iterate_second(first, second, bindings, used, skipped, finalize_method, debug): - """ - This method facilitates movement through the terms of 'other' - """ - debug.line(f"unify({first},{second}) {bindings}") - - if not len(first) or not len(second): # if no more recursions can be performed - return finalize_method(first, second, bindings, used, skipped, debug) - else: - # skip this possible pairing and move to the next - newskipped = (skipped[0], skipped[1] + [second[0]]) - result = _iterate_second( - first, second[1:], bindings, used, newskipped, finalize_method, debug + 1 - ) - - try: - newbindings, newused, unused = _unify_terms( - first[0], second[0], bindings, used - ) - # Unification found, so progress with this line of unification - # put skipped and unused terms back into play for later unification. - newfirst = first[1:] + skipped[0] + unused[0] - newsecond = second[1:] + skipped[1] + unused[1] - result += _iterate_second( - newfirst, - newsecond, - newbindings, - newused, - ([], []), - finalize_method, - debug + 1, - ) - except BindingException: - # the atoms could not be unified, - pass - - return result - - -def _unify_terms(a, b, bindings=None, used=None): - """ - This method attempts to unify two terms. Two expressions are unifiable - if there exists a substitution function S such that S(a) == S(-b). - - :param a: ``Expression`` - :param b: ``Expression`` - :param bindings: ``BindingDict`` a starting set of bindings with which - the unification must be consistent - :return: ``BindingDict`` A dictionary of the bindings required to unify - :raise ``BindingException``: If the terms cannot be unified - """ - assert isinstance(a, Expression) - assert isinstance(b, Expression) - - if bindings is None: - bindings = BindingDict() - if used is None: - used = ([], []) - - # Use resolution - if isinstance(a, NegatedExpression) and isinstance(b, ApplicationExpression): - newbindings = most_general_unification(a.term, b, bindings) - newused = (used[0] + [a], used[1] + [b]) - unused = ([], []) - elif isinstance(a, ApplicationExpression) and isinstance(b, NegatedExpression): - newbindings = most_general_unification(a, b.term, bindings) - newused = (used[0] + [a], used[1] + [b]) - unused = ([], []) - - # Use demodulation - elif isinstance(a, EqualityExpression): - newbindings = BindingDict([(a.first.variable, a.second)]) - newused = (used[0] + [a], used[1]) - unused = ([], [b]) - elif isinstance(b, EqualityExpression): - newbindings = BindingDict([(b.first.variable, b.second)]) - newused = (used[0], used[1] + [b]) - unused = ([a], []) - - else: - raise BindingException((a, b)) - - return newbindings, newused, unused - - -def _complete_unify_path(first, second, bindings, used, skipped, debug): - if used[0] or used[1]: # if bindings were made along the path - newclause = Clause(skipped[0] + skipped[1] + first + second) - debug.line(" -> New Clause: %s" % newclause) - return [newclause.substitute_bindings(bindings)] - else: # no bindings made means no unification occurred. 
so no result - debug.line(" -> End") - return [] - - -def _subsumes_finalize(first, second, bindings, used, skipped, debug): - if not len(skipped[0]) and not len(first): - # If there are no skipped terms and no terms left in 'first', then - # all of the terms in the original 'self' were unified with terms - # in 'other'. Therefore, there exists a binding (this one) such that - # every term in self can be unified with a term in other, which - # is the definition of subsumption. - return [True] - else: - return [] - - -def clausify(expression): - """ - Skolemize, clausify, and standardize the variables apart. - """ - clause_list = [] - for clause in _clausify(skolemize(expression)): - for free in clause.free(): - if is_indvar(free.name): - newvar = VariableExpression(unique_variable()) - clause = clause.replace(free, newvar) - clause_list.append(clause) - return clause_list - - -def _clausify(expression): - """ - :param expression: a skolemized expression in CNF - """ - if isinstance(expression, AndExpression): - return _clausify(expression.first) + _clausify(expression.second) - elif isinstance(expression, OrExpression): - first = _clausify(expression.first) - second = _clausify(expression.second) - assert len(first) == 1 - assert len(second) == 1 - return [first[0] + second[0]] - elif isinstance(expression, EqualityExpression): - return [Clause([expression])] - elif isinstance(expression, ApplicationExpression): - return [Clause([expression])] - elif isinstance(expression, NegatedExpression): - if isinstance(expression.term, ApplicationExpression): - return [Clause([expression])] - elif isinstance(expression.term, EqualityExpression): - return [Clause([expression])] - raise ProverParseError() - - -class BindingDict: - def __init__(self, binding_list=None): - """ - :param binding_list: list of (``AbstractVariableExpression``, ``AtomicExpression``) to initialize the dictionary - """ - self.d = {} - - if binding_list: - for (v, b) in binding_list: - self[v] = b - - def __setitem__(self, variable, binding): - """ - A binding is consistent with the dict if its variable is not already bound, OR if its - variable is already bound to its argument. 
- - :param variable: ``Variable`` The variable to bind - :param binding: ``Expression`` The atomic to which 'variable' should be bound - :raise BindingException: If the variable cannot be bound in this dictionary - """ - assert isinstance(variable, Variable) - assert isinstance(binding, Expression) - - try: - existing = self[variable] - except KeyError: - existing = None - - if not existing or binding == existing: - self.d[variable] = binding - elif isinstance(binding, IndividualVariableExpression): - # Since variable is already bound, try to bind binding to variable - try: - existing = self[binding.variable] - except KeyError: - existing = None - - binding2 = VariableExpression(variable) - - if not existing or binding2 == existing: - self.d[binding.variable] = binding2 - else: - raise BindingException( - "Variable %s already bound to another " "value" % (variable) - ) - else: - raise BindingException( - "Variable %s already bound to another " "value" % (variable) - ) - - def __getitem__(self, variable): - """ - Return the expression to which 'variable' is bound - """ - assert isinstance(variable, Variable) - - intermediate = self.d[variable] - while intermediate: - try: - intermediate = self.d[intermediate] - except KeyError: - return intermediate - - def __contains__(self, item): - return item in self.d - - def __add__(self, other): - """ - :param other: ``BindingDict`` The dict with which to combine self - :return: ``BindingDict`` A new dict containing all the elements of both parameters - :raise BindingException: If the parameter dictionaries are not consistent with each other - """ - try: - combined = BindingDict() - for v in self.d: - combined[v] = self.d[v] - for v in other.d: - combined[v] = other.d[v] - return combined - except BindingException as e: - raise BindingException( - "Attempting to add two contradicting " - "BindingDicts: '%s' and '%s'" % (self, other) - ) from e - - def __len__(self): - return len(self.d) - - def __str__(self): - data_str = ", ".join(f"{v}: {self.d[v]}" for v in sorted(self.d.keys())) - return "{" + data_str + "}" - - def __repr__(self): - return "%s" % self - - -def most_general_unification(a, b, bindings=None): - """ - Find the most general unification of the two given expressions - - :param a: ``Expression`` - :param b: ``Expression`` - :param bindings: ``BindingDict`` a starting set of bindings with which the - unification must be consistent - :return: a list of bindings - :raise BindingException: if the Expressions cannot be unified - """ - if bindings is None: - bindings = BindingDict() - - if a == b: - return bindings - elif isinstance(a, IndividualVariableExpression): - return _mgu_var(a, b, bindings) - elif isinstance(b, IndividualVariableExpression): - return _mgu_var(b, a, bindings) - elif isinstance(a, ApplicationExpression) and isinstance(b, ApplicationExpression): - return most_general_unification( - a.function, b.function, bindings - ) + most_general_unification(a.argument, b.argument, bindings) - raise BindingException((a, b)) - - -def _mgu_var(var, expression, bindings): - if var.variable in expression.free() | expression.constants(): - raise BindingException((var, expression)) - else: - return BindingDict([(var.variable, expression)]) + bindings - - -class BindingException(Exception): - def __init__(self, arg): - if isinstance(arg, tuple): - Exception.__init__(self, "'%s' cannot be bound to '%s'" % arg) - else: - Exception.__init__(self, arg) - - -class UnificationException(Exception): - def __init__(self, a, b): - 
Exception.__init__(self, f"'{a}' cannot unify with '{b}'") - - -class DebugObject: - def __init__(self, enabled=True, indent=0): - self.enabled = enabled - self.indent = indent - - def __add__(self, i): - return DebugObject(self.enabled, self.indent + i) - - def line(self, line): - if self.enabled: - print(" " * self.indent + line) - - -def testResolutionProver(): - resolution_test(r"man(x)") - resolution_test(r"(man(x) -> man(x))") - resolution_test(r"(man(x) -> --man(x))") - resolution_test(r"-(man(x) and -man(x))") - resolution_test(r"(man(x) or -man(x))") - resolution_test(r"(man(x) -> man(x))") - resolution_test(r"-(man(x) and -man(x))") - resolution_test(r"(man(x) or -man(x))") - resolution_test(r"(man(x) -> man(x))") - resolution_test(r"(man(x) iff man(x))") - resolution_test(r"-(man(x) iff -man(x))") - resolution_test("all x.man(x)") - resolution_test("-all x.some y.F(x,y) & some x.all y.(-F(x,y))") - resolution_test("some x.all y.sees(x,y)") - - p1 = Expression.fromstring(r"all x.(man(x) -> mortal(x))") - p2 = Expression.fromstring(r"man(Socrates)") - c = Expression.fromstring(r"mortal(Socrates)") - print(f"{p1}, {p2} |- {c}: {ResolutionProver().prove(c, [p1, p2])}") - - p1 = Expression.fromstring(r"all x.(man(x) -> walks(x))") - p2 = Expression.fromstring(r"man(John)") - c = Expression.fromstring(r"some y.walks(y)") - print(f"{p1}, {p2} |- {c}: {ResolutionProver().prove(c, [p1, p2])}") - - p = Expression.fromstring(r"some e1.some e2.(believe(e1,john,e2) & walk(e2,mary))") - c = Expression.fromstring(r"some e0.walk(e0,mary)") - print(f"{p} |- {c}: {ResolutionProver().prove(c, [p])}") - - -def resolution_test(e): - f = Expression.fromstring(e) - t = ResolutionProver().prove(f) - print(f"|- {f}: {t}") - - -def test_clausify(): - lexpr = Expression.fromstring - - print(clausify(lexpr("P(x) | Q(x)"))) - print(clausify(lexpr("(P(x) & Q(x)) | R(x)"))) - print(clausify(lexpr("P(x) | (Q(x) & R(x))"))) - print(clausify(lexpr("(P(x) & Q(x)) | (R(x) & S(x))"))) - - print(clausify(lexpr("P(x) | Q(x) | R(x)"))) - print(clausify(lexpr("P(x) | (Q(x) & R(x)) | S(x)"))) - - print(clausify(lexpr("exists x.P(x) | Q(x)"))) - - print(clausify(lexpr("-(-P(x) & Q(x))"))) - print(clausify(lexpr("P(x) <-> Q(x)"))) - print(clausify(lexpr("-(P(x) <-> Q(x))"))) - print(clausify(lexpr("-(all x.P(x))"))) - print(clausify(lexpr("-(some x.P(x))"))) - - print(clausify(lexpr("some x.P(x)"))) - print(clausify(lexpr("some x.all y.P(x,y)"))) - print(clausify(lexpr("all y.some x.P(x,y)"))) - print(clausify(lexpr("all z.all y.some x.P(x,y,z)"))) - print(clausify(lexpr("all x.(all y.P(x,y) -> -all y.(Q(x,y) -> R(x,y)))"))) - - -def demo(): - test_clausify() - print() - testResolutionProver() - print() - - p = Expression.fromstring("man(x)") - print(ResolutionProverCommand(p, [p]).prove()) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/inference/tableau.py b/pipeline/nltk/inference/tableau.py deleted file mode 100644 index 620f21b465225f3d8dc91a05414bfd9bbbe3e5c2..0000000000000000000000000000000000000000 --- a/pipeline/nltk/inference/tableau.py +++ /dev/null @@ -1,712 +0,0 @@ -# Natural Language Toolkit: First-Order Tableau Theorem Prover -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Dan Garrette -# -# URL: -# For license information, see LICENSE.TXT - -""" -Module for a tableau-based First Order theorem prover. 
-""" - -from nltk.inference.api import BaseProverCommand, Prover -from nltk.internals import Counter -from nltk.sem.logic import ( - AbstractVariableExpression, - AllExpression, - AndExpression, - ApplicationExpression, - EqualityExpression, - ExistsExpression, - Expression, - FunctionVariableExpression, - IffExpression, - ImpExpression, - LambdaExpression, - NegatedExpression, - OrExpression, - Variable, - VariableExpression, - unique_variable, -) - -_counter = Counter() - - -class ProverParseError(Exception): - pass - - -class TableauProver(Prover): - _assume_false = False - - def _prove(self, goal=None, assumptions=None, verbose=False): - if not assumptions: - assumptions = [] - - result = None - try: - agenda = Agenda() - if goal: - agenda.put(-goal) - agenda.put_all(assumptions) - debugger = Debug(verbose) - result = self._attempt_proof(agenda, set(), set(), debugger) - except RuntimeError as e: - if self._assume_false and str(e).startswith( - "maximum recursion depth exceeded" - ): - result = False - else: - if verbose: - print(e) - else: - raise e - return (result, "\n".join(debugger.lines)) - - def _attempt_proof(self, agenda, accessible_vars, atoms, debug): - (current, context), category = agenda.pop_first() - - # if there's nothing left in the agenda, and we haven't closed the path - if not current: - debug.line("AGENDA EMPTY") - return False - - proof_method = { - Categories.ATOM: self._attempt_proof_atom, - Categories.PROP: self._attempt_proof_prop, - Categories.N_ATOM: self._attempt_proof_n_atom, - Categories.N_PROP: self._attempt_proof_n_prop, - Categories.APP: self._attempt_proof_app, - Categories.N_APP: self._attempt_proof_n_app, - Categories.N_EQ: self._attempt_proof_n_eq, - Categories.D_NEG: self._attempt_proof_d_neg, - Categories.N_ALL: self._attempt_proof_n_all, - Categories.N_EXISTS: self._attempt_proof_n_some, - Categories.AND: self._attempt_proof_and, - Categories.N_OR: self._attempt_proof_n_or, - Categories.N_IMP: self._attempt_proof_n_imp, - Categories.OR: self._attempt_proof_or, - Categories.IMP: self._attempt_proof_imp, - Categories.N_AND: self._attempt_proof_n_and, - Categories.IFF: self._attempt_proof_iff, - Categories.N_IFF: self._attempt_proof_n_iff, - Categories.EQ: self._attempt_proof_eq, - Categories.EXISTS: self._attempt_proof_some, - Categories.ALL: self._attempt_proof_all, - }[category] - - debug.line((current, context)) - return proof_method(current, context, agenda, accessible_vars, atoms, debug) - - def _attempt_proof_atom( - self, current, context, agenda, accessible_vars, atoms, debug - ): - # Check if the branch is closed. Return 'True' if it is - if (current, True) in atoms: - debug.line("CLOSED", 1) - return True - - if context: - if isinstance(context.term, NegatedExpression): - current = current.negate() - agenda.put(context(current).simplify()) - return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) - else: - # mark all AllExpressions as 'not exhausted' into the agenda since we are (potentially) adding new accessible vars - agenda.mark_alls_fresh() - return self._attempt_proof( - agenda, - accessible_vars | set(current.args), - atoms | {(current, False)}, - debug + 1, - ) - - def _attempt_proof_n_atom( - self, current, context, agenda, accessible_vars, atoms, debug - ): - # Check if the branch is closed. 
Return 'True' if it is - if (current.term, False) in atoms: - debug.line("CLOSED", 1) - return True - - if context: - if isinstance(context.term, NegatedExpression): - current = current.negate() - agenda.put(context(current).simplify()) - return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) - else: - # mark all AllExpressions as 'not exhausted' into the agenda since we are (potentially) adding new accessible vars - agenda.mark_alls_fresh() - return self._attempt_proof( - agenda, - accessible_vars | set(current.term.args), - atoms | {(current.term, True)}, - debug + 1, - ) - - def _attempt_proof_prop( - self, current, context, agenda, accessible_vars, atoms, debug - ): - # Check if the branch is closed. Return 'True' if it is - if (current, True) in atoms: - debug.line("CLOSED", 1) - return True - - # mark all AllExpressions as 'not exhausted' into the agenda since we are (potentially) adding new accessible vars - agenda.mark_alls_fresh() - return self._attempt_proof( - agenda, accessible_vars, atoms | {(current, False)}, debug + 1 - ) - - def _attempt_proof_n_prop( - self, current, context, agenda, accessible_vars, atoms, debug - ): - # Check if the branch is closed. Return 'True' if it is - if (current.term, False) in atoms: - debug.line("CLOSED", 1) - return True - - # mark all AllExpressions as 'not exhausted' into the agenda since we are (potentially) adding new accessible vars - agenda.mark_alls_fresh() - return self._attempt_proof( - agenda, accessible_vars, atoms | {(current.term, True)}, debug + 1 - ) - - def _attempt_proof_app( - self, current, context, agenda, accessible_vars, atoms, debug - ): - f, args = current.uncurry() - for i, arg in enumerate(args): - if not TableauProver.is_atom(arg): - ctx = f - nv = Variable("X%s" % _counter.get()) - for j, a in enumerate(args): - ctx = ctx(VariableExpression(nv)) if i == j else ctx(a) - if context: - ctx = context(ctx).simplify() - ctx = LambdaExpression(nv, ctx) - agenda.put(arg, ctx) - return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) - raise Exception("If this method is called, there must be a non-atomic argument") - - def _attempt_proof_n_app( - self, current, context, agenda, accessible_vars, atoms, debug - ): - f, args = current.term.uncurry() - for i, arg in enumerate(args): - if not TableauProver.is_atom(arg): - ctx = f - nv = Variable("X%s" % _counter.get()) - for j, a in enumerate(args): - ctx = ctx(VariableExpression(nv)) if i == j else ctx(a) - if context: - # combine new context with existing - ctx = context(ctx).simplify() - ctx = LambdaExpression(nv, -ctx) - agenda.put(-arg, ctx) - return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) - raise Exception("If this method is called, there must be a non-atomic argument") - - def _attempt_proof_n_eq( - self, current, context, agenda, accessible_vars, atoms, debug - ): - ########################################################################### - # Since 'current' is of type '~(a=b)', the path is closed if 'a' == 'b' - ########################################################################### - if current.term.first == current.term.second: - debug.line("CLOSED", 1) - return True - - agenda[Categories.N_EQ].add((current, context)) - current._exhausted = True - return self._attempt_proof( - agenda, - accessible_vars | {current.term.first, current.term.second}, - atoms, - debug + 1, - ) - - def _attempt_proof_d_neg( - self, current, context, agenda, accessible_vars, atoms, debug - ): - agenda.put(current.term.term, context) - 
return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) - - def _attempt_proof_n_all( - self, current, context, agenda, accessible_vars, atoms, debug - ): - agenda[Categories.EXISTS].add( - (ExistsExpression(current.term.variable, -current.term.term), context) - ) - return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) - - def _attempt_proof_n_some( - self, current, context, agenda, accessible_vars, atoms, debug - ): - agenda[Categories.ALL].add( - (AllExpression(current.term.variable, -current.term.term), context) - ) - return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) - - def _attempt_proof_and( - self, current, context, agenda, accessible_vars, atoms, debug - ): - agenda.put(current.first, context) - agenda.put(current.second, context) - return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) - - def _attempt_proof_n_or( - self, current, context, agenda, accessible_vars, atoms, debug - ): - agenda.put(-current.term.first, context) - agenda.put(-current.term.second, context) - return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) - - def _attempt_proof_n_imp( - self, current, context, agenda, accessible_vars, atoms, debug - ): - agenda.put(current.term.first, context) - agenda.put(-current.term.second, context) - return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) - - def _attempt_proof_or( - self, current, context, agenda, accessible_vars, atoms, debug - ): - new_agenda = agenda.clone() - agenda.put(current.first, context) - new_agenda.put(current.second, context) - return self._attempt_proof( - agenda, accessible_vars, atoms, debug + 1 - ) and self._attempt_proof(new_agenda, accessible_vars, atoms, debug + 1) - - def _attempt_proof_imp( - self, current, context, agenda, accessible_vars, atoms, debug - ): - new_agenda = agenda.clone() - agenda.put(-current.first, context) - new_agenda.put(current.second, context) - return self._attempt_proof( - agenda, accessible_vars, atoms, debug + 1 - ) and self._attempt_proof(new_agenda, accessible_vars, atoms, debug + 1) - - def _attempt_proof_n_and( - self, current, context, agenda, accessible_vars, atoms, debug - ): - new_agenda = agenda.clone() - agenda.put(-current.term.first, context) - new_agenda.put(-current.term.second, context) - return self._attempt_proof( - agenda, accessible_vars, atoms, debug + 1 - ) and self._attempt_proof(new_agenda, accessible_vars, atoms, debug + 1) - - def _attempt_proof_iff( - self, current, context, agenda, accessible_vars, atoms, debug - ): - new_agenda = agenda.clone() - agenda.put(current.first, context) - agenda.put(current.second, context) - new_agenda.put(-current.first, context) - new_agenda.put(-current.second, context) - return self._attempt_proof( - agenda, accessible_vars, atoms, debug + 1 - ) and self._attempt_proof(new_agenda, accessible_vars, atoms, debug + 1) - - def _attempt_proof_n_iff( - self, current, context, agenda, accessible_vars, atoms, debug - ): - new_agenda = agenda.clone() - agenda.put(current.term.first, context) - agenda.put(-current.term.second, context) - new_agenda.put(-current.term.first, context) - new_agenda.put(current.term.second, context) - return self._attempt_proof( - agenda, accessible_vars, atoms, debug + 1 - ) and self._attempt_proof(new_agenda, accessible_vars, atoms, debug + 1) - - def _attempt_proof_eq( - self, current, context, agenda, accessible_vars, atoms, debug - ): - ######################################################################### - # Since 'current' is 
of the form '(a = b)', replace ALL free instances - # of 'a' with 'b' - ######################################################################### - agenda.put_atoms(atoms) - agenda.replace_all(current.first, current.second) - accessible_vars.discard(current.first) - agenda.mark_neqs_fresh() - return self._attempt_proof(agenda, accessible_vars, set(), debug + 1) - - def _attempt_proof_some( - self, current, context, agenda, accessible_vars, atoms, debug - ): - new_unique_variable = VariableExpression(unique_variable()) - agenda.put(current.term.replace(current.variable, new_unique_variable), context) - agenda.mark_alls_fresh() - return self._attempt_proof( - agenda, accessible_vars | {new_unique_variable}, atoms, debug + 1 - ) - - def _attempt_proof_all( - self, current, context, agenda, accessible_vars, atoms, debug - ): - try: - current._used_vars - except AttributeError: - current._used_vars = set() - - # if there are accessible_vars on the path - if accessible_vars: - # get the set of bound variables that have not be used by this AllExpression - bv_available = accessible_vars - current._used_vars - - if bv_available: - variable_to_use = list(bv_available)[0] - debug.line("--> Using '%s'" % variable_to_use, 2) - current._used_vars |= {variable_to_use} - agenda.put( - current.term.replace(current.variable, variable_to_use), context - ) - agenda[Categories.ALL].add((current, context)) - return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) - - else: - # no more available variables to substitute - debug.line("--> Variables Exhausted", 2) - current._exhausted = True - agenda[Categories.ALL].add((current, context)) - return self._attempt_proof(agenda, accessible_vars, atoms, debug + 1) - - else: - new_unique_variable = VariableExpression(unique_variable()) - debug.line("--> Using '%s'" % new_unique_variable, 2) - current._used_vars |= {new_unique_variable} - agenda.put( - current.term.replace(current.variable, new_unique_variable), context - ) - agenda[Categories.ALL].add((current, context)) - agenda.mark_alls_fresh() - return self._attempt_proof( - agenda, accessible_vars | {new_unique_variable}, atoms, debug + 1 - ) - - @staticmethod - def is_atom(e): - if isinstance(e, NegatedExpression): - e = e.term - - if isinstance(e, ApplicationExpression): - for arg in e.args: - if not TableauProver.is_atom(arg): - return False - return True - elif isinstance(e, AbstractVariableExpression) or isinstance( - e, LambdaExpression - ): - return True - else: - return False - - -class TableauProverCommand(BaseProverCommand): - def __init__(self, goal=None, assumptions=None, prover=None): - """ - :param goal: Input expression to prove - :type goal: sem.Expression - :param assumptions: Input expressions to use as assumptions in - the proof. 
- :type assumptions: list(sem.Expression) - """ - if prover is not None: - assert isinstance(prover, TableauProver) - else: - prover = TableauProver() - - BaseProverCommand.__init__(self, prover, goal, assumptions) - - -class Agenda: - def __init__(self): - self.sets = tuple(set() for i in range(21)) - - def clone(self): - new_agenda = Agenda() - set_list = [s.copy() for s in self.sets] - - new_allExs = set() - for allEx, _ in set_list[Categories.ALL]: - new_allEx = AllExpression(allEx.variable, allEx.term) - try: - new_allEx._used_vars = {used for used in allEx._used_vars} - except AttributeError: - new_allEx._used_vars = set() - new_allExs.add((new_allEx, None)) - set_list[Categories.ALL] = new_allExs - - set_list[Categories.N_EQ] = { - (NegatedExpression(n_eq.term), ctx) - for (n_eq, ctx) in set_list[Categories.N_EQ] - } - - new_agenda.sets = tuple(set_list) - return new_agenda - - def __getitem__(self, index): - return self.sets[index] - - def put(self, expression, context=None): - if isinstance(expression, AllExpression): - ex_to_add = AllExpression(expression.variable, expression.term) - try: - ex_to_add._used_vars = {used for used in expression._used_vars} - except AttributeError: - ex_to_add._used_vars = set() - else: - ex_to_add = expression - self.sets[self._categorize_expression(ex_to_add)].add((ex_to_add, context)) - - def put_all(self, expressions): - for expression in expressions: - self.put(expression) - - def put_atoms(self, atoms): - for atom, neg in atoms: - if neg: - self[Categories.N_ATOM].add((-atom, None)) - else: - self[Categories.ATOM].add((atom, None)) - - def pop_first(self): - """Pop the first expression that appears in the agenda""" - for i, s in enumerate(self.sets): - if s: - if i in [Categories.N_EQ, Categories.ALL]: - for ex in s: - try: - if not ex[0]._exhausted: - s.remove(ex) - return (ex, i) - except AttributeError: - s.remove(ex) - return (ex, i) - else: - return (s.pop(), i) - return ((None, None), None) - - def replace_all(self, old, new): - for s in self.sets: - for ex, ctx in s: - ex.replace(old.variable, new) - if ctx is not None: - ctx.replace(old.variable, new) - - def mark_alls_fresh(self): - for u, _ in self.sets[Categories.ALL]: - u._exhausted = False - - def mark_neqs_fresh(self): - for neq, _ in self.sets[Categories.N_EQ]: - neq._exhausted = False - - def _categorize_expression(self, current): - if isinstance(current, NegatedExpression): - return self._categorize_NegatedExpression(current) - elif isinstance(current, FunctionVariableExpression): - return Categories.PROP - elif TableauProver.is_atom(current): - return Categories.ATOM - elif isinstance(current, AllExpression): - return Categories.ALL - elif isinstance(current, AndExpression): - return Categories.AND - elif isinstance(current, OrExpression): - return Categories.OR - elif isinstance(current, ImpExpression): - return Categories.IMP - elif isinstance(current, IffExpression): - return Categories.IFF - elif isinstance(current, EqualityExpression): - return Categories.EQ - elif isinstance(current, ExistsExpression): - return Categories.EXISTS - elif isinstance(current, ApplicationExpression): - return Categories.APP - else: - raise ProverParseError("cannot categorize %s" % current.__class__.__name__) - - def _categorize_NegatedExpression(self, current): - negated = current.term - - if isinstance(negated, NegatedExpression): - return Categories.D_NEG - elif isinstance(negated, FunctionVariableExpression): - return Categories.N_PROP - elif TableauProver.is_atom(negated): - return 
Categories.N_ATOM - elif isinstance(negated, AllExpression): - return Categories.N_ALL - elif isinstance(negated, AndExpression): - return Categories.N_AND - elif isinstance(negated, OrExpression): - return Categories.N_OR - elif isinstance(negated, ImpExpression): - return Categories.N_IMP - elif isinstance(negated, IffExpression): - return Categories.N_IFF - elif isinstance(negated, EqualityExpression): - return Categories.N_EQ - elif isinstance(negated, ExistsExpression): - return Categories.N_EXISTS - elif isinstance(negated, ApplicationExpression): - return Categories.N_APP - else: - raise ProverParseError("cannot categorize %s" % negated.__class__.__name__) - - -class Debug: - def __init__(self, verbose, indent=0, lines=None): - self.verbose = verbose - self.indent = indent - - if not lines: - lines = [] - self.lines = lines - - def __add__(self, increment): - return Debug(self.verbose, self.indent + 1, self.lines) - - def line(self, data, indent=0): - if isinstance(data, tuple): - ex, ctx = data - if ctx: - data = f"{ex}, {ctx}" - else: - data = "%s" % ex - - if isinstance(ex, AllExpression): - try: - used_vars = "[%s]" % ( - ",".join("%s" % ve.variable.name for ve in ex._used_vars) - ) - data += ": %s" % used_vars - except AttributeError: - data += ": []" - - newline = "{}{}".format(" " * (self.indent + indent), data) - self.lines.append(newline) - - if self.verbose: - print(newline) - - -class Categories: - ATOM = 0 - PROP = 1 - N_ATOM = 2 - N_PROP = 3 - APP = 4 - N_APP = 5 - N_EQ = 6 - D_NEG = 7 - N_ALL = 8 - N_EXISTS = 9 - AND = 10 - N_OR = 11 - N_IMP = 12 - OR = 13 - IMP = 14 - N_AND = 15 - IFF = 16 - N_IFF = 17 - EQ = 18 - EXISTS = 19 - ALL = 20 - - -def testTableauProver(): - tableau_test("P | -P") - tableau_test("P & -P") - tableau_test("Q", ["P", "(P -> Q)"]) - tableau_test("man(x)") - tableau_test("(man(x) -> man(x))") - tableau_test("(man(x) -> --man(x))") - tableau_test("-(man(x) and -man(x))") - tableau_test("(man(x) or -man(x))") - tableau_test("(man(x) -> man(x))") - tableau_test("-(man(x) and -man(x))") - tableau_test("(man(x) or -man(x))") - tableau_test("(man(x) -> man(x))") - tableau_test("(man(x) iff man(x))") - tableau_test("-(man(x) iff -man(x))") - tableau_test("all x.man(x)") - tableau_test("all x.all y.((x = y) -> (y = x))") - tableau_test("all x.all y.all z.(((x = y) & (y = z)) -> (x = z))") - # tableau_test('-all x.some y.F(x,y) & some x.all y.(-F(x,y))') - # tableau_test('some x.all y.sees(x,y)') - - p1 = "all x.(man(x) -> mortal(x))" - p2 = "man(Socrates)" - c = "mortal(Socrates)" - tableau_test(c, [p1, p2]) - - p1 = "all x.(man(x) -> walks(x))" - p2 = "man(John)" - c = "some y.walks(y)" - tableau_test(c, [p1, p2]) - - p = "((x = y) & walks(y))" - c = "walks(x)" - tableau_test(c, [p]) - - p = "((x = y) & ((y = z) & (z = w)))" - c = "(x = w)" - tableau_test(c, [p]) - - p = "some e1.some e2.(believe(e1,john,e2) & walk(e2,mary))" - c = "some e0.walk(e0,mary)" - tableau_test(c, [p]) - - c = "(exists x.exists z3.((x = Mary) & ((z3 = John) & sees(z3,x))) <-> exists x.exists z4.((x = John) & ((z4 = Mary) & sees(x,z4))))" - tableau_test(c) - - -# p = 'some e1.some e2.((believe e1 john e2) and (walk e2 mary))' -# c = 'some x.some e3.some e4.((believe e3 x e4) and (walk e4 mary))' -# tableau_test(c, [p]) - - -def testHigherOrderTableauProver(): - tableau_test("believe(j, -lie(b))", ["believe(j, -lie(b) & -cheat(b))"]) - tableau_test("believe(j, lie(b) & cheat(b))", ["believe(j, lie(b))"]) - tableau_test( - "believe(j, lie(b))", ["lie(b)"] - ) # how do we capture 
that John believes all things that are true - tableau_test( - "believe(j, know(b, cheat(b)))", - ["believe(j, know(b, lie(b)) & know(b, steals(b) & cheat(b)))"], - ) - tableau_test("P(Q(y), R(y) & R(z))", ["P(Q(x) & Q(y), R(y) & R(z))"]) - - tableau_test("believe(j, cheat(b) & lie(b))", ["believe(j, lie(b) & cheat(b))"]) - tableau_test("believe(j, -cheat(b) & -lie(b))", ["believe(j, -lie(b) & -cheat(b))"]) - - -def tableau_test(c, ps=None, verbose=False): - pc = Expression.fromstring(c) - pps = [Expression.fromstring(p) for p in ps] if ps else [] - if not ps: - ps = [] - print( - "%s |- %s: %s" - % (", ".join(ps), pc, TableauProver().prove(pc, pps, verbose=verbose)) - ) - - -def demo(): - testTableauProver() - testHigherOrderTableauProver() - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/internals.py b/pipeline/nltk/internals.py deleted file mode 100644 index b53d77da5e976c08f2cb002759e9da1044dc9bf0..0000000000000000000000000000000000000000 --- a/pipeline/nltk/internals.py +++ /dev/null @@ -1,1123 +0,0 @@ -# Natural Language Toolkit: Internal utility functions -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# Edward Loper -# Nitin Madnani -# URL: -# For license information, see LICENSE.TXT - -import fnmatch -import locale -import os -import re -import stat -import subprocess -import sys -import textwrap -import types -import warnings -from xml.etree import ElementTree - -########################################################################## -# Java Via Command-Line -########################################################################## - -_java_bin = None -_java_options = [] -# [xx] add classpath option to config_java? -def config_java(bin=None, options=None, verbose=False): - """ - Configure nltk's java interface, by letting nltk know where it can - find the Java binary, and what extra options (if any) should be - passed to Java when it is run. - - :param bin: The full path to the Java binary. If not specified, - then nltk will search the system for a Java binary; and if - one is not found, it will raise a ``LookupError`` exception. - :type bin: str - :param options: A list of options that should be passed to the - Java binary when it is called. A common value is - ``'-Xmx512m'``, which tells Java binary to increase - the maximum heap size to 512 megabytes. If no options are - specified, then do not modify the options list. - :type options: list(str) - """ - global _java_bin, _java_options - _java_bin = find_binary( - "java", - bin, - env_vars=["JAVAHOME", "JAVA_HOME"], - verbose=verbose, - binary_names=["java.exe"], - ) - - if options is not None: - if isinstance(options, str): - options = options.split() - _java_options = list(options) - - -def java(cmd, classpath=None, stdin=None, stdout=None, stderr=None, blocking=True): - """ - Execute the given java command, by opening a subprocess that calls - Java. If java has not yet been configured, it will be configured - by calling ``config_java()`` with no arguments. - - :param cmd: The java command that should be called, formatted as - a list of strings. Typically, the first string will be the name - of the java class; and the remaining strings will be arguments - for that java class. - :type cmd: list(str) - - :param classpath: A ``':'`` separated list of directories, JAR - archives, and ZIP archives to search for class files. - :type classpath: str - - :param stdin: Specify the executed program's - standard input file handles, respectively. 
Valid values are ``subprocess.PIPE``, - an existing file descriptor (a positive integer), an existing - file object, 'pipe', 'stdout', 'devnull' and None. ``subprocess.PIPE`` indicates that a - new pipe to the child should be created. With None, no - redirection will occur; the child's file handles will be - inherited from the parent. Additionally, stderr can be - ``subprocess.STDOUT``, which indicates that the stderr data - from the applications should be captured into the same file - handle as for stdout. - - :param stdout: Specify the executed program's standard output file - handle. See ``stdin`` for valid values. - - :param stderr: Specify the executed program's standard error file - handle. See ``stdin`` for valid values. - - - :param blocking: If ``false``, then return immediately after - spawning the subprocess. In this case, the return value is - the ``Popen`` object, and not a ``(stdout, stderr)`` tuple. - - :return: If ``blocking=True``, then return a tuple ``(stdout, - stderr)``, containing the stdout and stderr outputs generated - by the java command if the ``stdout`` and ``stderr`` parameters - were set to ``subprocess.PIPE``; or None otherwise. If - ``blocking=False``, then return a ``subprocess.Popen`` object. - - :raise OSError: If the java command returns a nonzero return code. - """ - - subprocess_output_dict = { - "pipe": subprocess.PIPE, - "stdout": subprocess.STDOUT, - "devnull": subprocess.DEVNULL, - } - - stdin = subprocess_output_dict.get(stdin, stdin) - stdout = subprocess_output_dict.get(stdout, stdout) - stderr = subprocess_output_dict.get(stderr, stderr) - - if isinstance(cmd, str): - raise TypeError("cmd should be a list of strings") - - # Make sure we know where a java binary is. - if _java_bin is None: - config_java() - - # Set up the classpath. - if isinstance(classpath, str): - classpaths = [classpath] - else: - classpaths = list(classpath) - classpath = os.path.pathsep.join(classpaths) - - # Construct the full command string. - cmd = list(cmd) - cmd = ["-cp", classpath] + cmd - cmd = [_java_bin] + _java_options + cmd - - # Call java via a subprocess - p = subprocess.Popen(cmd, stdin=stdin, stdout=stdout, stderr=stderr) - if not blocking: - return p - (stdout, stderr) = p.communicate() - - # Check the return code. - if p.returncode != 0: - print(_decode_stdoutdata(stderr)) - raise OSError("Java command failed : " + str(cmd)) - - return (stdout, stderr) - - -###################################################################### -# Parsing -###################################################################### - - -class ReadError(ValueError): - """ - Exception raised by read_* functions when they fail. - :param position: The index in the input string where an error occurred. - :param expected: What was expected when an error occurred. - """ - - def __init__(self, expected, position): - ValueError.__init__(self, expected, position) - self.expected = expected - self.position = position - - def __str__(self): - return f"Expected {self.expected} at {self.position}" - - -_STRING_START_RE = re.compile(r"[uU]?[rR]?(\"\"\"|\'\'\'|\"|\')") - - -def read_str(s, start_position): - """ - If a Python string literal begins at the specified position in the - given string, then return a tuple ``(val, end_position)`` - containing the value of the string literal and the position where - it ends. Otherwise, raise a ``ReadError``. - - :param s: A string that will be checked to see if within which a - Python string literal exists. 
- :type s: str - - :param start_position: The specified beginning position of the string ``s`` - to begin regex matching. - :type start_position: int - - :return: A tuple containing the matched string literal evaluated as a - string and the end position of the string literal. - :rtype: tuple(str, int) - - :raise ReadError: If the ``_STRING_START_RE`` regex doesn't return a - match in ``s`` at ``start_position``, i.e., open quote. If the - ``_STRING_END_RE`` regex doesn't return a match in ``s`` at the - end of the first match, i.e., close quote. - :raise ValueError: If an invalid string (i.e., contains an invalid - escape sequence) is passed into the ``eval``. - - :Example: - - >>> from nltk.internals import read_str - >>> read_str('"Hello", World!', 0) - ('Hello', 7) - - """ - # Read the open quote, and any modifiers. - m = _STRING_START_RE.match(s, start_position) - if not m: - raise ReadError("open quote", start_position) - quotemark = m.group(1) - - # Find the close quote. - _STRING_END_RE = re.compile(r"\\|%s" % quotemark) - position = m.end() - while True: - match = _STRING_END_RE.search(s, position) - if not match: - raise ReadError("close quote", position) - if match.group(0) == "\\": - position = match.end() + 1 - else: - break - - # Process it, using eval. Strings with invalid escape sequences - # might raise ValueError. - try: - return eval(s[start_position : match.end()]), match.end() - except ValueError as e: - raise ReadError("valid escape sequence", start_position) from e - - -_READ_INT_RE = re.compile(r"-?\d+") - - -def read_int(s, start_position): - """ - If an integer begins at the specified position in the given - string, then return a tuple ``(val, end_position)`` containing the - value of the integer and the position where it ends. Otherwise, - raise a ``ReadError``. - - :param s: A string that will be checked to see if within which a - Python integer exists. - :type s: str - - :param start_position: The specified beginning position of the string ``s`` - to begin regex matching. - :type start_position: int - - :return: A tuple containing the matched integer casted to an int, - and the end position of the int in ``s``. - :rtype: tuple(int, int) - - :raise ReadError: If the ``_READ_INT_RE`` regex doesn't return a - match in ``s`` at ``start_position``. - - :Example: - - >>> from nltk.internals import read_int - >>> read_int('42 is the answer', 0) - (42, 2) - - """ - m = _READ_INT_RE.match(s, start_position) - if not m: - raise ReadError("integer", start_position) - return int(m.group()), m.end() - - -_READ_NUMBER_VALUE = re.compile(r"-?(\d*)([.]?\d*)?") - - -def read_number(s, start_position): - """ - If an integer or float begins at the specified position in the - given string, then return a tuple ``(val, end_position)`` - containing the value of the number and the position where it ends. - Otherwise, raise a ``ReadError``. - - :param s: A string that will be checked to see if within which a - Python number exists. - :type s: str - - :param start_position: The specified beginning position of the string ``s`` - to begin regex matching. - :type start_position: int - - :return: A tuple containing the matched number casted to a ``float``, - and the end position of the number in ``s``. - :rtype: tuple(float, int) - - :raise ReadError: If the ``_READ_NUMBER_VALUE`` regex doesn't return a - match in ``s`` at ``start_position``. 
- - :Example: - - >>> from nltk.internals import read_number - >>> read_number('Pi is 3.14159', 6) - (3.14159, 13) - - """ - m = _READ_NUMBER_VALUE.match(s, start_position) - if not m or not (m.group(1) or m.group(2)): - raise ReadError("number", start_position) - if m.group(2): - return float(m.group()), m.end() - else: - return int(m.group()), m.end() - - -###################################################################### -# Check if a method has been overridden -###################################################################### - - -def overridden(method): - """ - :return: True if ``method`` overrides some method with the same - name in a base class. This is typically used when defining - abstract base classes or interfaces, to allow subclasses to define - either of two related methods: - - >>> class EaterI: - ... '''Subclass must define eat() or batch_eat().''' - ... def eat(self, food): - ... if overridden(self.batch_eat): - ... return self.batch_eat([food])[0] - ... else: - ... raise NotImplementedError() - ... def batch_eat(self, foods): - ... return [self.eat(food) for food in foods] - - :type method: instance method - """ - if isinstance(method, types.MethodType) and method.__self__.__class__ is not None: - name = method.__name__ - funcs = [ - cls.__dict__[name] - for cls in _mro(method.__self__.__class__) - if name in cls.__dict__ - ] - return len(funcs) > 1 - else: - raise TypeError("Expected an instance method.") - - -def _mro(cls): - """ - Return the method resolution order for ``cls`` -- i.e., a list - containing ``cls`` and all its base classes, in the order in which - they would be checked by ``getattr``. For new-style classes, this - is just cls.__mro__. For classic classes, this can be obtained by - a depth-first left-to-right traversal of ``__bases__``. - """ - if isinstance(cls, type): - return cls.__mro__ - else: - mro = [cls] - for base in cls.__bases__: - mro.extend(_mro(base)) - return mro - - -###################################################################### -# Deprecation decorator & base class -###################################################################### -# [xx] dedent msg first if it comes from a docstring. - - -def _add_epytext_field(obj, field, message): - """Add an epytext @field to a given object's docstring.""" - indent = "" - # If we already have a docstring, then add a blank line to separate - # it from the new field, and check its indentation. - if obj.__doc__: - obj.__doc__ = obj.__doc__.rstrip() + "\n\n" - indents = re.findall(r"(?<=\n)[ ]+(?!\s)", obj.__doc__.expandtabs()) - if indents: - indent = min(indents) - # If we don't have a docstring, add an empty one. - else: - obj.__doc__ = "" - - obj.__doc__ += textwrap.fill( - f"@{field}: {message}", - initial_indent=indent, - subsequent_indent=indent + " ", - ) - - -def deprecated(message): - """ - A decorator used to mark functions as deprecated. This will cause - a warning to be printed the when the function is used. Usage: - - >>> from nltk.internals import deprecated - >>> @deprecated('Use foo() instead') - ... def bar(x): - ... print(x/10) - - """ - - def decorator(func): - msg = f"Function {func.__name__}() has been deprecated. 
{message}" - msg = "\n" + textwrap.fill(msg, initial_indent=" ", subsequent_indent=" ") - - def newFunc(*args, **kwargs): - warnings.warn(msg, category=DeprecationWarning, stacklevel=2) - return func(*args, **kwargs) - - # Copy the old function's name, docstring, & dict - newFunc.__dict__.update(func.__dict__) - newFunc.__name__ = func.__name__ - newFunc.__doc__ = func.__doc__ - newFunc.__deprecated__ = True - # Add a @deprecated field to the docstring. - _add_epytext_field(newFunc, "deprecated", message) - return newFunc - - return decorator - - -class Deprecated: - """ - A base class used to mark deprecated classes. A typical usage is to - alert users that the name of a class has changed: - - >>> from nltk.internals import Deprecated - >>> class NewClassName: - ... pass # All logic goes here. - ... - >>> class OldClassName(Deprecated, NewClassName): - ... "Use NewClassName instead." - - The docstring of the deprecated class will be used in the - deprecation warning message. - """ - - def __new__(cls, *args, **kwargs): - # Figure out which class is the deprecated one. - dep_cls = None - for base in _mro(cls): - if Deprecated in base.__bases__: - dep_cls = base - break - assert dep_cls, "Unable to determine which base is deprecated." - - # Construct an appropriate warning. - doc = dep_cls.__doc__ or "".strip() - # If there's a @deprecated field, strip off the field marker. - doc = re.sub(r"\A\s*@deprecated:", r"", doc) - # Strip off any indentation. - doc = re.sub(r"(?m)^\s*", "", doc) - # Construct a 'name' string. - name = "Class %s" % dep_cls.__name__ - if cls != dep_cls: - name += " (base class for %s)" % cls.__name__ - # Put it all together. - msg = f"{name} has been deprecated. {doc}" - # Wrap it. - msg = "\n" + textwrap.fill(msg, initial_indent=" ", subsequent_indent=" ") - warnings.warn(msg, category=DeprecationWarning, stacklevel=2) - # Do the actual work of __new__. - return object.__new__(cls) - - -########################################################################## -# COUNTER, FOR UNIQUE NAMING -########################################################################## - - -class Counter: - """ - A counter that auto-increments each time its value is read. - """ - - def __init__(self, initial_value=0): - self._value = initial_value - - def get(self): - self._value += 1 - return self._value - - -########################################################################## -# Search for files/binaries -########################################################################## - - -def find_file_iter( - filename, - env_vars=(), - searchpath=(), - file_names=None, - url=None, - verbose=False, - finding_dir=False, -): - """ - Search for a file to be used by nltk. - - :param filename: The name or path of the file. - :param env_vars: A list of environment variable names to check. - :param file_names: A list of alternative file names to check. - :param searchpath: List of directories to search. - :param url: URL presented to user for download help. - :param verbose: Whether or not to print path when a file is found. 
- """ - file_names = [filename] + (file_names or []) - assert isinstance(filename, str) - assert not isinstance(file_names, str) - assert not isinstance(searchpath, str) - if isinstance(env_vars, str): - env_vars = env_vars.split() - yielded = False - - # File exists, no magic - for alternative in file_names: - path_to_file = os.path.join(filename, alternative) - if os.path.isfile(path_to_file): - if verbose: - print(f"[Found {filename}: {path_to_file}]") - yielded = True - yield path_to_file - # Check the bare alternatives - if os.path.isfile(alternative): - if verbose: - print(f"[Found {filename}: {alternative}]") - yielded = True - yield alternative - # Check if the alternative is inside a 'file' directory - path_to_file = os.path.join(filename, "file", alternative) - if os.path.isfile(path_to_file): - if verbose: - print(f"[Found {filename}: {path_to_file}]") - yielded = True - yield path_to_file - - # Check environment variables - for env_var in env_vars: - if env_var in os.environ: - if finding_dir: # This is to file a directory instead of file - yielded = True - yield os.environ[env_var] - - for env_dir in os.environ[env_var].split(os.pathsep): - # Check if the environment variable contains a direct path to the bin - if os.path.isfile(env_dir): - if verbose: - print(f"[Found {filename}: {env_dir}]") - yielded = True - yield env_dir - # Check if the possible bin names exist inside the environment variable directories - for alternative in file_names: - path_to_file = os.path.join(env_dir, alternative) - if os.path.isfile(path_to_file): - if verbose: - print(f"[Found {filename}: {path_to_file}]") - yielded = True - yield path_to_file - # Check if the alternative is inside a 'file' directory - # path_to_file = os.path.join(env_dir, 'file', alternative) - - # Check if the alternative is inside a 'bin' directory - path_to_file = os.path.join(env_dir, "bin", alternative) - - if os.path.isfile(path_to_file): - if verbose: - print(f"[Found {filename}: {path_to_file}]") - yielded = True - yield path_to_file - - # Check the path list. - for directory in searchpath: - for alternative in file_names: - path_to_file = os.path.join(directory, alternative) - if os.path.isfile(path_to_file): - yielded = True - yield path_to_file - - # If we're on a POSIX system, then try using the 'which' command - # to find the file. - if os.name == "posix": - for alternative in file_names: - try: - p = subprocess.Popen( - ["which", alternative], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - stdout, stderr = p.communicate() - path = _decode_stdoutdata(stdout).strip() - if path.endswith(alternative) and os.path.exists(path): - if verbose: - print(f"[Found {filename}: {path}]") - yielded = True - yield path - except (KeyboardInterrupt, SystemExit, OSError): - raise - finally: - pass - - if not yielded: - msg = ( - "NLTK was unable to find the %s file!" - "\nUse software specific " - "configuration parameters" % filename - ) - if env_vars: - msg += " or set the %s environment variable" % env_vars[0] - msg += "." 
- if searchpath: - msg += "\n\n Searched in:" - msg += "".join("\n - %s" % d for d in searchpath) - if url: - msg += f"\n\n For more information on {filename}, see:\n <{url}>" - div = "=" * 75 - raise LookupError(f"\n\n{div}\n{msg}\n{div}") - - -def find_file( - filename, env_vars=(), searchpath=(), file_names=None, url=None, verbose=False -): - return next( - find_file_iter(filename, env_vars, searchpath, file_names, url, verbose) - ) - - -def find_dir( - filename, env_vars=(), searchpath=(), file_names=None, url=None, verbose=False -): - return next( - find_file_iter( - filename, env_vars, searchpath, file_names, url, verbose, finding_dir=True - ) - ) - - -def find_binary_iter( - name, - path_to_bin=None, - env_vars=(), - searchpath=(), - binary_names=None, - url=None, - verbose=False, -): - """ - Search for a file to be used by nltk. - - :param name: The name or path of the file. - :param path_to_bin: The user-supplied binary location (deprecated) - :param env_vars: A list of environment variable names to check. - :param file_names: A list of alternative file names to check. - :param searchpath: List of directories to search. - :param url: URL presented to user for download help. - :param verbose: Whether or not to print path when a file is found. - """ - yield from find_file_iter( - path_to_bin or name, env_vars, searchpath, binary_names, url, verbose - ) - - -def find_binary( - name, - path_to_bin=None, - env_vars=(), - searchpath=(), - binary_names=None, - url=None, - verbose=False, -): - return next( - find_binary_iter( - name, path_to_bin, env_vars, searchpath, binary_names, url, verbose - ) - ) - - -def find_jar_iter( - name_pattern, - path_to_jar=None, - env_vars=(), - searchpath=(), - url=None, - verbose=False, - is_regex=False, -): - """ - Search for a jar that is used by nltk. - - :param name_pattern: The name of the jar file - :param path_to_jar: The user-supplied jar location, or None. - :param env_vars: A list of environment variable names to check - in addition to the CLASSPATH variable which is - checked by default. - :param searchpath: List of directories to search. - :param is_regex: Whether name is a regular expression. - """ - - assert isinstance(name_pattern, str) - assert not isinstance(searchpath, str) - if isinstance(env_vars, str): - env_vars = env_vars.split() - yielded = False - - # Make sure we check the CLASSPATH first - env_vars = ["CLASSPATH"] + list(env_vars) - - # If an explicit location was given, then check it, and yield it if - # it's present; otherwise, complain. 
- if path_to_jar is not None: - if os.path.isfile(path_to_jar): - yielded = True - yield path_to_jar - else: - raise LookupError( - f"Could not find {name_pattern} jar file at {path_to_jar}" - ) - - # Check environment variables - for env_var in env_vars: - if env_var in os.environ: - if env_var == "CLASSPATH": - classpath = os.environ["CLASSPATH"] - for cp in classpath.split(os.path.pathsep): - cp = os.path.expanduser(cp) - if os.path.isfile(cp): - filename = os.path.basename(cp) - if ( - is_regex - and re.match(name_pattern, filename) - or (not is_regex and filename == name_pattern) - ): - if verbose: - print(f"[Found {name_pattern}: {cp}]") - yielded = True - yield cp - # The case where user put directory containing the jar file in the classpath - if os.path.isdir(cp): - if not is_regex: - if os.path.isfile(os.path.join(cp, name_pattern)): - if verbose: - print(f"[Found {name_pattern}: {cp}]") - yielded = True - yield os.path.join(cp, name_pattern) - else: - # Look for file using regular expression - for file_name in os.listdir(cp): - if re.match(name_pattern, file_name): - if verbose: - print( - "[Found %s: %s]" - % ( - name_pattern, - os.path.join(cp, file_name), - ) - ) - yielded = True - yield os.path.join(cp, file_name) - - else: - jar_env = os.path.expanduser(os.environ[env_var]) - jar_iter = ( - ( - os.path.join(jar_env, path_to_jar) - for path_to_jar in os.listdir(jar_env) - ) - if os.path.isdir(jar_env) - else (jar_env,) - ) - for path_to_jar in jar_iter: - if os.path.isfile(path_to_jar): - filename = os.path.basename(path_to_jar) - if ( - is_regex - and re.match(name_pattern, filename) - or (not is_regex and filename == name_pattern) - ): - if verbose: - print(f"[Found {name_pattern}: {path_to_jar}]") - yielded = True - yield path_to_jar - - # Check the path list. - for directory in searchpath: - if is_regex: - for filename in os.listdir(directory): - path_to_jar = os.path.join(directory, filename) - if os.path.isfile(path_to_jar): - if re.match(name_pattern, filename): - if verbose: - print(f"[Found {filename}: {path_to_jar}]") - yielded = True - yield path_to_jar - else: - path_to_jar = os.path.join(directory, name_pattern) - if os.path.isfile(path_to_jar): - if verbose: - print(f"[Found {name_pattern}: {path_to_jar}]") - yielded = True - yield path_to_jar - - if not yielded: - # If nothing was found, raise an error - msg = "NLTK was unable to find %s!" 
% name_pattern - if env_vars: - msg += " Set the %s environment variable" % env_vars[0] - msg = textwrap.fill(msg + ".", initial_indent=" ", subsequent_indent=" ") - if searchpath: - msg += "\n\n Searched in:" - msg += "".join("\n - %s" % d for d in searchpath) - if url: - msg += "\n\n For more information, on {}, see:\n <{}>".format( - name_pattern, - url, - ) - div = "=" * 75 - raise LookupError(f"\n\n{div}\n{msg}\n{div}") - - -def find_jar( - name_pattern, - path_to_jar=None, - env_vars=(), - searchpath=(), - url=None, - verbose=False, - is_regex=False, -): - return next( - find_jar_iter( - name_pattern, path_to_jar, env_vars, searchpath, url, verbose, is_regex - ) - ) - - -def find_jars_within_path(path_to_jars): - return [ - os.path.join(root, filename) - for root, dirnames, filenames in os.walk(path_to_jars) - for filename in fnmatch.filter(filenames, "*.jar") - ] - - -def _decode_stdoutdata(stdoutdata): - """Convert data read from stdout/stderr to unicode""" - if not isinstance(stdoutdata, bytes): - return stdoutdata - - encoding = getattr(sys.__stdout__, "encoding", locale.getpreferredencoding()) - if encoding is None: - return stdoutdata.decode() - return stdoutdata.decode(encoding) - - -########################################################################## -# Import Stdlib Module -########################################################################## - - -def import_from_stdlib(module): - """ - When python is run from within the nltk/ directory tree, the - current directory is included at the beginning of the search path. - Unfortunately, that means that modules within nltk can sometimes - shadow standard library modules. As an example, the stdlib - 'inspect' module will attempt to import the stdlib 'tokenize' - module, but will instead end up importing NLTK's 'tokenize' module - instead (causing the import to fail). - """ - old_path = sys.path - sys.path = [d for d in sys.path if d not in ("", ".")] - m = __import__(module) - sys.path = old_path - return m - - -########################################################################## -# Wrapper for ElementTree Elements -########################################################################## - - -class ElementWrapper: - """ - A wrapper around ElementTree Element objects whose main purpose is - to provide nicer __repr__ and __str__ methods. In addition, any - of the wrapped Element's methods that return other Element objects - are overridden to wrap those values before returning them. - - This makes Elements more convenient to work with in - interactive sessions and doctests, at the expense of some - efficiency. - """ - - # Prevent double-wrapping: - def __new__(cls, etree): - """ - Create and return a wrapper around a given Element object. - If ``etree`` is an ``ElementWrapper``, then ``etree`` is - returned as-is. - """ - if isinstance(etree, ElementWrapper): - return etree - else: - return object.__new__(ElementWrapper) - - def __init__(self, etree): - r""" - Initialize a new Element wrapper for ``etree``. - - If ``etree`` is a string, then it will be converted to an - Element object using ``ElementTree.fromstring()`` first: - - >>> ElementWrapper("") - \n"> - - """ - if isinstance(etree, str): - etree = ElementTree.fromstring(etree) - self.__dict__["_etree"] = etree - - def unwrap(self): - """ - Return the Element object wrapped by this wrapper. 
- """ - return self._etree - - ##//////////////////////////////////////////////////////////// - # { String Representation - ##//////////////////////////////////////////////////////////// - - def __repr__(self): - s = ElementTree.tostring(self._etree, encoding="utf8").decode("utf8") - if len(s) > 60: - e = s.rfind("<") - if (len(s) - e) > 30: - e = -20 - s = f"{s[:30]}...{s[e:]}" - return "" % s - - def __str__(self): - """ - :return: the result of applying ``ElementTree.tostring()`` to - the wrapped Element object. - """ - return ( - ElementTree.tostring(self._etree, encoding="utf8").decode("utf8").rstrip() - ) - - ##//////////////////////////////////////////////////////////// - # { Element interface Delegation (pass-through) - ##//////////////////////////////////////////////////////////// - - def __getattr__(self, attrib): - return getattr(self._etree, attrib) - - def __setattr__(self, attr, value): - return setattr(self._etree, attr, value) - - def __delattr__(self, attr): - return delattr(self._etree, attr) - - def __setitem__(self, index, element): - self._etree[index] = element - - def __delitem__(self, index): - del self._etree[index] - - def __setslice__(self, start, stop, elements): - self._etree[start:stop] = elements - - def __delslice__(self, start, stop): - del self._etree[start:stop] - - def __len__(self): - return len(self._etree) - - ##//////////////////////////////////////////////////////////// - # { Element interface Delegation (wrap result) - ##//////////////////////////////////////////////////////////// - - def __getitem__(self, index): - return ElementWrapper(self._etree[index]) - - def __getslice__(self, start, stop): - return [ElementWrapper(elt) for elt in self._etree[start:stop]] - - def getchildren(self): - return [ElementWrapper(elt) for elt in self._etree] - - def getiterator(self, tag=None): - return (ElementWrapper(elt) for elt in self._etree.getiterator(tag)) - - def makeelement(self, tag, attrib): - return ElementWrapper(self._etree.makeelement(tag, attrib)) - - def find(self, path): - elt = self._etree.find(path) - if elt is None: - return elt - else: - return ElementWrapper(elt) - - def findall(self, path): - return [ElementWrapper(elt) for elt in self._etree.findall(path)] - - -###################################################################### -# Helper for Handling Slicing -###################################################################### - - -def slice_bounds(sequence, slice_obj, allow_step=False): - """ - Given a slice, return the corresponding (start, stop) bounds, - taking into account None indices and negative indices. The - following guarantees are made for the returned start and stop values: - - - 0 <= start <= len(sequence) - - 0 <= stop <= len(sequence) - - start <= stop - - :raise ValueError: If ``slice_obj.step`` is not None. - :param allow_step: If true, then the slice object may have a - non-None step. If it does, then return a tuple - (start, stop, step). - """ - start, stop = (slice_obj.start, slice_obj.stop) - - # If allow_step is true, then include the step in our return - # value tuple. - if allow_step: - step = slice_obj.step - if step is None: - step = 1 - # Use a recursive call without allow_step to find the slice - # bounds. If step is negative, then the roles of start and - # stop (in terms of default values, etc), are swapped. 
- if step < 0: - start, stop = slice_bounds(sequence, slice(stop, start)) - else: - start, stop = slice_bounds(sequence, slice(start, stop)) - return start, stop, step - - # Otherwise, make sure that no non-default step value is used. - elif slice_obj.step not in (None, 1): - raise ValueError( - "slices with steps are not supported by %s" % sequence.__class__.__name__ - ) - - # Supply default offsets. - if start is None: - start = 0 - if stop is None: - stop = len(sequence) - - # Handle negative indices. - if start < 0: - start = max(0, len(sequence) + start) - if stop < 0: - stop = max(0, len(sequence) + stop) - - # Make sure stop doesn't go past the end of the list. Note that - # we avoid calculating len(sequence) if possible, because for lazy - # sequences, calculating the length of a sequence can be expensive. - if stop > 0: - try: - sequence[stop - 1] - except IndexError: - stop = len(sequence) - - # Make sure start isn't past stop. - start = min(start, stop) - - # That's all folks! - return start, stop - - -###################################################################### -# Permission Checking -###################################################################### - - -def is_writable(path): - # Ensure that it exists. - if not os.path.exists(path): - return False - - # If we're on a posix system, check its permissions. - if hasattr(os, "getuid"): - statdata = os.stat(path) - perm = stat.S_IMODE(statdata.st_mode) - # is it world-writable? - if perm & 0o002: - return True - # do we own it? - elif statdata.st_uid == os.getuid() and (perm & 0o200): - return True - # are we in a group that can write to it? - elif (statdata.st_gid in [os.getgid()] + os.getgroups()) and (perm & 0o020): - return True - # otherwise, we can't write to it. - else: - return False - - # Otherwise, we'll assume it's writable. - # [xx] should we do other checks on other platforms? - return True - - -###################################################################### -# NLTK Error reporting -###################################################################### - - -def raise_unorderable_types(ordering, a, b): - raise TypeError( - "unorderable types: %s() %s %s()" - % (type(a).__name__, ordering, type(b).__name__) - ) diff --git a/pipeline/nltk/jsontags.py b/pipeline/nltk/jsontags.py deleted file mode 100644 index 58d73bf138d07236bc38979bd69266c5972b62b2..0000000000000000000000000000000000000000 --- a/pipeline/nltk/jsontags.py +++ /dev/null @@ -1,65 +0,0 @@ -# Natural Language Toolkit: JSON Encoder/Decoder Helpers -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Xu -# -# URL: -# For license information, see LICENSE.TXT - -""" -Register JSON tags, so the nltk data loader knows what module and class to look for. - -NLTK uses simple '!' tags to mark the types of objects, but the fully-qualified -"tag:nltk.org,2011:" prefix is also accepted in case anyone ends up -using it. -""" - -import json - -json_tags = {} - -TAG_PREFIX = "!" - - -def register_tag(cls): - """ - Decorates a class to register it's json tag. 
- """ - json_tags[TAG_PREFIX + getattr(cls, "json_tag")] = cls - return cls - - -class JSONTaggedEncoder(json.JSONEncoder): - def default(self, obj): - obj_tag = getattr(obj, "json_tag", None) - if obj_tag is None: - return super().default(obj) - obj_tag = TAG_PREFIX + obj_tag - obj = obj.encode_json_obj() - return {obj_tag: obj} - - -class JSONTaggedDecoder(json.JSONDecoder): - def decode(self, s): - return self.decode_obj(super().decode(s)) - - @classmethod - def decode_obj(cls, obj): - # Decode nested objects first. - if isinstance(obj, dict): - obj = {key: cls.decode_obj(val) for (key, val) in obj.items()} - elif isinstance(obj, list): - obj = list(cls.decode_obj(val) for val in obj) - # Check if we have a tagged object. - if not isinstance(obj, dict) or len(obj) != 1: - return obj - obj_tag = next(iter(obj.keys())) - if not obj_tag.startswith("!"): - return obj - if obj_tag not in json_tags: - raise ValueError("Unknown tag", obj_tag) - obj_cls = json_tags[obj_tag] - return obj_cls.decode_json_obj(obj[obj_tag]) - - -__all__ = ["register_tag", "json_tags", "JSONTaggedEncoder", "JSONTaggedDecoder"] diff --git a/pipeline/nltk/langnames.py b/pipeline/nltk/langnames.py deleted file mode 100644 index b7fa6b40a4b381b4b2c4f3ff42ee2450f3849465..0000000000000000000000000000000000000000 --- a/pipeline/nltk/langnames.py +++ /dev/null @@ -1,730 +0,0 @@ -# Natural Language Toolkit: Language Codes -# -# Copyright (C) 2022-2023 NLTK Project -# Author: Eric Kafe -# URL: -# For license information, see LICENSE.TXT -# -# iso639-3 language codes (C) https://iso639-3.sil.org/ - -""" -Translate between language names and language codes. - -The iso639-3 language codes were downloaded from the registration authority at -https://iso639-3.sil.org/ - -The iso639-3 codeset is evolving, so retired language codes are kept in the -"iso639retired" dictionary, which is used as fallback by the wrapper functions -"langname" and "langcode", in order to support the lookup of retired codes. - -The "langcode" function returns the current iso639-3 code if there is one, -and falls back to the retired code otherwise. 
As specified by BCP-47, -it returns the shortest (2-letter) code by default, but 3-letter codes -are also available: - - >>> import nltk.langnames as lgn - >>> lgn.langname('fri') #'fri' is a retired code - 'Western Frisian' - - The current code is different from the retired one: - >>> lgn.langcode('Western Frisian') - 'fy' - - >>> lgn.langcode('Western Frisian', typ = 3) - 'fry' - -""" - -import re -from warnings import warn - -from nltk.corpus import bcp47 - -codepattern = re.compile("[a-z][a-z][a-z]?") - - -def langname(tag, typ="full"): - """ - Convert a composite BCP-47 tag to a language name - - >>> from nltk.langnames import langname - >>> langname('ca-Latn-ES-valencia') - 'Catalan: Latin: Spain: Valencian' - - >>> langname('ca-Latn-ES-valencia', typ="short") - 'Catalan' - """ - tags = tag.split("-") - code = tags[0].lower() - if codepattern.fullmatch(code): - if code in iso639retired: # retired codes - return iso639retired[code] - elif code in iso639short: # 3-letter codes - code2 = iso639short[code] # convert to 2-letter code - warn(f"Shortening {code!r} to {code2!r}", stacklevel=2) - tag = "-".join([code2] + tags[1:]) - name = bcp47.name(tag) # parse according to BCP-47 - if typ == "full": - return name # include all subtags - elif name: - return name.split(":")[0] # only the language subtag - else: - warn(f"Could not find code in {code!r}", stacklevel=2) - - -def langcode(name, typ=2): - """ - Convert language name to iso639-3 language code. Returns the short 2-letter - code by default, if one is available, and the 3-letter code otherwise: - - >>> from nltk.langnames import langcode - >>> langcode('Modern Greek (1453-)') - 'el' - - Specify 'typ=3' to get the 3-letter code: - - >>> langcode('Modern Greek (1453-)', typ=3) - 'ell' - """ - if name in bcp47.langcode: - code = bcp47.langcode[name] - if typ == 3 and code in iso639long: - code = iso639long[code] # convert to 3-letter code - return code - elif name in iso639code_retired: - return iso639code_retired[name] - else: - warn(f"Could not find language in {name!r}", stacklevel=2) - - -# ======================================================================= -# Translate betwwen Wikidata Q-codes and BCP-47 codes or names -# ....................................................................... - - -def tag2q(tag): - """ - Convert BCP-47 tag to Wikidata Q-code - - >>> tag2q('nds-u-sd-demv') - 'Q4289225' - """ - return bcp47.wiki_q[tag] - - -def q2tag(qcode): - """ - Convert Wikidata Q-code to BCP-47 tag - - >>> q2tag('Q4289225') - 'nds-u-sd-demv' - """ - return wiki_bcp47[qcode] - - -def q2name(qcode, typ="full"): - """ - Convert Wikidata Q-code to BCP-47 (full or short) language name - - >>> q2name('Q4289225') - 'Low German: Mecklenburg-Vorpommern' - - >>> q2name('Q4289225', "short") - 'Low German' - """ - return langname(q2tag(qcode), typ) - - -def lang2q(name): - """ - Convert simple language name to Wikidata Q-code - - >>> lang2q('Low German') - 'Q25433' - """ - return tag2q(langcode(name)) - - -# ====================================================================== -# Data dictionaries -# ...................................................................... 
- - -def inverse_dict(dic): - """Return inverse mapping, but only if it is bijective""" - if len(dic.keys()) == len(set(dic.values())): - return {val: key for (key, val) in dic.items()} - else: - warn("This dictionary has no bijective inverse mapping.") - - -bcp47.load_wiki_q() # Wikidata conversion table needs to be loaded explicitly -wiki_bcp47 = inverse_dict(bcp47.wiki_q) - -iso639short = { - "aar": "aa", - "abk": "ab", - "afr": "af", - "aka": "ak", - "amh": "am", - "ara": "ar", - "arg": "an", - "asm": "as", - "ava": "av", - "ave": "ae", - "aym": "ay", - "aze": "az", - "bak": "ba", - "bam": "bm", - "bel": "be", - "ben": "bn", - "bis": "bi", - "bod": "bo", - "bos": "bs", - "bre": "br", - "bul": "bg", - "cat": "ca", - "ces": "cs", - "cha": "ch", - "che": "ce", - "chu": "cu", - "chv": "cv", - "cor": "kw", - "cos": "co", - "cre": "cr", - "cym": "cy", - "dan": "da", - "deu": "de", - "div": "dv", - "dzo": "dz", - "ell": "el", - "eng": "en", - "epo": "eo", - "est": "et", - "eus": "eu", - "ewe": "ee", - "fao": "fo", - "fas": "fa", - "fij": "fj", - "fin": "fi", - "fra": "fr", - "fry": "fy", - "ful": "ff", - "gla": "gd", - "gle": "ga", - "glg": "gl", - "glv": "gv", - "grn": "gn", - "guj": "gu", - "hat": "ht", - "hau": "ha", - "hbs": "sh", - "heb": "he", - "her": "hz", - "hin": "hi", - "hmo": "ho", - "hrv": "hr", - "hun": "hu", - "hye": "hy", - "ibo": "ig", - "ido": "io", - "iii": "ii", - "iku": "iu", - "ile": "ie", - "ina": "ia", - "ind": "id", - "ipk": "ik", - "isl": "is", - "ita": "it", - "jav": "jv", - "jpn": "ja", - "kal": "kl", - "kan": "kn", - "kas": "ks", - "kat": "ka", - "kau": "kr", - "kaz": "kk", - "khm": "km", - "kik": "ki", - "kin": "rw", - "kir": "ky", - "kom": "kv", - "kon": "kg", - "kor": "ko", - "kua": "kj", - "kur": "ku", - "lao": "lo", - "lat": "la", - "lav": "lv", - "lim": "li", - "lin": "ln", - "lit": "lt", - "ltz": "lb", - "lub": "lu", - "lug": "lg", - "mah": "mh", - "mal": "ml", - "mar": "mr", - "mkd": "mk", - "mlg": "mg", - "mlt": "mt", - "mon": "mn", - "mri": "mi", - "msa": "ms", - "mya": "my", - "nau": "na", - "nav": "nv", - "nbl": "nr", - "nde": "nd", - "ndo": "ng", - "nep": "ne", - "nld": "nl", - "nno": "nn", - "nob": "nb", - "nor": "no", - "nya": "ny", - "oci": "oc", - "oji": "oj", - "ori": "or", - "orm": "om", - "oss": "os", - "pan": "pa", - "pli": "pi", - "pol": "pl", - "por": "pt", - "pus": "ps", - "que": "qu", - "roh": "rm", - "ron": "ro", - "run": "rn", - "rus": "ru", - "sag": "sg", - "san": "sa", - "sin": "si", - "slk": "sk", - "slv": "sl", - "sme": "se", - "smo": "sm", - "sna": "sn", - "snd": "sd", - "som": "so", - "sot": "st", - "spa": "es", - "sqi": "sq", - "srd": "sc", - "srp": "sr", - "ssw": "ss", - "sun": "su", - "swa": "sw", - "swe": "sv", - "tah": "ty", - "tam": "ta", - "tat": "tt", - "tel": "te", - "tgk": "tg", - "tgl": "tl", - "tha": "th", - "tir": "ti", - "ton": "to", - "tsn": "tn", - "tso": "ts", - "tuk": "tk", - "tur": "tr", - "twi": "tw", - "uig": "ug", - "ukr": "uk", - "urd": "ur", - "uzb": "uz", - "ven": "ve", - "vie": "vi", - "vol": "vo", - "wln": "wa", - "wol": "wo", - "xho": "xh", - "yid": "yi", - "yor": "yo", - "zha": "za", - "zho": "zh", - "zul": "zu", -} - - -iso639retired = { - "fri": "Western Frisian", - "auv": "Auvergnat", - "gsc": "Gascon", - "lms": "Limousin", - "lnc": "Languedocien", - "prv": "Provençal", - "amd": "Amapá Creole", - "bgh": "Bogan", - "bnh": "Banawá", - "bvs": "Belgian Sign Language", - "ccy": "Southern Zhuang", - "cit": "Chittagonian", - "flm": "Falam Chin", - "jap": "Jaruára", - "kob": "Kohoroxitari", - "mob": 
"Moinba", - "mzf": "Aiku", - "nhj": "Tlalitzlipa Nahuatl", - "nhs": "Southeastern Puebla Nahuatl", - "occ": "Occidental", - "tmx": "Tomyang", - "tot": "Patla-Chicontla Totonac", - "xmi": "Miarrã", - "yib": "Yinglish", - "ztc": "Lachirioag Zapotec", - "atf": "Atuence", - "bqe": "Navarro-Labourdin Basque", - "bsz": "Souletin Basque", - "aex": "Amerax", - "ahe": "Ahe", - "aiz": "Aari", - "akn": "Amikoana", - "arf": "Arafundi", - "azr": "Adzera", - "bcx": "Pamona", - "bii": "Bisu", - "bke": "Bengkulu", - "blu": "Hmong Njua", - "boc": "Bakung Kenyah", - "bsd": "Sarawak Bisaya", - "bwv": "Bahau River Kenyah", - "bxt": "Buxinhua", - "byu": "Buyang", - "ccx": "Northern Zhuang", - "cru": "Carútana", - "dat": "Darang Deng", - "dyk": "Land Dayak", - "eni": "Enim", - "fiz": "Izere", - "gen": "Geman Deng", - "ggh": "Garreh-Ajuran", - "itu": "Itutang", - "kds": "Lahu Shi", - "knh": "Kayan River Kenyah", - "krg": "North Korowai", - "krq": "Krui", - "kxg": "Katingan", - "lmt": "Lematang", - "lnt": "Lintang", - "lod": "Berawan", - "mbg": "Northern Nambikuára", - "mdo": "Southwest Gbaya", - "mhv": "Arakanese", - "miv": "Mimi", - "mqd": "Madang", - "nky": "Khiamniungan Naga", - "nxj": "Nyadu", - "ogn": "Ogan", - "ork": "Orokaiva", - "paj": "Ipeka-Tapuia", - "pec": "Southern Pesisir", - "pen": "Penesak", - "plm": "Palembang", - "poj": "Lower Pokomo", - "pun": "Pubian", - "rae": "Ranau", - "rjb": "Rajbanshi", - "rws": "Rawas", - "sdd": "Semendo", - "sdi": "Sindang Kelingi", - "skl": "Selako", - "slb": "Kahumamahon Saluan", - "srj": "Serawai", - "suf": "Tarpia", - "suh": "Suba", - "suu": "Sungkai", - "szk": "Sizaki", - "tle": "Southern Marakwet", - "tnj": "Tanjong", - "ttx": "Tutong 1", - "ubm": "Upper Baram Kenyah", - "vky": "Kayu Agung", - "vmo": "Muko-Muko", - "wre": "Ware", - "xah": "Kahayan", - "xkm": "Mahakam Kenyah", - "xuf": "Kunfal", - "yio": "Dayao Yi", - "ymj": "Muji Yi", - "ypl": "Pula Yi", - "ypw": "Puwa Yi", - "ywm": "Wumeng Yi", - "yym": "Yuanjiang-Mojiang Yi", - "mly": "Malay (individual language)", - "muw": "Mundari", - "xst": "Silt'e", - "ope": "Old Persian", - "scc": "Serbian", - "scr": "Croatian", - "xsk": "Sakan", - "mol": "Moldavian", - "aay": "Aariya", - "acc": "Cubulco Achí", - "cbm": "Yepocapa Southwestern Cakchiquel", - "chs": "Chumash", - "ckc": "Northern Cakchiquel", - "ckd": "South Central Cakchiquel", - "cke": "Eastern Cakchiquel", - "ckf": "Southern Cakchiquel", - "cki": "Santa María De Jesús Cakchiquel", - "ckj": "Santo Domingo Xenacoj Cakchiquel", - "ckk": "Acatenango Southwestern Cakchiquel", - "ckw": "Western Cakchiquel", - "cnm": "Ixtatán Chuj", - "cti": "Tila Chol", - "cun": "Cunén Quiché", - "eml": "Emiliano-Romagnolo", - "eur": "Europanto", - "gmo": "Gamo-Gofa-Dawro", - "hsf": "Southeastern Huastec", - "hva": "San Luís Potosí Huastec", - "ixi": "Nebaj Ixil", - "ixj": "Chajul Ixil", - "jai": "Western Jacalteco", - "mms": "Southern Mam", - "mpf": "Tajumulco Mam", - "mtz": "Tacanec", - "mvc": "Central Mam", - "mvj": "Todos Santos Cuchumatán Mam", - "poa": "Eastern Pokomam", - "pob": "Western Pokomchí", - "pou": "Southern Pokomam", - "ppv": "Papavô", - "quj": "Joyabaj Quiché", - "qut": "West Central Quiché", - "quu": "Eastern Quiché", - "qxi": "San Andrés Quiché", - "sic": "Malinguat", - "stc": "Santa Cruz", - "tlz": "Toala'", - "tzb": "Bachajón Tzeltal", - "tzc": "Chamula Tzotzil", - "tze": "Chenalhó Tzotzil", - "tzs": "San Andrés Larrainzar Tzotzil", - "tzt": "Western Tzutujil", - "tzu": "Huixtán Tzotzil", - "tzz": "Zinacantán Tzotzil", - "vlr": "Vatrata", - "yus": "Chan 
Santa Cruz Maya", - "nfg": "Nyeng", - "nfk": "Shakara", - "agp": "Paranan", - "bhk": "Albay Bicolano", - "bkb": "Finallig", - "btb": "Beti (Cameroon)", - "cjr": "Chorotega", - "cmk": "Chimakum", - "drh": "Darkhat", - "drw": "Darwazi", - "gav": "Gabutamon", - "mof": "Mohegan-Montauk-Narragansett", - "mst": "Cataelano Mandaya", - "myt": "Sangab Mandaya", - "rmr": "Caló", - "sgl": "Sanglechi-Ishkashimi", - "sul": "Surigaonon", - "sum": "Sumo-Mayangna", - "tnf": "Tangshewi", - "wgw": "Wagawaga", - "ayx": "Ayi (China)", - "bjq": "Southern Betsimisaraka Malagasy", - "dha": "Dhanwar (India)", - "dkl": "Kolum So Dogon", - "mja": "Mahei", - "nbf": "Naxi", - "noo": "Nootka", - "tie": "Tingal", - "tkk": "Takpa", - "baz": "Tunen", - "bjd": "Bandjigali", - "ccq": "Chaungtha", - "cka": "Khumi Awa Chin", - "dap": "Nisi (India)", - "dwl": "Walo Kumbe Dogon", - "elp": "Elpaputih", - "gbc": "Garawa", - "gio": "Gelao", - "hrr": "Horuru", - "ibi": "Ibilo", - "jar": "Jarawa (Nigeria)", - "kdv": "Kado", - "kgh": "Upper Tanudan Kalinga", - "kpp": "Paku Karen", - "kzh": "Kenuzi-Dongola", - "lcq": "Luhu", - "mgx": "Omati", - "nln": "Durango Nahuatl", - "pbz": "Palu", - "pgy": "Pongyong", - "sca": "Sansu", - "tlw": "South Wemale", - "unp": "Worora", - "wiw": "Wirangu", - "ybd": "Yangbye", - "yen": "Yendang", - "yma": "Yamphe", - "daf": "Dan", - "djl": "Djiwarli", - "ggr": "Aghu Tharnggalu", - "ilw": "Talur", - "izi": "Izi-Ezaa-Ikwo-Mgbo", - "meg": "Mea", - "mld": "Malakhel", - "mnt": "Maykulan", - "mwd": "Mudbura", - "myq": "Forest Maninka", - "nbx": "Ngura", - "nlr": "Ngarla", - "pcr": "Panang", - "ppr": "Piru", - "tgg": "Tangga", - "wit": "Wintu", - "xia": "Xiandao", - "yiy": "Yir Yoront", - "yos": "Yos", - "emo": "Emok", - "ggm": "Gugu Mini", - "leg": "Lengua", - "lmm": "Lamam", - "mhh": "Maskoy Pidgin", - "puz": "Purum Naga", - "sap": "Sanapaná", - "yuu": "Yugh", - "aam": "Aramanik", - "adp": "Adap", - "aue": "ǂKxʼauǁʼein", - "bmy": "Bemba (Democratic Republic of Congo)", - "bxx": "Borna (Democratic Republic of Congo)", - "byy": "Buya", - "dzd": "Daza", - "gfx": "Mangetti Dune ǃXung", - "gti": "Gbati-ri", - "ime": "Imeraguen", - "kbf": "Kakauhua", - "koj": "Sara Dunjo", - "kwq": "Kwak", - "kxe": "Kakihum", - "lii": "Lingkhim", - "mwj": "Maligo", - "nnx": "Ngong", - "oun": "ǃOǃung", - "pmu": "Mirpur Panjabi", - "sgo": "Songa", - "thx": "The", - "tsf": "Southwestern Tamang", - "uok": "Uokha", - "xsj": "Subi", - "yds": "Yiddish Sign Language", - "ymt": "Mator-Taygi-Karagas", - "ynh": "Yangho", - "bgm": "Baga Mboteni", - "btl": "Bhatola", - "cbe": "Chipiajes", - "cbh": "Cagua", - "coy": "Coyaima", - "cqu": "Chilean Quechua", - "cum": "Cumeral", - "duj": "Dhuwal", - "ggn": "Eastern Gurung", - "ggo": "Southern Gondi", - "guv": "Gey", - "iap": "Iapama", - "ill": "Iranun", - "kgc": "Kasseng", - "kox": "Coxima", - "ktr": "Kota Marudu Tinagas", - "kvs": "Kunggara", - "kzj": "Coastal Kadazan", - "kzt": "Tambunan Dusun", - "nad": "Nijadali", - "nts": "Natagaimas", - "ome": "Omejes", - "pmc": "Palumata", - "pod": "Ponares", - "ppa": "Pao", - "pry": "Pray 3", - "rna": "Runa", - "svr": "Savara", - "tdu": "Tempasuk Dusun", - "thc": "Tai Hang Tong", - "tid": "Tidong", - "tmp": "Tai Mène", - "tne": "Tinoc Kallahan", - "toe": "Tomedes", - "xba": "Kamba (Brazil)", - "xbx": "Kabixí", - "xip": "Xipináwa", - "xkh": "Karahawyana", - "yri": "Yarí", - "jeg": "Jeng", - "kgd": "Kataang", - "krm": "Krim", - "prb": "Lua'", - "puk": "Pu Ko", - "rie": "Rien", - "rsi": "Rennellese Sign Language", - "skk": "Sok", - "snh": "Shinabo", - "lsg": 
"Lyons Sign Language", - "mwx": "Mediak", - "mwy": "Mosiro", - "ncp": "Ndaktup", - "ais": "Nataoran Amis", - "asd": "Asas", - "dit": "Dirari", - "dud": "Hun-Saare", - "lba": "Lui", - "llo": "Khlor", - "myd": "Maramba", - "myi": "Mina (India)", - "nns": "Ningye", - "aoh": "Arma", - "ayy": "Tayabas Ayta", - "bbz": "Babalia Creole Arabic", - "bpb": "Barbacoas", - "cca": "Cauca", - "cdg": "Chamari", - "dgu": "Degaru", - "drr": "Dororo", - "ekc": "Eastern Karnic", - "gli": "Guliguli", - "kjf": "Khalaj", - "kxl": "Nepali Kurux", - "kxu": "Kui (India)", - "lmz": "Lumbee", - "nxu": "Narau", - "plp": "Palpa", - "sdm": "Semandang", - "tbb": "Tapeba", - "xrq": "Karranga", - "xtz": "Tasmanian", - "zir": "Ziriya", - "thw": "Thudam", - "bic": "Bikaru", - "bij": "Vaghat-Ya-Bijim-Legeri", - "blg": "Balau", - "gji": "Geji", - "mvm": "Muya", - "ngo": "Ngoni", - "pat": "Papitalai", - "vki": "Ija-Zuba", - "wra": "Warapu", - "ajt": "Judeo-Tunisian Arabic", - "cug": "Chungmboko", - "lak": "Laka (Nigeria)", - "lno": "Lango (South Sudan)", - "pii": "Pini", - "smd": "Sama", - "snb": "Sebuyau", - "uun": "Kulon-Pazeh", - "wrd": "Warduji", - "wya": "Wyandot", -} - - -iso639long = inverse_dict(iso639short) - -iso639code_retired = inverse_dict(iso639retired) diff --git a/pipeline/nltk/lazyimport.py b/pipeline/nltk/lazyimport.py deleted file mode 100644 index ee0c8e4451fff3a19c3608d0d08e3422a77fd8f0..0000000000000000000000000000000000000000 --- a/pipeline/nltk/lazyimport.py +++ /dev/null @@ -1,142 +0,0 @@ -# This module is from mx/DateTime/LazyModule.py and is -# distributed under the terms of the eGenix.com Public License Agreement -# https://www.egenix.com/products/eGenix.com-Public-License-1.1.0.pdf - -""" Helper to enable simple lazy module import. - - 'Lazy' means the actual import is deferred until an attribute is - requested from the module's namespace. This has the advantage of - allowing all imports to be done at the top of a script (in a - prominent and visible place) without having a great impact - on startup time. - - Copyright (c) 1999-2005, Marc-Andre Lemburg; mailto:mal@lemburg.com - See the documentation for further information on copyrights, - or contact the author. All Rights Reserved. -""" - -### Constants - -_debug = 0 - -### - - -class LazyModule: - - """Lazy module class. - - Lazy modules are imported into the given namespaces whenever a - non-special attribute (there are some attributes like __doc__ - that class instances handle without calling __getattr__) is - requested. The module is then registered under the given name - in locals usually replacing the import wrapper instance. The - import itself is done using globals as global namespace. - - Example of creating a lazy load module: - - ISO = LazyModule('ISO',locals(),globals()) - - Later, requesting an attribute from ISO will load the module - automatically into the locals() namespace, overriding the - LazyModule instance: - - t = ISO.Week(1998,1,1) - - """ - - # Flag which indicates whether the LazyModule is initialized or not - __lazymodule_init = 0 - - # Name of the module to load - __lazymodule_name = "" - - # Flag which indicates whether the module was loaded or not - __lazymodule_loaded = 0 - - # Locals dictionary where to register the module - __lazymodule_locals = None - - # Globals dictionary to use for the module import - __lazymodule_globals = None - - def __init__(self, name, locals, globals=None): - - """Create a LazyModule instance wrapping module name. 
- - The module will later on be registered in locals under the - given module name. - - globals is optional and defaults to locals. - - """ - self.__lazymodule_locals = locals - if globals is None: - globals = locals - self.__lazymodule_globals = globals - mainname = globals.get("__name__", "") - if mainname: - self.__name__ = mainname + "." + name - self.__lazymodule_name = name - else: - self.__name__ = self.__lazymodule_name = name - self.__lazymodule_init = 1 - - def __lazymodule_import(self): - - """Import the module now.""" - # Load and register module - local_name = self.__lazymodule_name # e.g. "toolbox" - full_name = self.__name__ # e.g. "nltk.toolbox" - if self.__lazymodule_loaded: - return self.__lazymodule_locals[local_name] - if _debug: - print("LazyModule: Loading module %r" % full_name) - self.__lazymodule_locals[local_name] = module = __import__( - full_name, self.__lazymodule_locals, self.__lazymodule_globals, "*" - ) - - # Fill namespace with all symbols from original module to - # provide faster access. - self.__dict__.update(module.__dict__) - - # Set import flag - self.__dict__["__lazymodule_loaded"] = 1 - - if _debug: - print("LazyModule: Module %r loaded" % full_name) - return module - - def __getattr__(self, name): - - """Import the module on demand and get the attribute.""" - if self.__lazymodule_loaded: - raise AttributeError(name) - if _debug: - print( - "LazyModule: " - "Module load triggered by attribute %r read access" % name - ) - module = self.__lazymodule_import() - return getattr(module, name) - - def __setattr__(self, name, value): - - """Import the module on demand and set the attribute.""" - if not self.__lazymodule_init: - self.__dict__[name] = value - return - if self.__lazymodule_loaded: - self.__lazymodule_locals[self.__lazymodule_name] = value - self.__dict__[name] = value - return - if _debug: - print( - "LazyModule: " - "Module load triggered by attribute %r write access" % name - ) - module = self.__lazymodule_import() - setattr(module, name, value) - - def __repr__(self): - return "" % self.__name__ diff --git a/pipeline/nltk/lm/__init__.py b/pipeline/nltk/lm/__init__.py deleted file mode 100644 index 11d31b9a6aeded7e96f7db4395801af082a25737..0000000000000000000000000000000000000000 --- a/pipeline/nltk/lm/__init__.py +++ /dev/null @@ -1,235 +0,0 @@ -# Natural Language Toolkit: Language Models -# -# Copyright (C) 2001-2023 NLTK Project -# Authors: Ilia Kurenkov -# URL: >> text = [['a', 'b', 'c'], ['a', 'c', 'd', 'c', 'e', 'f']] - -If we want to train a bigram model, we need to turn this text into bigrams. -Here's what the first sentence of our text would look like if we use a function -from NLTK for this. - - >>> from nltk.util import bigrams - >>> list(bigrams(text[0])) - [('a', 'b'), ('b', 'c')] - -Notice how "b" occurs both as the first and second member of different bigrams -but "a" and "c" don't? Wouldn't it be nice to somehow indicate how often sentences -start with "a" and end with "c"? -A standard way to deal with this is to add special "padding" symbols to the -sentence before splitting it into ngrams. -Fortunately, NLTK also has a function for that, let's see what it does to the -first sentence. - - >>> from nltk.util import pad_sequence - >>> list(pad_sequence(text[0], - ... pad_left=True, - ... left_pad_symbol="", - ... pad_right=True, - ... right_pad_symbol="", - ... n=2)) - ['', 'a', 'b', 'c', ''] - -Note the `n` argument, that tells the function we need padding for bigrams. 
-Now, passing all these parameters every time is tedious and in most cases they -can be safely assumed as defaults anyway. -Thus our module provides a convenience function that has all these arguments -already set while the other arguments remain the same as for `pad_sequence`. - - >>> from nltk.lm.preprocessing import pad_both_ends - >>> list(pad_both_ends(text[0], n=2)) - ['', 'a', 'b', 'c', ''] - -Combining the two parts discussed so far we get the following preparation steps -for one sentence. - - >>> list(bigrams(pad_both_ends(text[0], n=2))) - [('', 'a'), ('a', 'b'), ('b', 'c'), ('c', '')] - -To make our model more robust we could also train it on unigrams (single words) -as well as bigrams, its main source of information. -NLTK once again helpfully provides a function called `everygrams`. -While not the most efficient, it is conceptually simple. - - - >>> from nltk.util import everygrams - >>> padded_bigrams = list(pad_both_ends(text[0], n=2)) - >>> list(everygrams(padded_bigrams, max_len=2)) - [('',), ('', 'a'), ('a',), ('a', 'b'), ('b',), ('b', 'c'), ('c',), ('c', ''), ('',)] - -We are almost ready to start counting ngrams, just one more step left. -During training and evaluation our model will rely on a vocabulary that -defines which words are "known" to the model. -To create this vocabulary we need to pad our sentences (just like for counting -ngrams) and then combine the sentences into one flat stream of words. - - >>> from nltk.lm.preprocessing import flatten - >>> list(flatten(pad_both_ends(sent, n=2) for sent in text)) - ['', 'a', 'b', 'c', '', '', 'a', 'c', 'd', 'c', 'e', 'f', ''] - -In most cases we want to use the same text as the source for both vocabulary -and ngram counts. -Now that we understand what this means for our preprocessing, we can simply import -a function that does everything for us. - - >>> from nltk.lm.preprocessing import padded_everygram_pipeline - >>> train, vocab = padded_everygram_pipeline(2, text) - -So as to avoid re-creating the text in memory, both `train` and `vocab` are lazy -iterators. They are evaluated on demand at training time. - - -Training -======== -Having prepared our data we are ready to start training a model. -As a simple example, let us train a Maximum Likelihood Estimator (MLE). -We only need to specify the highest ngram order to instantiate it. - - >>> from nltk.lm import MLE - >>> lm = MLE(2) - -This automatically creates an empty vocabulary... - - >>> len(lm.vocab) - 0 - -... which gets filled as we fit the model. - - >>> lm.fit(train, vocab) - >>> print(lm.vocab) - - >>> len(lm.vocab) - 9 - -The vocabulary helps us handle words that have not occurred during training. - - >>> lm.vocab.lookup(text[0]) - ('a', 'b', 'c') - >>> lm.vocab.lookup(["aliens", "from", "Mars"]) - ('', '', '') - -Moreover, in some cases we want to ignore words that we did see during training -but that didn't occur frequently enough, to provide us useful information. -You can tell the vocabulary to ignore such words. -To find out how that works, check out the docs for the `Vocabulary` class. - - -Using a Trained Model -===================== -When it comes to ngram models the training boils down to counting up the ngrams -from the training corpus. - - >>> print(lm.counts) - - -This provides a convenient interface to access counts for unigrams... - - >>> lm.counts['a'] - 2 - -...and bigrams (in this case "a b") - - >>> lm.counts[['a']]['b'] - 1 - -And so on. 
However, the real purpose of training a language model is to have it -score how probable words are in certain contexts. -This being MLE, the model returns the item's relative frequency as its score. - - >>> lm.score("a") - 0.15384615384615385 - -Items that are not seen during training are mapped to the vocabulary's -"unknown label" token. This is "" by default. - - >>> lm.score("") == lm.score("aliens") - True - -Here's how you get the score for a word given some preceding context. -For example we want to know what is the chance that "b" is preceded by "a". - - >>> lm.score("b", ["a"]) - 0.5 - -To avoid underflow when working with many small score values it makes sense to -take their logarithm. -For convenience this can be done with the `logscore` method. - - >>> lm.logscore("a") - -2.700439718141092 - -Building on this method, we can also evaluate our model's cross-entropy and -perplexity with respect to sequences of ngrams. - - >>> test = [('a', 'b'), ('c', 'd')] - >>> lm.entropy(test) - 1.292481250360578 - >>> lm.perplexity(test) - 2.449489742783178 - -It is advisable to preprocess your test text exactly the same way as you did -the training text. - -One cool feature of ngram models is that they can be used to generate text. - - >>> lm.generate(1, random_seed=3) - '' - >>> lm.generate(5, random_seed=3) - ['', 'a', 'b', 'c', 'd'] - -Provide `random_seed` if you want to consistently reproduce the same text all -other things being equal. Here we are using it to test the examples. - -You can also condition your generation on some preceding text with the `context` -argument. - - >>> lm.generate(5, text_seed=['c'], random_seed=3) - ['', 'c', 'd', 'c', 'd'] - -Note that an ngram model is restricted in how much preceding context it can -take into account. For example, a trigram model can only condition its output -on 2 preceding words. If you pass in a 4-word context, the first two words -will be ignored. 
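(Editorial aside, not part of the original NLTK docstring.) A minimal standalone sketch of that context truncation, mirroring the slicing that `LanguageModel.generate` performs further down in this diff; the concrete seed words here are made up purely for illustration:

```python
# Mirrors the context truncation done inside LanguageModel.generate (illustrative values only).
order = 2                              # a bigram model keeps order - 1 = 1 seed word
text_seed = ["a", "b", "c", "d"]       # hypothetical 4-word seed
context = text_seed[-order + 1:] if len(text_seed) >= order else text_seed
print(context)                         # ['d']  -- the earlier seed words are ignored
```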
-""" - -from nltk.lm.counter import NgramCounter -from nltk.lm.models import ( - MLE, - AbsoluteDiscountingInterpolated, - KneserNeyInterpolated, - Laplace, - Lidstone, - StupidBackoff, - WittenBellInterpolated, -) -from nltk.lm.vocabulary import Vocabulary - -__all__ = [ - "Vocabulary", - "NgramCounter", - "MLE", - "Lidstone", - "Laplace", - "WittenBellInterpolated", - "KneserNeyInterpolated", - "AbsoluteDiscountingInterpolated", - "StupidBackoff", -] diff --git a/pipeline/nltk/lm/__pycache__/__init__.cpython-39.pyc b/pipeline/nltk/lm/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index 95335049195e5701d46b0f3b20f4d8a942dea1ce..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/lm/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/lm/__pycache__/api.cpython-39.pyc b/pipeline/nltk/lm/__pycache__/api.cpython-39.pyc deleted file mode 100644 index bd79cbac7010a0678c43157059b532d6dde2ea3b..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/lm/__pycache__/api.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/lm/__pycache__/counter.cpython-39.pyc b/pipeline/nltk/lm/__pycache__/counter.cpython-39.pyc deleted file mode 100644 index c061d5d9329164b210b42b48f43c92b47159bb68..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/lm/__pycache__/counter.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/lm/__pycache__/models.cpython-39.pyc b/pipeline/nltk/lm/__pycache__/models.cpython-39.pyc deleted file mode 100644 index 243265e5ebec316b3e5e4b563cf7b4d984ef0253..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/lm/__pycache__/models.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/lm/__pycache__/preprocessing.cpython-39.pyc b/pipeline/nltk/lm/__pycache__/preprocessing.cpython-39.pyc deleted file mode 100644 index b284724d4abac0dd0bb88b7994ed5d07faf9b478..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/lm/__pycache__/preprocessing.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/lm/__pycache__/smoothing.cpython-39.pyc b/pipeline/nltk/lm/__pycache__/smoothing.cpython-39.pyc deleted file mode 100644 index f5ebed0c51a87c60d9493b9f941a55c8546135ce..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/lm/__pycache__/smoothing.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/lm/__pycache__/util.cpython-39.pyc b/pipeline/nltk/lm/__pycache__/util.cpython-39.pyc deleted file mode 100644 index b7ea8d17070ed38f7d328ab2adbe1181ee551907..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/lm/__pycache__/util.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/lm/__pycache__/vocabulary.cpython-39.pyc b/pipeline/nltk/lm/__pycache__/vocabulary.cpython-39.pyc deleted file mode 100644 index 99129b2e31edb79209b46d9abafbfc271915c940..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/lm/__pycache__/vocabulary.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/lm/api.py b/pipeline/nltk/lm/api.py deleted file mode 100644 index 470c4d4ac4c495c4cf9b7bbb3af66cb2a49a02db..0000000000000000000000000000000000000000 --- a/pipeline/nltk/lm/api.py +++ /dev/null @@ -1,235 +0,0 @@ -# Natural Language Toolkit: Language Models -# -# Copyright (C) 2001-2023 NLTK Project -# Authors: Ilia Kurenkov -# URL: -# For license information, see LICENSE.TXT -"""Language Model Interface.""" - -import random -import warnings -from abc import ABCMeta, abstractmethod 
-from bisect import bisect -from itertools import accumulate - -from nltk.lm.counter import NgramCounter -from nltk.lm.util import log_base2 -from nltk.lm.vocabulary import Vocabulary - - -class Smoothing(metaclass=ABCMeta): - """Ngram Smoothing Interface - - Implements Chen & Goodman 1995's idea that all smoothing algorithms have - certain features in common. This should ideally allow smoothing algorithms to - work both with Backoff and Interpolation. - """ - - def __init__(self, vocabulary, counter): - """ - :param vocabulary: The Ngram vocabulary object. - :type vocabulary: nltk.lm.vocab.Vocabulary - :param counter: The counts of the vocabulary items. - :type counter: nltk.lm.counter.NgramCounter - """ - self.vocab = vocabulary - self.counts = counter - - @abstractmethod - def unigram_score(self, word): - raise NotImplementedError() - - @abstractmethod - def alpha_gamma(self, word, context): - raise NotImplementedError() - - -def _mean(items): - """Return average (aka mean) for sequence of items.""" - return sum(items) / len(items) - - -def _random_generator(seed_or_generator): - if isinstance(seed_or_generator, random.Random): - return seed_or_generator - return random.Random(seed_or_generator) - - -def _weighted_choice(population, weights, random_generator=None): - """Like random.choice, but with weights. - - Heavily inspired by python 3.6 `random.choices`. - """ - if not population: - raise ValueError("Can't choose from empty population") - if len(population) != len(weights): - raise ValueError("The number of weights does not match the population") - cum_weights = list(accumulate(weights)) - total = cum_weights[-1] - threshold = random_generator.random() - return population[bisect(cum_weights, total * threshold)] - - -class LanguageModel(metaclass=ABCMeta): - """ABC for Language Models. - - Cannot be directly instantiated itself. - - """ - - def __init__(self, order, vocabulary=None, counter=None): - """Creates new LanguageModel. - - :param vocabulary: If provided, this vocabulary will be used instead - of creating a new one when training. - :type vocabulary: `nltk.lm.Vocabulary` or None - :param counter: If provided, use this object to count ngrams. - :type counter: `nltk.lm.NgramCounter` or None - :param ngrams_fn: If given, defines how sentences in training text are turned to ngram - sequences. - :type ngrams_fn: function or None - :param pad_fn: If given, defines how sentences in training text are padded. - :type pad_fn: function or None - """ - self.order = order - if vocabulary and not isinstance(vocabulary, Vocabulary): - warnings.warn( - f"The `vocabulary` argument passed to {self.__class__.__name__!r} " - "must be an instance of `nltk.lm.Vocabulary`.", - stacklevel=3, - ) - self.vocab = Vocabulary() if vocabulary is None else vocabulary - self.counts = NgramCounter() if counter is None else counter - - def fit(self, text, vocabulary_text=None): - """Trains the model on a text. - - :param text: Training text as a sequence of sentences. - - """ - if not self.vocab: - if vocabulary_text is None: - raise ValueError( - "Cannot fit without a vocabulary or text to create it from." - ) - self.vocab.update(vocabulary_text) - self.counts.update(self.vocab.lookup(sent) for sent in text) - - def score(self, word, context=None): - """Masks out of vocab (OOV) words and computes their model score. - - For model-specific logic of calculating scores, see the `unmasked_score` - method. 
- """ - return self.unmasked_score( - self.vocab.lookup(word), self.vocab.lookup(context) if context else None - ) - - @abstractmethod - def unmasked_score(self, word, context=None): - """Score a word given some optional context. - - Concrete models are expected to provide an implementation. - Note that this method does not mask its arguments with the OOV label. - Use the `score` method for that. - - :param str word: Word for which we want the score - :param tuple(str) context: Context the word is in. - If `None`, compute unigram score. - :param context: tuple(str) or None - :rtype: float - """ - raise NotImplementedError() - - def logscore(self, word, context=None): - """Evaluate the log score of this word in this context. - - The arguments are the same as for `score` and `unmasked_score`. - - """ - return log_base2(self.score(word, context)) - - def context_counts(self, context): - """Helper method for retrieving counts for a given context. - - Assumes context has been checked and oov words in it masked. - :type context: tuple(str) or None - - """ - return ( - self.counts[len(context) + 1][context] if context else self.counts.unigrams - ) - - def entropy(self, text_ngrams): - """Calculate cross-entropy of model for given evaluation text. - - :param Iterable(tuple(str)) text_ngrams: A sequence of ngram tuples. - :rtype: float - - """ - return -1 * _mean( - [self.logscore(ngram[-1], ngram[:-1]) for ngram in text_ngrams] - ) - - def perplexity(self, text_ngrams): - """Calculates the perplexity of the given text. - - This is simply 2 ** cross-entropy for the text, so the arguments are the same. - - """ - return pow(2.0, self.entropy(text_ngrams)) - - def generate(self, num_words=1, text_seed=None, random_seed=None): - """Generate words from the model. - - :param int num_words: How many words to generate. By default 1. - :param text_seed: Generation can be conditioned on preceding context. - :param random_seed: A random seed or an instance of `random.Random`. If provided, - makes the random sampling part of generation reproducible. - :return: One (str) word or a list of words generated from model. - - Examples: - - >>> from nltk.lm import MLE - >>> lm = MLE(2) - >>> lm.fit([[("a", "b"), ("b", "c")]], vocabulary_text=['a', 'b', 'c']) - >>> lm.fit([[("a",), ("b",), ("c",)]]) - >>> lm.generate(random_seed=3) - 'a' - >>> lm.generate(text_seed=['a']) - 'b' - - """ - text_seed = [] if text_seed is None else list(text_seed) - random_generator = _random_generator(random_seed) - # This is the base recursion case. - if num_words == 1: - context = ( - text_seed[-self.order + 1 :] - if len(text_seed) >= self.order - else text_seed - ) - samples = self.context_counts(self.vocab.lookup(context)) - while context and not samples: - context = context[1:] if len(context) > 1 else [] - samples = self.context_counts(self.vocab.lookup(context)) - # Sorting samples achieves two things: - # - reproducible randomness when sampling - # - turns Mapping into Sequence which `_weighted_choice` expects - samples = sorted(samples) - return _weighted_choice( - samples, - tuple(self.score(w, context) for w in samples), - random_generator, - ) - # We build up text one word at a time using the preceding context. 
- generated = [] - for _ in range(num_words): - generated.append( - self.generate( - num_words=1, - text_seed=text_seed + generated, - random_seed=random_generator, - ) - ) - return generated diff --git a/pipeline/nltk/lm/counter.py b/pipeline/nltk/lm/counter.py deleted file mode 100644 index 6a5ab9c6096d9015b14f4e9f3814dda75417391a..0000000000000000000000000000000000000000 --- a/pipeline/nltk/lm/counter.py +++ /dev/null @@ -1,163 +0,0 @@ -# Natural Language Toolkit -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Ilia Kurenkov -# URL: -# For license information, see LICENSE.TXT -""" -Language Model Counter ----------------------- -""" - -from collections import defaultdict -from collections.abc import Sequence - -from nltk.probability import ConditionalFreqDist, FreqDist - - -class NgramCounter: - """Class for counting ngrams. - - Will count any ngram sequence you give it ;) - - First we need to make sure we are feeding the counter sentences of ngrams. - - >>> text = [["a", "b", "c", "d"], ["a", "c", "d", "c"]] - >>> from nltk.util import ngrams - >>> text_bigrams = [ngrams(sent, 2) for sent in text] - >>> text_unigrams = [ngrams(sent, 1) for sent in text] - - The counting itself is very simple. - - >>> from nltk.lm import NgramCounter - >>> ngram_counts = NgramCounter(text_bigrams + text_unigrams) - - You can conveniently access ngram counts using standard python dictionary notation. - String keys will give you unigram counts. - - >>> ngram_counts['a'] - 2 - >>> ngram_counts['aliens'] - 0 - - If you want to access counts for higher order ngrams, use a list or a tuple. - These are treated as "context" keys, so what you get is a frequency distribution - over all continuations after the given context. - - >>> sorted(ngram_counts[['a']].items()) - [('b', 1), ('c', 1)] - >>> sorted(ngram_counts[('a',)].items()) - [('b', 1), ('c', 1)] - - This is equivalent to specifying explicitly the order of the ngram (in this case - 2 for bigram) and indexing on the context. - - >>> ngram_counts[2][('a',)] is ngram_counts[['a']] - True - - Note that the keys in `ConditionalFreqDist` cannot be lists, only tuples! - It is generally advisable to use the less verbose and more flexible square - bracket notation. - - To get the count of the full ngram "a b", do this: - - >>> ngram_counts[['a']]['b'] - 1 - - Specifying the ngram order as a number can be useful for accessing all ngrams - in that order. - - >>> ngram_counts[2] - - - The keys of this `ConditionalFreqDist` are the contexts we discussed earlier. - Unigrams can also be accessed with a human-friendly alias. - - >>> ngram_counts.unigrams is ngram_counts[1] - True - - Similarly to `collections.Counter`, you can update counts after initialization. - - >>> ngram_counts['e'] - 0 - >>> ngram_counts.update([ngrams(["d", "e", "f"], 1)]) - >>> ngram_counts['e'] - 1 - - """ - - def __init__(self, ngram_text=None): - """Creates a new NgramCounter. - - If `ngram_text` is specified, counts ngrams from it, otherwise waits for - `update` method to be called explicitly. - - :param ngram_text: Optional text containing sentences of ngrams, as for `update` method. - :type ngram_text: Iterable(Iterable(tuple(str))) or None - - """ - self._counts = defaultdict(ConditionalFreqDist) - self._counts[1] = self.unigrams = FreqDist() - - if ngram_text: - self.update(ngram_text) - - def update(self, ngram_text): - """Updates ngram counts from `ngram_text`. - - Expects `ngram_text` to be a sequence of sentences (sequences). 
- Each sentence consists of ngrams as tuples of strings. - - :param Iterable(Iterable(tuple(str))) ngram_text: Text containing sentences of ngrams. - :raises TypeError: if the ngrams are not tuples. - - """ - - for sent in ngram_text: - for ngram in sent: - if not isinstance(ngram, tuple): - raise TypeError( - "Ngram <{}> isn't a tuple, " "but {}".format(ngram, type(ngram)) - ) - - ngram_order = len(ngram) - if ngram_order == 1: - self.unigrams[ngram[0]] += 1 - continue - - context, word = ngram[:-1], ngram[-1] - self[ngram_order][context][word] += 1 - - def N(self): - """Returns grand total number of ngrams stored. - - This includes ngrams from all orders, so some duplication is expected. - :rtype: int - - >>> from nltk.lm import NgramCounter - >>> counts = NgramCounter([[("a", "b"), ("c",), ("d", "e")]]) - >>> counts.N() - 3 - - """ - return sum(val.N() for val in self._counts.values()) - - def __getitem__(self, item): - """User-friendly access to ngram counts.""" - if isinstance(item, int): - return self._counts[item] - elif isinstance(item, str): - return self._counts.__getitem__(1)[item] - elif isinstance(item, Sequence): - return self._counts.__getitem__(len(item) + 1)[tuple(item)] - - def __str__(self): - return "<{} with {} ngram orders and {} ngrams>".format( - self.__class__.__name__, len(self._counts), self.N() - ) - - def __len__(self): - return self._counts.__len__() - - def __contains__(self, item): - return item in self._counts diff --git a/pipeline/nltk/lm/models.py b/pipeline/nltk/lm/models.py deleted file mode 100644 index ee5094901a14802b835c934e0054762666cd467f..0000000000000000000000000000000000000000 --- a/pipeline/nltk/lm/models.py +++ /dev/null @@ -1,141 +0,0 @@ -# Natural Language Toolkit: Language Models -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Ilia Kurenkov -# Manu Joseph -# URL: -# For license information, see LICENSE.TXT -"""Language Models""" - -from nltk.lm.api import LanguageModel, Smoothing -from nltk.lm.smoothing import AbsoluteDiscounting, KneserNey, WittenBell - - -class MLE(LanguageModel): - """Class for providing MLE ngram model scores. - - Inherits initialization from BaseNgramModel. - """ - - def unmasked_score(self, word, context=None): - """Returns the MLE score for a word given a context. - - Args: - - word is expected to be a string - - context is expected to be something reasonably convertible to a tuple - """ - return self.context_counts(context).freq(word) - - -class Lidstone(LanguageModel): - """Provides Lidstone-smoothed scores. - - In addition to initialization arguments from BaseNgramModel also requires - a number by which to increase the counts, gamma. - """ - - def __init__(self, gamma, *args, **kwargs): - super().__init__(*args, **kwargs) - self.gamma = gamma - - def unmasked_score(self, word, context=None): - """Add-one smoothing: Lidstone or Laplace. - - To see what kind, look at `gamma` attribute on the class. - - """ - counts = self.context_counts(context) - word_count = counts[word] - norm_count = counts.N() - return (word_count + self.gamma) / (norm_count + len(self.vocab) * self.gamma) - - -class Laplace(Lidstone): - """Implements Laplace (add one) smoothing. - - Initialization identical to BaseNgramModel because gamma is always 1. - """ - - def __init__(self, *args, **kwargs): - super().__init__(1, *args, **kwargs) - - -class StupidBackoff(LanguageModel): - """Provides StupidBackoff scores. 
- - In addition to initialization arguments from BaseNgramModel also requires - a parameter alpha with which we scale the lower order probabilities. - Note that this is not a true probability distribution as scores for ngrams - of the same order do not sum up to unity. - """ - - def __init__(self, alpha=0.4, *args, **kwargs): - super().__init__(*args, **kwargs) - self.alpha = alpha - - def unmasked_score(self, word, context=None): - if not context: - # Base recursion - return self.counts.unigrams.freq(word) - counts = self.context_counts(context) - word_count = counts[word] - norm_count = counts.N() - if word_count > 0: - return word_count / norm_count - else: - return self.alpha * self.unmasked_score(word, context[1:]) - - -class InterpolatedLanguageModel(LanguageModel): - """Logic common to all interpolated language models. - - The idea to abstract this comes from Chen & Goodman 1995. - Do not instantiate this class directly! - """ - - def __init__(self, smoothing_cls, order, **kwargs): - params = kwargs.pop("params", {}) - super().__init__(order, **kwargs) - self.estimator = smoothing_cls(self.vocab, self.counts, **params) - - def unmasked_score(self, word, context=None): - if not context: - # The base recursion case: no context, we only have a unigram. - return self.estimator.unigram_score(word) - if not self.counts[context]: - # It can also happen that we have no data for this context. - # In that case we defer to the lower-order ngram. - # This is the same as setting alpha to 0 and gamma to 1. - alpha, gamma = 0, 1 - else: - alpha, gamma = self.estimator.alpha_gamma(word, context) - return alpha + gamma * self.unmasked_score(word, context[1:]) - - -class WittenBellInterpolated(InterpolatedLanguageModel): - """Interpolated version of Witten-Bell smoothing.""" - - def __init__(self, order, **kwargs): - super().__init__(WittenBell, order, **kwargs) - - -class AbsoluteDiscountingInterpolated(InterpolatedLanguageModel): - """Interpolated version of smoothing with absolute discount.""" - - def __init__(self, order, discount=0.75, **kwargs): - super().__init__( - AbsoluteDiscounting, order, params={"discount": discount}, **kwargs - ) - - -class KneserNeyInterpolated(InterpolatedLanguageModel): - """Interpolated version of Kneser-Ney smoothing.""" - - def __init__(self, order, discount=0.1, **kwargs): - if not (0 <= discount <= 1): - raise ValueError( - "Discount must be between 0 and 1 for probabilities to sum to unity." - ) - super().__init__( - KneserNey, order, params={"discount": discount, "order": order}, **kwargs - ) diff --git a/pipeline/nltk/lm/preprocessing.py b/pipeline/nltk/lm/preprocessing.py deleted file mode 100644 index 9ba6d5bd2cfb59d479b203ebf99878024b2a0f76..0000000000000000000000000000000000000000 --- a/pipeline/nltk/lm/preprocessing.py +++ /dev/null @@ -1,51 +0,0 @@ -# Natural Language Toolkit: Language Model Unit Tests -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Ilia Kurenkov -# URL: -# For license information, see LICENSE.TXT -from functools import partial -from itertools import chain - -from nltk.util import everygrams, pad_sequence - -flatten = chain.from_iterable -pad_both_ends = partial( - pad_sequence, - pad_left=True, - left_pad_symbol="", - pad_right=True, - right_pad_symbol="", -) -pad_both_ends.__doc__ = """Pads both ends of a sentence to length specified by ngram order. - - Following convention pads the start of sentence pads its end. - """ - - -def padded_everygrams(order, sentence): - """Helper with some useful defaults. 
- - Applies pad_both_ends to sentence and follows it up with everygrams. - """ - return everygrams(list(pad_both_ends(sentence, n=order)), max_len=order) - - -def padded_everygram_pipeline(order, text): - """Default preprocessing for a sequence of sentences. - - Creates two iterators: - - - sentences padded and turned into sequences of `nltk.util.everygrams` - - sentences padded as above and chained together for a flat stream of words - - :param order: Largest ngram length produced by `everygrams`. - :param text: Text to iterate over. Expected to be an iterable of sentences. - :type text: Iterable[Iterable[str]] - :return: iterator over text as ngrams, iterator over text as vocabulary data - """ - padding_fn = partial(pad_both_ends, n=order) - return ( - (everygrams(list(padding_fn(sent)), max_len=order) for sent in text), - flatten(map(padding_fn, text)), - ) diff --git a/pipeline/nltk/lm/smoothing.py b/pipeline/nltk/lm/smoothing.py deleted file mode 100644 index 6761f1ead23f7ab5b410d9a5795b8b4fce189d2b..0000000000000000000000000000000000000000 --- a/pipeline/nltk/lm/smoothing.py +++ /dev/null @@ -1,127 +0,0 @@ -# Natural Language Toolkit: Language Model Unit Tests -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Ilia Kurenkov -# Manu Joseph -# URL: -# For license information, see LICENSE.TXT -"""Smoothing algorithms for language modeling. - -According to Chen & Goodman 1995 these should work with both Backoff and -Interpolation. -""" -from operator import methodcaller - -from nltk.lm.api import Smoothing -from nltk.probability import ConditionalFreqDist - - -def _count_values_gt_zero(distribution): - """Count values that are greater than zero in a distribution. - - Assumes distribution is either a mapping with counts as values or - an instance of `nltk.ConditionalFreqDist`. - """ - as_count = ( - methodcaller("N") - if isinstance(distribution, ConditionalFreqDist) - else lambda count: count - ) - # We explicitly check that values are > 0 to guard against negative counts. - return sum( - 1 for dist_or_count in distribution.values() if as_count(dist_or_count) > 0 - ) - - -class WittenBell(Smoothing): - """Witten-Bell smoothing.""" - - def __init__(self, vocabulary, counter, **kwargs): - super().__init__(vocabulary, counter, **kwargs) - - def alpha_gamma(self, word, context): - alpha = self.counts[context].freq(word) - gamma = self._gamma(context) - return (1.0 - gamma) * alpha, gamma - - def _gamma(self, context): - n_plus = _count_values_gt_zero(self.counts[context]) - return n_plus / (n_plus + self.counts[context].N()) - - def unigram_score(self, word): - return self.counts.unigrams.freq(word) - - -class AbsoluteDiscounting(Smoothing): - """Smoothing with absolute discount.""" - - def __init__(self, vocabulary, counter, discount=0.75, **kwargs): - super().__init__(vocabulary, counter, **kwargs) - self.discount = discount - - def alpha_gamma(self, word, context): - alpha = ( - max(self.counts[context][word] - self.discount, 0) - / self.counts[context].N() - ) - gamma = self._gamma(context) - return alpha, gamma - - def _gamma(self, context): - n_plus = _count_values_gt_zero(self.counts[context]) - return (self.discount * n_plus) / self.counts[context].N() - - def unigram_score(self, word): - return self.counts.unigrams.freq(word) - - -class KneserNey(Smoothing): - """Kneser-Ney Smoothing. - - This is an extension of smoothing with a discount. 
- - Resources: - - https://pages.ucsd.edu/~rlevy/lign256/winter2008/kneser_ney_mini_example.pdf - - https://www.youtube.com/watch?v=ody1ysUTD7o - - https://medium.com/@dennyc/a-simple-numerical-example-for-kneser-ney-smoothing-nlp-4600addf38b8 - - https://www.cl.uni-heidelberg.de/courses/ss15/smt/scribe6.pdf - - https://www-i6.informatik.rwth-aachen.de/publications/download/951/Kneser-ICASSP-1995.pdf - """ - - def __init__(self, vocabulary, counter, order, discount=0.1, **kwargs): - super().__init__(vocabulary, counter, **kwargs) - self.discount = discount - self._order = order - - def unigram_score(self, word): - word_continuation_count, total_count = self._continuation_counts(word) - return word_continuation_count / total_count - - def alpha_gamma(self, word, context): - prefix_counts = self.counts[context] - word_continuation_count, total_count = ( - (prefix_counts[word], prefix_counts.N()) - if len(context) + 1 == self._order - else self._continuation_counts(word, context) - ) - alpha = max(word_continuation_count - self.discount, 0.0) / total_count - gamma = self.discount * _count_values_gt_zero(prefix_counts) / total_count - return alpha, gamma - - def _continuation_counts(self, word, context=tuple()): - """Count continuations that end with context and word. - - Continuations track unique ngram "types", regardless of how many - instances were observed for each "type". - This is different than raw ngram counts which track number of instances. - """ - higher_order_ngrams_with_context = ( - counts - for prefix_ngram, counts in self.counts[len(context) + 2].items() - if prefix_ngram[1:] == context - ) - higher_order_ngrams_with_word_count, total = 0, 0 - for counts in higher_order_ngrams_with_context: - higher_order_ngrams_with_word_count += int(counts[word] > 0) - total += _count_values_gt_zero(counts) - return higher_order_ngrams_with_word_count, total diff --git a/pipeline/nltk/lm/util.py b/pipeline/nltk/lm/util.py deleted file mode 100644 index 483e64c26abd85ad9bb6caf74d3bb38fd9ae7d66..0000000000000000000000000000000000000000 --- a/pipeline/nltk/lm/util.py +++ /dev/null @@ -1,19 +0,0 @@ -# Natural Language Toolkit -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Ilia Kurenkov -# URL: -# For license information, see LICENSE.TXT -"""Language Model Utilities""" - -from math import log - -NEG_INF = float("-inf") -POS_INF = float("inf") - - -def log_base2(score): - """Convenience function for computing logarithms with base 2.""" - if score == 0.0: - return NEG_INF - return log(score, 2) diff --git a/pipeline/nltk/lm/vocabulary.py b/pipeline/nltk/lm/vocabulary.py deleted file mode 100644 index 74964b262f78fe305cd9e5445d833683ed172978..0000000000000000000000000000000000000000 --- a/pipeline/nltk/lm/vocabulary.py +++ /dev/null @@ -1,218 +0,0 @@ -# Natural Language Toolkit -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Ilia Kurenkov -# URL: -# For license information, see LICENSE.TXT -"""Language Model Vocabulary""" - -import sys -from collections import Counter -from collections.abc import Iterable -from functools import singledispatch -from itertools import chain - - -@singledispatch -def _dispatched_lookup(words, vocab): - raise TypeError(f"Unsupported type for looking up in vocabulary: {type(words)}") - - -@_dispatched_lookup.register(Iterable) -def _(words, vocab): - """Look up a sequence of words in the vocabulary. - - Returns an iterator over looked up words. 
- - """ - return tuple(_dispatched_lookup(w, vocab) for w in words) - - -@_dispatched_lookup.register(str) -def _string_lookup(word, vocab): - """Looks up one word in the vocabulary.""" - return word if word in vocab else vocab.unk_label - - -class Vocabulary: - """Stores language model vocabulary. - - Satisfies two common language modeling requirements for a vocabulary: - - - When checking membership and calculating its size, filters items - by comparing their counts to a cutoff value. - - Adds a special "unknown" token which unseen words are mapped to. - - >>> words = ['a', 'c', '-', 'd', 'c', 'a', 'b', 'r', 'a', 'c', 'd'] - >>> from nltk.lm import Vocabulary - >>> vocab = Vocabulary(words, unk_cutoff=2) - - Tokens with counts greater than or equal to the cutoff value will - be considered part of the vocabulary. - - >>> vocab['c'] - 3 - >>> 'c' in vocab - True - >>> vocab['d'] - 2 - >>> 'd' in vocab - True - - Tokens with frequency counts less than the cutoff value will be considered not - part of the vocabulary even though their entries in the count dictionary are - preserved. - - >>> vocab['b'] - 1 - >>> 'b' in vocab - False - >>> vocab['aliens'] - 0 - >>> 'aliens' in vocab - False - - Keeping the count entries for seen words allows us to change the cutoff value - without having to recalculate the counts. - - >>> vocab2 = Vocabulary(vocab.counts, unk_cutoff=1) - >>> "b" in vocab2 - True - - The cutoff value influences not only membership checking but also the result of - getting the size of the vocabulary using the built-in `len`. - Note that while the number of keys in the vocabulary's counter stays the same, - the items in the vocabulary differ depending on the cutoff. - We use `sorted` to demonstrate because it keeps the order consistent. - - >>> sorted(vocab2.counts) - ['-', 'a', 'b', 'c', 'd', 'r'] - >>> sorted(vocab2) - ['-', '', 'a', 'b', 'c', 'd', 'r'] - >>> sorted(vocab.counts) - ['-', 'a', 'b', 'c', 'd', 'r'] - >>> sorted(vocab) - ['', 'a', 'c', 'd'] - - In addition to items it gets populated with, the vocabulary stores a special - token that stands in for so-called "unknown" items. By default it's "". - - >>> "" in vocab - True - - We can look up words in a vocabulary using its `lookup` method. - "Unseen" words (with counts less than cutoff) are looked up as the unknown label. - If given one word (a string) as an input, this method will return a string. - - >>> vocab.lookup("a") - 'a' - >>> vocab.lookup("aliens") - '' - - If given a sequence, it will return an tuple of the looked up words. - - >>> vocab.lookup(["p", 'a', 'r', 'd', 'b', 'c']) - ('', 'a', '', 'd', '', 'c') - - It's possible to update the counts after the vocabulary has been created. - In general, the interface is the same as that of `collections.Counter`. - - >>> vocab['b'] - 1 - >>> vocab.update(["b", "b", "c"]) - >>> vocab['b'] - 3 - """ - - def __init__(self, counts=None, unk_cutoff=1, unk_label=""): - """Create a new Vocabulary. - - :param counts: Optional iterable or `collections.Counter` instance to - pre-seed the Vocabulary. In case it is iterable, counts - are calculated. - :param int unk_cutoff: Words that occur less frequently than this value - are not considered part of the vocabulary. - :param unk_label: Label for marking words not part of vocabulary. - - """ - self.unk_label = unk_label - if unk_cutoff < 1: - raise ValueError(f"Cutoff value cannot be less than 1. 
Got: {unk_cutoff}") - self._cutoff = unk_cutoff - - self.counts = Counter() - self.update(counts if counts is not None else "") - - @property - def cutoff(self): - """Cutoff value. - - Items with count below this value are not considered part of vocabulary. - - """ - return self._cutoff - - def update(self, *counter_args, **counter_kwargs): - """Update vocabulary counts. - - Wraps `collections.Counter.update` method. - - """ - self.counts.update(*counter_args, **counter_kwargs) - self._len = sum(1 for _ in self) - - def lookup(self, words): - """Look up one or more words in the vocabulary. - - If passed one word as a string will return that word or `self.unk_label`. - Otherwise will assume it was passed a sequence of words, will try to look - each of them up and return an iterator over the looked up words. - - :param words: Word(s) to look up. - :type words: Iterable(str) or str - :rtype: generator(str) or str - :raises: TypeError for types other than strings or iterables - - >>> from nltk.lm import Vocabulary - >>> vocab = Vocabulary(["a", "b", "c", "a", "b"], unk_cutoff=2) - >>> vocab.lookup("a") - 'a' - >>> vocab.lookup("aliens") - '' - >>> vocab.lookup(["a", "b", "c", ["x", "b"]]) - ('a', 'b', '', ('', 'b')) - - """ - return _dispatched_lookup(words, self) - - def __getitem__(self, item): - return self._cutoff if item == self.unk_label else self.counts[item] - - def __contains__(self, item): - """Only consider items with counts GE to cutoff as being in the - vocabulary.""" - return self[item] >= self.cutoff - - def __iter__(self): - """Building on membership check define how to iterate over - vocabulary.""" - return chain( - (item for item in self.counts if item in self), - [self.unk_label] if self.counts else [], - ) - - def __len__(self): - """Computing size of vocabulary reflects the cutoff.""" - return self._len - - def __eq__(self, other): - return ( - self.unk_label == other.unk_label - and self.cutoff == other.cutoff - and self.counts == other.counts - ) - - def __str__(self): - return "<{} with cutoff={} unk_label='{}' and {} items>".format( - self.__class__.__name__, self.cutoff, self.unk_label, len(self) - ) diff --git a/pipeline/nltk/metrics/__init__.py b/pipeline/nltk/metrics/__init__.py deleted file mode 100644 index ada17ef29e19763f8bc42d103436e7fa72d3cfd0..0000000000000000000000000000000000000000 --- a/pipeline/nltk/metrics/__init__.py +++ /dev/null @@ -1,51 +0,0 @@ -# Natural Language Toolkit: Metrics -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# Edward Loper -# URL: -# For license information, see LICENSE.TXT -# - -""" -NLTK Metrics - -Classes and methods for scoring processing modules. 
-""" - -from nltk.metrics.agreement import AnnotationTask -from nltk.metrics.aline import align -from nltk.metrics.association import ( - BigramAssocMeasures, - ContingencyMeasures, - NgramAssocMeasures, - QuadgramAssocMeasures, - TrigramAssocMeasures, -) -from nltk.metrics.confusionmatrix import ConfusionMatrix -from nltk.metrics.distance import ( - binary_distance, - custom_distance, - edit_distance, - edit_distance_align, - fractional_presence, - interval_distance, - jaccard_distance, - masi_distance, - presence, -) -from nltk.metrics.paice import Paice -from nltk.metrics.scores import ( - accuracy, - approxrand, - f_measure, - log_likelihood, - precision, - recall, -) -from nltk.metrics.segmentation import ghd, pk, windowdiff -from nltk.metrics.spearman import ( - ranks_from_scores, - ranks_from_sequence, - spearman_correlation, -) diff --git a/pipeline/nltk/metrics/__pycache__/__init__.cpython-39.pyc b/pipeline/nltk/metrics/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index 5dedfbb38061a7d1bf9cffa93ffd591d849d984c..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/metrics/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/metrics/__pycache__/agreement.cpython-39.pyc b/pipeline/nltk/metrics/__pycache__/agreement.cpython-39.pyc deleted file mode 100644 index db57520fe79bc71f9def8745f734682fb6965629..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/metrics/__pycache__/agreement.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/metrics/__pycache__/aline.cpython-39.pyc b/pipeline/nltk/metrics/__pycache__/aline.cpython-39.pyc deleted file mode 100644 index 807cddbf13f2d01e8dafc05664c9f2b84297ce82..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/metrics/__pycache__/aline.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/metrics/__pycache__/association.cpython-39.pyc b/pipeline/nltk/metrics/__pycache__/association.cpython-39.pyc deleted file mode 100644 index 65bc7cb975df1e841970c79ae4c47868157b96af..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/metrics/__pycache__/association.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/metrics/__pycache__/confusionmatrix.cpython-39.pyc b/pipeline/nltk/metrics/__pycache__/confusionmatrix.cpython-39.pyc deleted file mode 100644 index 67b5e97123d3ebc3ec170506f926214e7e9cc628..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/metrics/__pycache__/confusionmatrix.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/metrics/__pycache__/distance.cpython-39.pyc b/pipeline/nltk/metrics/__pycache__/distance.cpython-39.pyc deleted file mode 100644 index 523109614b7d14a9c31a2fcac3b43180f1465b69..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/metrics/__pycache__/distance.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/metrics/__pycache__/paice.cpython-39.pyc b/pipeline/nltk/metrics/__pycache__/paice.cpython-39.pyc deleted file mode 100644 index 5c6c59ffae75f4c53b2612a080f4f2399b7290d3..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/metrics/__pycache__/paice.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/metrics/__pycache__/scores.cpython-39.pyc b/pipeline/nltk/metrics/__pycache__/scores.cpython-39.pyc deleted file mode 100644 index ea8cfb4c1aa4f2ca3799495340ed3edef4755bfe..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/metrics/__pycache__/scores.cpython-39.pyc and 
/dev/null differ diff --git a/pipeline/nltk/metrics/__pycache__/segmentation.cpython-39.pyc b/pipeline/nltk/metrics/__pycache__/segmentation.cpython-39.pyc deleted file mode 100644 index 583f87724f48344c5a23c675286d626889677725..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/metrics/__pycache__/segmentation.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/metrics/__pycache__/spearman.cpython-39.pyc b/pipeline/nltk/metrics/__pycache__/spearman.cpython-39.pyc deleted file mode 100644 index a837f59b9b7ef6f269f45b733f5ceedbc8f1eba3..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/metrics/__pycache__/spearman.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/metrics/agreement.py b/pipeline/nltk/metrics/agreement.py deleted file mode 100644 index 69b1a39fe2017df3beef39fcdd57b9a73c6ac0f5..0000000000000000000000000000000000000000 --- a/pipeline/nltk/metrics/agreement.py +++ /dev/null @@ -1,465 +0,0 @@ -# Natural Language Toolkit: Agreement Metrics -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Tom Lippincott -# URL: -# For license information, see LICENSE.TXT -# - -""" -Implementations of inter-annotator agreement coefficients surveyed by Artstein -and Poesio (2007), Inter-Coder Agreement for Computational Linguistics. - -An agreement coefficient calculates the amount that annotators agreed on label -assignments beyond what is expected by chance. - -In defining the AnnotationTask class, we use naming conventions similar to the -paper's terminology. There are three types of objects in an annotation task: - - the coders (variables "c" and "C") - the items to be annotated (variables "i" and "I") - the potential categories to be assigned (variables "k" and "K") - -Additionally, it is often the case that we don't want to treat two different -labels as complete disagreement, and so the AnnotationTask constructor can also -take a distance metric as a final argument. Distance metrics are simply -functions that take two arguments, and return a value between 0.0 and 1.0 -indicating the distance between them. If not supplied, the default is binary -comparison between the arguments. - -The simplest way to initialize an AnnotationTask is with a list of triples, -each containing a coder's assignment for one object in the task: - - task = AnnotationTask(data=[('c1', '1', 'v1'),('c2', '1', 'v1'),...]) - -Note that the data list needs to contain the same number of triples for each -individual coder, containing category values for the same set of items. - -Alpha (Krippendorff 1980) -Kappa (Cohen 1960) -S (Bennet, Albert and Goldstein 1954) -Pi (Scott 1955) - - -TODO: Describe handling of multiple coders and missing data - -Expected results from the Artstein and Poesio survey paper: - - >>> from nltk.metrics.agreement import AnnotationTask - >>> import os.path - >>> t = AnnotationTask(data=[x.split() for x in open(os.path.join(os.path.dirname(__file__), "artstein_poesio_example.txt"))]) - >>> t.avg_Ao() - 0.88 - >>> round(t.pi(), 5) - 0.79953 - >>> round(t.S(), 2) - 0.82 - - This would have returned a wrong value (0.0) in @785fb79 as coders are in - the wrong order. Subsequently, all values for pi(), S(), and kappa() would - have been wrong as they are computed with avg_Ao(). - >>> t2 = AnnotationTask(data=[('b','1','stat'),('a','1','stat')]) - >>> t2.avg_Ao() - 1.0 - - The following, of course, also works. 
- >>> t3 = AnnotationTask(data=[('a','1','othr'),('b','1','othr')]) - >>> t3.avg_Ao() - 1.0 - -""" - -import logging -from itertools import groupby -from operator import itemgetter - -from nltk.internals import deprecated -from nltk.metrics.distance import binary_distance -from nltk.probability import ConditionalFreqDist, FreqDist - -log = logging.getLogger(__name__) - - -class AnnotationTask: - """Represents an annotation task, i.e. people assign labels to items. - - Notation tries to match notation in Artstein and Poesio (2007). - - In general, coders and items can be represented as any hashable object. - Integers, for example, are fine, though strings are more readable. - Labels must support the distance functions applied to them, so e.g. - a string-edit-distance makes no sense if your labels are integers, - whereas interval distance needs numeric values. A notable case of this - is the MASI metric, which requires Python sets. - """ - - def __init__(self, data=None, distance=binary_distance): - """Initialize an annotation task. - - The data argument can be None (to create an empty annotation task) or a sequence of 3-tuples, - each representing a coder's labeling of an item: - ``(coder,item,label)`` - - The distance argument is a function taking two arguments (labels) and producing a numerical distance. - The distance from a label to itself should be zero: - ``distance(l,l) = 0`` - """ - self.distance = distance - self.I = set() - self.K = set() - self.C = set() - self.data = [] - if data is not None: - self.load_array(data) - - def __str__(self): - return "\r\n".join( - map( - lambda x: "%s\t%s\t%s" - % (x["coder"], x["item"].replace("_", "\t"), ",".join(x["labels"])), - self.data, - ) - ) - - def load_array(self, array): - """Load an sequence of annotation results, appending to any data already loaded. - - The argument is a sequence of 3-tuples, each representing a coder's labeling of an item: - (coder,item,label) - """ - for coder, item, labels in array: - self.C.add(coder) - self.K.add(labels) - self.I.add(item) - self.data.append({"coder": coder, "labels": labels, "item": item}) - - def agr(self, cA, cB, i, data=None): - """Agreement between two coders on a given item""" - data = data or self.data - # cfedermann: we don't know what combination of coder/item will come - # first in x; to avoid StopIteration problems due to assuming an order - # cA,cB, we allow either for k1 and then look up the missing as k2. 
- k1 = next(x for x in data if x["coder"] in (cA, cB) and x["item"] == i) - if k1["coder"] == cA: - k2 = next(x for x in data if x["coder"] == cB and x["item"] == i) - else: - k2 = next(x for x in data if x["coder"] == cA and x["item"] == i) - - ret = 1.0 - float(self.distance(k1["labels"], k2["labels"])) - log.debug("Observed agreement between %s and %s on %s: %f", cA, cB, i, ret) - log.debug( - 'Distance between "%r" and "%r": %f', k1["labels"], k2["labels"], 1.0 - ret - ) - return ret - - def Nk(self, k): - return float(sum(1 for x in self.data if x["labels"] == k)) - - def Nik(self, i, k): - return float(sum(1 for x in self.data if x["item"] == i and x["labels"] == k)) - - def Nck(self, c, k): - return float(sum(1 for x in self.data if x["coder"] == c and x["labels"] == k)) - - @deprecated("Use Nk, Nik or Nck instead") - def N(self, k=None, i=None, c=None): - """Implements the "n-notation" used in Artstein and Poesio (2007)""" - if k is not None and i is None and c is None: - ret = self.Nk(k) - elif k is not None and i is not None and c is None: - ret = self.Nik(i, k) - elif k is not None and c is not None and i is None: - ret = self.Nck(c, k) - else: - raise ValueError( - f"You must pass either i or c, not both! (k={k!r},i={i!r},c={c!r})" - ) - log.debug("Count on N[%s,%s,%s]: %d", k, i, c, ret) - return ret - - def _grouped_data(self, field, data=None): - data = data or self.data - return groupby(sorted(data, key=itemgetter(field)), itemgetter(field)) - - def Ao(self, cA, cB): - """Observed agreement between two coders on all items.""" - data = self._grouped_data( - "item", (x for x in self.data if x["coder"] in (cA, cB)) - ) - ret = sum(self.agr(cA, cB, item, item_data) for item, item_data in data) / len( - self.I - ) - log.debug("Observed agreement between %s and %s: %f", cA, cB, ret) - return ret - - def _pairwise_average(self, function): - """ - Calculates the average of function results for each coder pair - """ - total = 0 - n = 0 - s = self.C.copy() - for cA in self.C: - s.remove(cA) - for cB in s: - total += function(cA, cB) - n += 1 - ret = total / n - return ret - - def avg_Ao(self): - """Average observed agreement across all coders and items.""" - ret = self._pairwise_average(self.Ao) - log.debug("Average observed agreement: %f", ret) - return ret - - def Do_Kw_pairwise(self, cA, cB, max_distance=1.0): - """The observed disagreement for the weighted kappa coefficient.""" - total = 0.0 - data = (x for x in self.data if x["coder"] in (cA, cB)) - for i, itemdata in self._grouped_data("item", data): - # we should have two items; distance doesn't care which comes first - total += self.distance(next(itemdata)["labels"], next(itemdata)["labels"]) - - ret = total / (len(self.I) * max_distance) - log.debug("Observed disagreement between %s and %s: %f", cA, cB, ret) - return ret - - def Do_Kw(self, max_distance=1.0): - """Averaged over all labelers""" - ret = self._pairwise_average( - lambda cA, cB: self.Do_Kw_pairwise(cA, cB, max_distance) - ) - log.debug("Observed disagreement: %f", ret) - return ret - - # Agreement Coefficients - def S(self): - """Bennett, Albert and Goldstein 1954""" - Ae = 1.0 / len(self.K) - ret = (self.avg_Ao() - Ae) / (1.0 - Ae) - return ret - - def pi(self): - """Scott 1955; here, multi-pi. - Equivalent to K from Siegel and Castellan (1988). 
- - """ - total = 0.0 - label_freqs = FreqDist(x["labels"] for x in self.data) - for k, f in label_freqs.items(): - total += f**2 - Ae = total / ((len(self.I) * len(self.C)) ** 2) - return (self.avg_Ao() - Ae) / (1 - Ae) - - def Ae_kappa(self, cA, cB): - Ae = 0.0 - nitems = float(len(self.I)) - label_freqs = ConditionalFreqDist((x["labels"], x["coder"]) for x in self.data) - for k in label_freqs.conditions(): - Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems) - return Ae - - def kappa_pairwise(self, cA, cB): - """ """ - Ae = self.Ae_kappa(cA, cB) - ret = (self.Ao(cA, cB) - Ae) / (1.0 - Ae) - log.debug("Expected agreement between %s and %s: %f", cA, cB, Ae) - return ret - - def kappa(self): - """Cohen 1960 - Averages naively over kappas for each coder pair. - - """ - return self._pairwise_average(self.kappa_pairwise) - - def multi_kappa(self): - """Davies and Fleiss 1982 - Averages over observed and expected agreements for each coder pair. - - """ - Ae = self._pairwise_average(self.Ae_kappa) - return (self.avg_Ao() - Ae) / (1.0 - Ae) - - def Disagreement(self, label_freqs): - total_labels = sum(label_freqs.values()) - pairs = 0.0 - for j, nj in label_freqs.items(): - for l, nl in label_freqs.items(): - pairs += float(nj * nl) * self.distance(l, j) - return 1.0 * pairs / (total_labels * (total_labels - 1)) - - def alpha(self): - """Krippendorff 1980""" - # check for degenerate cases - if len(self.K) == 0: - raise ValueError("Cannot calculate alpha, no data present!") - if len(self.K) == 1: - log.debug("Only one annotation value, alpha returning 1.") - return 1 - if len(self.C) == 1 and len(self.I) == 1: - raise ValueError("Cannot calculate alpha, only one coder and item present!") - - total_disagreement = 0.0 - total_ratings = 0 - all_valid_labels_freq = FreqDist([]) - - total_do = 0.0 # Total observed disagreement for all items. - for i, itemdata in self._grouped_data("item"): - label_freqs = FreqDist(x["labels"] for x in itemdata) - labels_count = sum(label_freqs.values()) - if labels_count < 2: - # Ignore the item. - continue - all_valid_labels_freq += label_freqs - total_do += self.Disagreement(label_freqs) * labels_count - - do = total_do / sum(all_valid_labels_freq.values()) - - de = self.Disagreement(all_valid_labels_freq) # Expected disagreement. 
- k_alpha = 1.0 - do / de - - return k_alpha - - def weighted_kappa_pairwise(self, cA, cB, max_distance=1.0): - """Cohen 1968""" - total = 0.0 - label_freqs = ConditionalFreqDist( - (x["coder"], x["labels"]) for x in self.data if x["coder"] in (cA, cB) - ) - for j in self.K: - for l in self.K: - total += label_freqs[cA][j] * label_freqs[cB][l] * self.distance(j, l) - De = total / (max_distance * pow(len(self.I), 2)) - log.debug("Expected disagreement between %s and %s: %f", cA, cB, De) - Do = self.Do_Kw_pairwise(cA, cB) - ret = 1.0 - (Do / De) - return ret - - def weighted_kappa(self, max_distance=1.0): - """Cohen 1968""" - return self._pairwise_average( - lambda cA, cB: self.weighted_kappa_pairwise(cA, cB, max_distance) - ) - - -if __name__ == "__main__": - - import optparse - import re - - from nltk.metrics import distance - - # process command-line arguments - parser = optparse.OptionParser() - parser.add_option( - "-d", - "--distance", - dest="distance", - default="binary_distance", - help="distance metric to use", - ) - parser.add_option( - "-a", - "--agreement", - dest="agreement", - default="kappa", - help="agreement coefficient to calculate", - ) - parser.add_option( - "-e", - "--exclude", - dest="exclude", - action="append", - default=[], - help="coder names to exclude (may be specified multiple times)", - ) - parser.add_option( - "-i", - "--include", - dest="include", - action="append", - default=[], - help="coder names to include, same format as exclude", - ) - parser.add_option( - "-f", - "--file", - dest="file", - help="file to read labelings from, each line with three columns: 'labeler item labels'", - ) - parser.add_option( - "-v", - "--verbose", - dest="verbose", - default="0", - help="how much debugging to print on stderr (0-4)", - ) - parser.add_option( - "-c", - "--columnsep", - dest="columnsep", - default="\t", - help="char/string that separates the three columns in the file, defaults to tab", - ) - parser.add_option( - "-l", - "--labelsep", - dest="labelsep", - default=",", - help="char/string that separates labels (if labelers can assign more than one), defaults to comma", - ) - parser.add_option( - "-p", - "--presence", - dest="presence", - default=None, - help="convert each labeling into 1 or 0, based on presence of LABEL", - ) - parser.add_option( - "-T", - "--thorough", - dest="thorough", - default=False, - action="store_true", - help="calculate agreement for every subset of the annotators", - ) - (options, remainder) = parser.parse_args() - - if not options.file: - parser.print_help() - exit() - - logging.basicConfig(level=50 - 10 * int(options.verbose)) - - # read in data from the specified file - data = [] - with open(options.file) as infile: - for l in infile: - toks = l.split(options.columnsep) - coder, object_, labels = ( - toks[0], - str(toks[1:-1]), - frozenset(toks[-1].strip().split(options.labelsep)), - ) - if ( - (options.include == options.exclude) - or (len(options.include) > 0 and coder in options.include) - or (len(options.exclude) > 0 and coder not in options.exclude) - ): - data.append((coder, object_, labels)) - - if options.presence: - task = AnnotationTask( - data, getattr(distance, options.distance)(options.presence) - ) - else: - task = AnnotationTask(data, getattr(distance, options.distance)) - - if options.thorough: - pass - else: - print(getattr(task, options.agreement)()) - - logging.shutdown() diff --git a/pipeline/nltk/metrics/aline.py b/pipeline/nltk/metrics/aline.py deleted file mode 100644 index 
5bf8d9930228b2bba3d07b5c92201a011bb9ca25..0000000000000000000000000000000000000000 --- a/pipeline/nltk/metrics/aline.py +++ /dev/null @@ -1,1354 +0,0 @@ -# Natural Language Toolkit: ALINE -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Greg Kondrak -# Geoff Bacon (Python port) -# URL: -# For license information, see LICENSE.TXT - -""" -ALINE -https://webdocs.cs.ualberta.ca/~kondrak/ -Copyright 2002 by Grzegorz Kondrak. - -ALINE is an algorithm for aligning phonetic sequences, described in [1]. -This module is a port of Kondrak's (2002) ALINE. It provides functions for -phonetic sequence alignment and similarity analysis. These are useful in -historical linguistics, sociolinguistics and synchronic phonology. - -ALINE has parameters that can be tuned for desired output. These parameters are: -- C_skip, C_sub, C_exp, C_vwl -- Salience weights -- Segmental features - -In this implementation, some parameters have been changed from their default -values as described in [1], in order to replicate published results. All changes -are noted in comments. - -Example usage -------------- - -# Get optimal alignment of two phonetic sequences - ->>> align('θin', 'tenwis') # doctest: +SKIP -[[('θ', 't'), ('i', 'e'), ('n', 'n'), ('-', 'w'), ('-', 'i'), ('-', 's')]] - -[1] G. Kondrak. Algorithms for Language Reconstruction. PhD dissertation, -University of Toronto. -""" - -try: - import numpy as np -except ImportError: - np = None - -# === Constants === - -inf = float("inf") - -# Default values for maximum similarity scores (Kondrak 2002: 54) -C_skip = -10 # Indels -C_sub = 35 # Substitutions -C_exp = 45 # Expansions/compressions -C_vwl = 5 # Vowel/consonant relative weight (decreased from 10) - -consonants = [ - "B", - "N", - "R", - "b", - "c", - "d", - "f", - "g", - "h", - "j", - "k", - "l", - "m", - "n", - "p", - "q", - "r", - "s", - "t", - "v", - "x", - "z", - "ç", - "ð", - "ħ", - "ŋ", - "ɖ", - "ɟ", - "ɢ", - "ɣ", - "ɦ", - "ɬ", - "ɮ", - "ɰ", - "ɱ", - "ɲ", - "ɳ", - "ɴ", - "ɸ", - "ɹ", - "ɻ", - "ɽ", - "ɾ", - "ʀ", - "ʁ", - "ʂ", - "ʃ", - "ʈ", - "ʋ", - "ʐ ", - "ʒ", - "ʔ", - "ʕ", - "ʙ", - "ʝ", - "β", - "θ", - "χ", - "ʐ", - "w", -] - -# Relevant features for comparing consonants and vowels -R_c = [ - "aspirated", - "lateral", - "manner", - "nasal", - "place", - "retroflex", - "syllabic", - "voice", -] -# 'high' taken out of R_v because same as manner -R_v = [ - "back", - "lateral", - "long", - "manner", - "nasal", - "place", - "retroflex", - "round", - "syllabic", - "voice", -] - -# Flattened feature matrix (Kondrak 2002: 56) -similarity_matrix = { - # place - "bilabial": 1.0, - "labiodental": 0.95, - "dental": 0.9, - "alveolar": 0.85, - "retroflex": 0.8, - "palato-alveolar": 0.75, - "palatal": 0.7, - "velar": 0.6, - "uvular": 0.5, - "pharyngeal": 0.3, - "glottal": 0.1, - "labiovelar": 1.0, - "vowel": -1.0, # added 'vowel' - # manner - "stop": 1.0, - "affricate": 0.9, - "fricative": 0.85, # increased fricative from 0.8 - "trill": 0.7, - "tap": 0.65, - "approximant": 0.6, - "high vowel": 0.4, - "mid vowel": 0.2, - "low vowel": 0.0, - "vowel2": 0.5, # added vowel - # high - "high": 1.0, - "mid": 0.5, - "low": 0.0, - # back - "front": 1.0, - "central": 0.5, - "back": 0.0, - # binary features - "plus": 1.0, - "minus": 0.0, -} - -# Relative weights of phonetic features (Kondrak 2002: 55) -salience = { - "syllabic": 5, - "place": 40, - "manner": 50, - "voice": 5, # decreased from 10 - "nasal": 20, # increased from 10 - "retroflex": 10, - "lateral": 10, - "aspirated": 5, - "long": 0, # decreased from 1 - "high": 
3, # decreased from 5 - "back": 2, # decreased from 5 - "round": 2, # decreased from 5 -} - -# (Kondrak 2002: 59-60) -feature_matrix = { - # Consonants - "p": { - "place": "bilabial", - "manner": "stop", - "syllabic": "minus", - "voice": "minus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "b": { - "place": "bilabial", - "manner": "stop", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "t": { - "place": "alveolar", - "manner": "stop", - "syllabic": "minus", - "voice": "minus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "d": { - "place": "alveolar", - "manner": "stop", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "ʈ": { - "place": "retroflex", - "manner": "stop", - "syllabic": "minus", - "voice": "minus", - "nasal": "minus", - "retroflex": "plus", - "lateral": "minus", - "aspirated": "minus", - }, - "ɖ": { - "place": "retroflex", - "manner": "stop", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "plus", - "lateral": "minus", - "aspirated": "minus", - }, - "c": { - "place": "palatal", - "manner": "stop", - "syllabic": "minus", - "voice": "minus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "ɟ": { - "place": "palatal", - "manner": "stop", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "k": { - "place": "velar", - "manner": "stop", - "syllabic": "minus", - "voice": "minus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "g": { - "place": "velar", - "manner": "stop", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "q": { - "place": "uvular", - "manner": "stop", - "syllabic": "minus", - "voice": "minus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "ɢ": { - "place": "uvular", - "manner": "stop", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "ʔ": { - "place": "glottal", - "manner": "stop", - "syllabic": "minus", - "voice": "minus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "m": { - "place": "bilabial", - "manner": "stop", - "syllabic": "minus", - "voice": "plus", - "nasal": "plus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "ɱ": { - "place": "labiodental", - "manner": "stop", - "syllabic": "minus", - "voice": "plus", - "nasal": "plus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "n": { - "place": "alveolar", - "manner": "stop", - "syllabic": "minus", - "voice": "plus", - "nasal": "plus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "ɳ": { - "place": "retroflex", - "manner": "stop", - "syllabic": "minus", - "voice": "plus", - "nasal": "plus", - "retroflex": "plus", - "lateral": "minus", - "aspirated": "minus", - }, - "ɲ": { - "place": "palatal", - "manner": "stop", - "syllabic": "minus", - "voice": "plus", - "nasal": "plus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "ŋ": { 
- "place": "velar", - "manner": "stop", - "syllabic": "minus", - "voice": "plus", - "nasal": "plus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "ɴ": { - "place": "uvular", - "manner": "stop", - "syllabic": "minus", - "voice": "plus", - "nasal": "plus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "N": { - "place": "uvular", - "manner": "stop", - "syllabic": "minus", - "voice": "plus", - "nasal": "plus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "ʙ": { - "place": "bilabial", - "manner": "trill", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "B": { - "place": "bilabial", - "manner": "trill", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "r": { - "place": "alveolar", - "manner": "trill", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "plus", - "lateral": "minus", - "aspirated": "minus", - }, - "ʀ": { - "place": "uvular", - "manner": "trill", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "R": { - "place": "uvular", - "manner": "trill", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "ɾ": { - "place": "alveolar", - "manner": "tap", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "ɽ": { - "place": "retroflex", - "manner": "tap", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "plus", - "lateral": "minus", - "aspirated": "minus", - }, - "ɸ": { - "place": "bilabial", - "manner": "fricative", - "syllabic": "minus", - "voice": "minus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "β": { - "place": "bilabial", - "manner": "fricative", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "f": { - "place": "labiodental", - "manner": "fricative", - "syllabic": "minus", - "voice": "minus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "v": { - "place": "labiodental", - "manner": "fricative", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "θ": { - "place": "dental", - "manner": "fricative", - "syllabic": "minus", - "voice": "minus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "ð": { - "place": "dental", - "manner": "fricative", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "s": { - "place": "alveolar", - "manner": "fricative", - "syllabic": "minus", - "voice": "minus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "z": { - "place": "alveolar", - "manner": "fricative", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "ʃ": { - "place": "palato-alveolar", - "manner": "fricative", - "syllabic": "minus", - "voice": "minus", - "nasal": "minus", 
- "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "ʒ": { - "place": "palato-alveolar", - "manner": "fricative", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "ʂ": { - "place": "retroflex", - "manner": "fricative", - "syllabic": "minus", - "voice": "minus", - "nasal": "minus", - "retroflex": "plus", - "lateral": "minus", - "aspirated": "minus", - }, - "ʐ": { - "place": "retroflex", - "manner": "fricative", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "plus", - "lateral": "minus", - "aspirated": "minus", - }, - "ç": { - "place": "palatal", - "manner": "fricative", - "syllabic": "minus", - "voice": "minus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "ʝ": { - "place": "palatal", - "manner": "fricative", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "x": { - "place": "velar", - "manner": "fricative", - "syllabic": "minus", - "voice": "minus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "ɣ": { - "place": "velar", - "manner": "fricative", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "χ": { - "place": "uvular", - "manner": "fricative", - "syllabic": "minus", - "voice": "minus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "ʁ": { - "place": "uvular", - "manner": "fricative", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "ħ": { - "place": "pharyngeal", - "manner": "fricative", - "syllabic": "minus", - "voice": "minus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "ʕ": { - "place": "pharyngeal", - "manner": "fricative", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "h": { - "place": "glottal", - "manner": "fricative", - "syllabic": "minus", - "voice": "minus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "ɦ": { - "place": "glottal", - "manner": "fricative", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "ɬ": { - "place": "alveolar", - "manner": "fricative", - "syllabic": "minus", - "voice": "minus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "plus", - "aspirated": "minus", - }, - "ɮ": { - "place": "alveolar", - "manner": "fricative", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "plus", - "aspirated": "minus", - }, - "ʋ": { - "place": "labiodental", - "manner": "approximant", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "ɹ": { - "place": "alveolar", - "manner": "approximant", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "ɻ": { - "place": "retroflex", - "manner": "approximant", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "plus", - "lateral": "minus", - 
"aspirated": "minus", - }, - "j": { - "place": "palatal", - "manner": "approximant", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "ɰ": { - "place": "velar", - "manner": "approximant", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - "l": { - "place": "alveolar", - "manner": "approximant", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "plus", - "aspirated": "minus", - }, - "w": { - "place": "labiovelar", - "manner": "approximant", - "syllabic": "minus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "aspirated": "minus", - }, - # Vowels - "i": { - "place": "vowel", - "manner": "vowel2", - "syllabic": "plus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "high": "high", - "back": "front", - "round": "minus", - "long": "minus", - "aspirated": "minus", - }, - "y": { - "place": "vowel", - "manner": "vowel2", - "syllabic": "plus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "high": "high", - "back": "front", - "round": "plus", - "long": "minus", - "aspirated": "minus", - }, - "e": { - "place": "vowel", - "manner": "vowel2", - "syllabic": "plus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "high": "mid", - "back": "front", - "round": "minus", - "long": "minus", - "aspirated": "minus", - }, - "E": { - "place": "vowel", - "manner": "vowel2", - "syllabic": "plus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "high": "mid", - "back": "front", - "round": "minus", - "long": "plus", - "aspirated": "minus", - }, - "ø": { - "place": "vowel", - "manner": "vowel2", - "syllabic": "plus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "high": "mid", - "back": "front", - "round": "plus", - "long": "minus", - "aspirated": "minus", - }, - "ɛ": { - "place": "vowel", - "manner": "vowel2", - "syllabic": "plus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "high": "mid", - "back": "front", - "round": "minus", - "long": "minus", - "aspirated": "minus", - }, - "œ": { - "place": "vowel", - "manner": "vowel2", - "syllabic": "plus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "high": "mid", - "back": "front", - "round": "plus", - "long": "minus", - "aspirated": "minus", - }, - "æ": { - "place": "vowel", - "manner": "vowel2", - "syllabic": "plus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "high": "low", - "back": "front", - "round": "minus", - "long": "minus", - "aspirated": "minus", - }, - "a": { - "place": "vowel", - "manner": "vowel2", - "syllabic": "plus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "high": "low", - "back": "front", - "round": "minus", - "long": "minus", - "aspirated": "minus", - }, - "A": { - "place": "vowel", - "manner": "vowel2", - "syllabic": "plus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "high": "low", - "back": "front", - "round": "minus", - "long": "plus", - "aspirated": "minus", - }, - "ɨ": { - "place": "vowel", - "manner": "vowel2", - "syllabic": "plus", - "voice": "plus", - "nasal": "minus", - 
"retroflex": "minus", - "lateral": "minus", - "high": "high", - "back": "central", - "round": "minus", - "long": "minus", - "aspirated": "minus", - }, - "ʉ": { - "place": "vowel", - "manner": "vowel2", - "syllabic": "plus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "high": "high", - "back": "central", - "round": "plus", - "long": "minus", - "aspirated": "minus", - }, - "ə": { - "place": "vowel", - "manner": "vowel2", - "syllabic": "plus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "high": "mid", - "back": "central", - "round": "minus", - "long": "minus", - "aspirated": "minus", - }, - "u": { - "place": "vowel", - "manner": "vowel2", - "syllabic": "plus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "high": "high", - "back": "back", - "round": "plus", - "long": "minus", - "aspirated": "minus", - }, - "U": { - "place": "vowel", - "manner": "vowel2", - "syllabic": "plus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "high": "high", - "back": "back", - "round": "plus", - "long": "plus", - "aspirated": "minus", - }, - "o": { - "place": "vowel", - "manner": "vowel2", - "syllabic": "plus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "high": "mid", - "back": "back", - "round": "plus", - "long": "minus", - "aspirated": "minus", - }, - "O": { - "place": "vowel", - "manner": "vowel2", - "syllabic": "plus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "high": "mid", - "back": "back", - "round": "plus", - "long": "plus", - "aspirated": "minus", - }, - "ɔ": { - "place": "vowel", - "manner": "vowel2", - "syllabic": "plus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "high": "mid", - "back": "back", - "round": "plus", - "long": "minus", - "aspirated": "minus", - }, - "ɒ": { - "place": "vowel", - "manner": "vowel2", - "syllabic": "plus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "high": "low", - "back": "back", - "round": "minus", - "long": "minus", - "aspirated": "minus", - }, - "I": { - "place": "vowel", - "manner": "vowel2", - "syllabic": "plus", - "voice": "plus", - "nasal": "minus", - "retroflex": "minus", - "lateral": "minus", - "high": "high", - "back": "front", - "round": "minus", - "long": "plus", - "aspirated": "minus", - }, -} - -# === Algorithm === - - -def align(str1, str2, epsilon=0): - """ - Compute the alignment of two phonetic strings. - - :param str str1: First string to be aligned - :param str str2: Second string to be aligned - - :type epsilon: float (0.0 to 1.0) - :param epsilon: Adjusts threshold similarity score for near-optimal alignments - - :rtype: list(list(tuple(str, str))) - :return: Alignment(s) of str1 and str2 - - (Kondrak 2002: 51) - """ - if np is None: - raise ImportError("You need numpy in order to use the align function") - - assert 0.0 <= epsilon <= 1.0, "Epsilon must be between 0.0 and 1.0." - m = len(str1) - n = len(str2) - # This includes Kondrak's initialization of row 0 and column 0 to all 0s. - S = np.zeros((m + 1, n + 1), dtype=float) - - # If i <= 1 or j <= 1, don't allow expansions as it doesn't make sense, - # and breaks array and string indices. Make sure they never get chosen - # by setting them to -inf. 
- for i in range(1, m + 1): - for j in range(1, n + 1): - edit1 = S[i - 1, j] + sigma_skip(str1[i - 1]) - edit2 = S[i, j - 1] + sigma_skip(str2[j - 1]) - edit3 = S[i - 1, j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) - if i > 1: - edit4 = S[i - 2, j - 1] + sigma_exp(str2[j - 1], str1[i - 2 : i]) - else: - edit4 = -inf - if j > 1: - edit5 = S[i - 1, j - 2] + sigma_exp(str1[i - 1], str2[j - 2 : j]) - else: - edit5 = -inf - S[i, j] = max(edit1, edit2, edit3, edit4, edit5, 0) - - T = (1 - epsilon) * np.amax(S) # Threshold score for near-optimal alignments - - alignments = [] - for i in range(1, m + 1): - for j in range(1, n + 1): - if S[i, j] >= T: - alignments.append(_retrieve(i, j, 0, S, T, str1, str2, [])) - return alignments - - -def _retrieve(i, j, s, S, T, str1, str2, out): - """ - Retrieve the path through the similarity matrix S starting at (i, j). - - :rtype: list(tuple(str, str)) - :return: Alignment of str1 and str2 - """ - if S[i, j] == 0: - return out - else: - if j > 1 and S[i - 1, j - 2] + sigma_exp(str1[i - 1], str2[j - 2 : j]) + s >= T: - out.insert(0, (str1[i - 1], str2[j - 2 : j])) - _retrieve( - i - 1, - j - 2, - s + sigma_exp(str1[i - 1], str2[j - 2 : j]), - S, - T, - str1, - str2, - out, - ) - elif ( - i > 1 and S[i - 2, j - 1] + sigma_exp(str2[j - 1], str1[i - 2 : i]) + s >= T - ): - out.insert(0, (str1[i - 2 : i], str2[j - 1])) - _retrieve( - i - 2, - j - 1, - s + sigma_exp(str2[j - 1], str1[i - 2 : i]), - S, - T, - str1, - str2, - out, - ) - elif S[i, j - 1] + sigma_skip(str2[j - 1]) + s >= T: - out.insert(0, ("-", str2[j - 1])) - _retrieve(i, j - 1, s + sigma_skip(str2[j - 1]), S, T, str1, str2, out) - elif S[i - 1, j] + sigma_skip(str1[i - 1]) + s >= T: - out.insert(0, (str1[i - 1], "-")) - _retrieve(i - 1, j, s + sigma_skip(str1[i - 1]), S, T, str1, str2, out) - elif S[i - 1, j - 1] + sigma_sub(str1[i - 1], str2[j - 1]) + s >= T: - out.insert(0, (str1[i - 1], str2[j - 1])) - _retrieve( - i - 1, - j - 1, - s + sigma_sub(str1[i - 1], str2[j - 1]), - S, - T, - str1, - str2, - out, - ) - return out - - -def sigma_skip(p): - """ - Returns score of an indel of P. - - (Kondrak 2002: 54) - """ - return C_skip - - -def sigma_sub(p, q): - """ - Returns score of a substitution of P with Q. - - (Kondrak 2002: 54) - """ - return C_sub - delta(p, q) - V(p) - V(q) - - -def sigma_exp(p, q): - """ - Returns score of an expansion/compression. - - (Kondrak 2002: 54) - """ - q1 = q[0] - q2 = q[1] - return C_exp - delta(p, q1) - delta(p, q2) - V(p) - max(V(q1), V(q2)) - - -def delta(p, q): - """ - Return weighted sum of difference between P and Q. - - (Kondrak 2002: 54) - """ - features = R(p, q) - total = 0 - for f in features: - total += diff(p, q, f) * salience[f] - return total - - -def diff(p, q, f): - """ - Returns difference between phonetic segments P and Q for feature F. - - (Kondrak 2002: 52, 54) - """ - p_features, q_features = feature_matrix[p], feature_matrix[q] - return abs(similarity_matrix[p_features[f]] - similarity_matrix[q_features[f]]) - - -def R(p, q): - """ - Return relevant features for segment comparison. - - (Kondrak 2002: 54) - """ - if p in consonants or q in consonants: - return R_c - return R_v - - -def V(p): - """ - Return vowel weight if P is vowel. - - (Kondrak 2002: 54) - """ - if p in consonants: - return 0 - return C_vwl - - -# === Test === - - -def demo(): - """ - A demonstration of the result of aligning phonetic sequences - used in Kondrak's (2002) dissertation. 
- """ - data = [pair.split(",") for pair in cognate_data.split("\n")] - for pair in data: - alignment = align(pair[0], pair[1])[0] - alignment = [f"({a[0]}, {a[1]})" for a in alignment] - alignment = " ".join(alignment) - print(f"{pair[0]} ~ {pair[1]} : {alignment}") - - -cognate_data = """jo,ʒə -tu,ty -nosotros,nu -kjen,ki -ke,kwa -todos,tu -una,ən -dos,dø -tres,trwa -ombre,om -arbol,arbrə -pluma,plym -kabeθa,kap -boka,buʃ -pje,pje -koraθon,kœr -ber,vwar -benir,vənir -deθir,dir -pobre,povrə -ðis,dIzes -ðæt,das -wat,vas -nat,nixt -loŋ,laŋ -mæn,man -fleʃ,flajʃ -bləd,blyt -feðər,fEdər -hær,hAr -ir,Or -aj,awgə -nowz,nAzə -mawθ,munt -təŋ,tsuŋə -fut,fys -nij,knI -hænd,hant -hart,herts -livər,lEbər -ænd,ante -æt,ad -blow,flAre -ir,awris -ijt,edere -fiʃ,piʃkis -flow,fluere -staɾ,stella -ful,plenus -græs,gramen -hart,kordis -horn,korny -aj,ego -nij,genU -məðər,mAter -mawntən,mons -nejm,nomen -njuw,nowus -wən,unus -rawnd,rotundus -sow,suere -sit,sedere -θrij,tres -tuwθ,dentis -θin,tenwis -kinwawa,kenuaʔ -nina,nenah -napewa,napɛw -wapimini,wapemen -namesa,namɛʔs -okimawa,okemaw -ʃiʃipa,seʔsep -ahkohkwa,ahkɛh -pematesiweni,pematesewen -asenja,aʔsɛn""" - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/metrics/association.py b/pipeline/nltk/metrics/association.py deleted file mode 100644 index b7010f1f4dd39c122a263aff5d243b3c19c52822..0000000000000000000000000000000000000000 --- a/pipeline/nltk/metrics/association.py +++ /dev/null @@ -1,476 +0,0 @@ -# Natural Language Toolkit: Ngram Association Measures -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Joel Nothman -# URL: -# For license information, see LICENSE.TXT - -""" -Provides scoring functions for a number of association measures through a -generic, abstract implementation in ``NgramAssocMeasures``, and n-specific -``BigramAssocMeasures`` and ``TrigramAssocMeasures``. -""" - -import math as _math -from abc import ABCMeta, abstractmethod -from functools import reduce - -_log2 = lambda x: _math.log2(x) -_ln = _math.log - -_product = lambda s: reduce(lambda x, y: x * y, s) - -_SMALL = 1e-20 - -try: - from scipy.stats import fisher_exact -except ImportError: - - def fisher_exact(*_args, **_kwargs): - raise NotImplementedError - - -### Indices to marginals arguments: - -NGRAM = 0 -"""Marginals index for the ngram count""" - -UNIGRAMS = -2 -"""Marginals index for a tuple of each unigram count""" - -TOTAL = -1 -"""Marginals index for the number of words in the data""" - - -class NgramAssocMeasures(metaclass=ABCMeta): - """ - An abstract class defining a collection of generic association measures. - Each public method returns a score, taking the following arguments:: - - score_fn(count_of_ngram, - (count_of_n-1gram_1, ..., count_of_n-1gram_j), - (count_of_n-2gram_1, ..., count_of_n-2gram_k), - ..., - (count_of_1gram_1, ..., count_of_1gram_n), - count_of_total_words) - - See ``BigramAssocMeasures`` and ``TrigramAssocMeasures`` - - Inheriting classes should define a property _n, and a method _contingency - which calculates contingency values from marginals in order for all - association measures defined here to be usable. 
- """ - - _n = 0 - - @staticmethod - @abstractmethod - def _contingency(*marginals): - """Calculates values of a contingency table from marginal values.""" - raise NotImplementedError( - "The contingency table is not available" "in the general ngram case" - ) - - @staticmethod - @abstractmethod - def _marginals(*contingency): - """Calculates values of contingency table marginals from its values.""" - raise NotImplementedError( - "The contingency table is not available" "in the general ngram case" - ) - - @classmethod - def _expected_values(cls, cont): - """Calculates expected values for a contingency table.""" - n_all = sum(cont) - bits = [1 << i for i in range(cls._n)] - - # For each contingency table cell - for i in range(len(cont)): - # Yield the expected value - yield ( - _product( - sum(cont[x] for x in range(2**cls._n) if (x & j) == (i & j)) - for j in bits - ) - / (n_all ** (cls._n - 1)) - ) - - @staticmethod - def raw_freq(*marginals): - """Scores ngrams by their frequency""" - return marginals[NGRAM] / marginals[TOTAL] - - @classmethod - def student_t(cls, *marginals): - """Scores ngrams using Student's t test with independence hypothesis - for unigrams, as in Manning and Schutze 5.3.1. - """ - return ( - marginals[NGRAM] - - _product(marginals[UNIGRAMS]) / (marginals[TOTAL] ** (cls._n - 1)) - ) / (marginals[NGRAM] + _SMALL) ** 0.5 - - @classmethod - def chi_sq(cls, *marginals): - """Scores ngrams using Pearson's chi-square as in Manning and Schutze - 5.3.3. - """ - cont = cls._contingency(*marginals) - exps = cls._expected_values(cont) - return sum((obs - exp) ** 2 / (exp + _SMALL) for obs, exp in zip(cont, exps)) - - @staticmethod - def mi_like(*marginals, **kwargs): - """Scores ngrams using a variant of mutual information. The keyword - argument power sets an exponent (default 3) for the numerator. No - logarithm of the result is calculated. - """ - return marginals[NGRAM] ** kwargs.get("power", 3) / _product( - marginals[UNIGRAMS] - ) - - @classmethod - def pmi(cls, *marginals): - """Scores ngrams by pointwise mutual information, as in Manning and - Schutze 5.4. - """ - return _log2(marginals[NGRAM] * marginals[TOTAL] ** (cls._n - 1)) - _log2( - _product(marginals[UNIGRAMS]) - ) - - @classmethod - def likelihood_ratio(cls, *marginals): - """Scores ngrams using likelihood ratios as in Manning and Schutze 5.3.4.""" - cont = cls._contingency(*marginals) - return 2 * sum( - obs * _ln(obs / (exp + _SMALL) + _SMALL) - for obs, exp in zip(cont, cls._expected_values(cont)) - ) - - @classmethod - def poisson_stirling(cls, *marginals): - """Scores ngrams using the Poisson-Stirling measure.""" - exp = _product(marginals[UNIGRAMS]) / (marginals[TOTAL] ** (cls._n - 1)) - return marginals[NGRAM] * (_log2(marginals[NGRAM] / exp) - 1) - - @classmethod - def jaccard(cls, *marginals): - """Scores ngrams using the Jaccard index.""" - cont = cls._contingency(*marginals) - return cont[0] / sum(cont[:-1]) - - -class BigramAssocMeasures(NgramAssocMeasures): - """ - A collection of bigram association measures. Each association measure - is provided as a function with three arguments:: - - bigram_score_fn(n_ii, (n_ix, n_xi), n_xx) - - The arguments constitute the marginals of a contingency table, counting - the occurrences of particular events in a corpus. The letter i in the - suffix refers to the appearance of the word in question, while x indicates - the appearance of any word. Thus, for example: - - - n_ii counts ``(w1, w2)``, i.e. 
the bigram being scored - - n_ix counts ``(w1, *)`` - - n_xi counts ``(*, w2)`` - - n_xx counts ``(*, *)``, i.e. any bigram - - This may be shown with respect to a contingency table:: - - w1 ~w1 - ------ ------ - w2 | n_ii | n_oi | = n_xi - ------ ------ - ~w2 | n_io | n_oo | - ------ ------ - = n_ix TOTAL = n_xx - """ - - _n = 2 - - @staticmethod - def _contingency(n_ii, n_ix_xi_tuple, n_xx): - """Calculates values of a bigram contingency table from marginal values.""" - (n_ix, n_xi) = n_ix_xi_tuple - n_oi = n_xi - n_ii - n_io = n_ix - n_ii - return (n_ii, n_oi, n_io, n_xx - n_ii - n_oi - n_io) - - @staticmethod - def _marginals(n_ii, n_oi, n_io, n_oo): - """Calculates values of contingency table marginals from its values.""" - return (n_ii, (n_oi + n_ii, n_io + n_ii), n_oo + n_oi + n_io + n_ii) - - @staticmethod - def _expected_values(cont): - """Calculates expected values for a contingency table.""" - n_xx = sum(cont) - # For each contingency table cell - for i in range(4): - yield (cont[i] + cont[i ^ 1]) * (cont[i] + cont[i ^ 2]) / n_xx - - @classmethod - def phi_sq(cls, *marginals): - """Scores bigrams using phi-square, the square of the Pearson correlation - coefficient. - """ - n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals) - - return (n_ii * n_oo - n_io * n_oi) ** 2 / ( - (n_ii + n_io) * (n_ii + n_oi) * (n_io + n_oo) * (n_oi + n_oo) - ) - - @classmethod - def chi_sq(cls, n_ii, n_ix_xi_tuple, n_xx): - """Scores bigrams using chi-square, i.e. phi-sq multiplied by the number - of bigrams, as in Manning and Schutze 5.3.3. - """ - (n_ix, n_xi) = n_ix_xi_tuple - return n_xx * cls.phi_sq(n_ii, (n_ix, n_xi), n_xx) - - @classmethod - def fisher(cls, *marginals): - """Scores bigrams using Fisher's Exact Test (Pedersen 1996). Less - sensitive to small counts than PMI or Chi Sq, but also more expensive - to compute. Requires scipy. - """ - - n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals) - - (odds, pvalue) = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative="less") - return pvalue - - @staticmethod - def dice(n_ii, n_ix_xi_tuple, n_xx): - """Scores bigrams using Dice's coefficient.""" - (n_ix, n_xi) = n_ix_xi_tuple - return 2 * n_ii / (n_ix + n_xi) - - -class TrigramAssocMeasures(NgramAssocMeasures): - """ - A collection of trigram association measures. Each association measure - is provided as a function with four arguments:: - - trigram_score_fn(n_iii, - (n_iix, n_ixi, n_xii), - (n_ixx, n_xix, n_xxi), - n_xxx) - - The arguments constitute the marginals of a contingency table, counting - the occurrences of particular events in a corpus. The letter i in the - suffix refers to the appearance of the word in question, while x indicates - the appearance of any word. Thus, for example: - - - n_iii counts ``(w1, w2, w3)``, i.e. the trigram being scored - - n_ixx counts ``(w1, *, *)`` - - n_xxx counts ``(*, *, *)``, i.e. any trigram - """ - - _n = 3 - - @staticmethod - def _contingency(n_iii, n_iix_tuple, n_ixx_tuple, n_xxx): - """Calculates values of a trigram contingency table (or cube) from - marginal values. 
- >>> TrigramAssocMeasures._contingency(1, (1, 1, 1), (1, 73, 1), 2000) - (1, 0, 0, 0, 0, 72, 0, 1927) - """ - (n_iix, n_ixi, n_xii) = n_iix_tuple - (n_ixx, n_xix, n_xxi) = n_ixx_tuple - n_oii = n_xii - n_iii - n_ioi = n_ixi - n_iii - n_iio = n_iix - n_iii - n_ooi = n_xxi - n_iii - n_oii - n_ioi - n_oio = n_xix - n_iii - n_oii - n_iio - n_ioo = n_ixx - n_iii - n_ioi - n_iio - n_ooo = n_xxx - n_iii - n_oii - n_ioi - n_iio - n_ooi - n_oio - n_ioo - - return (n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo) - - @staticmethod - def _marginals(*contingency): - """Calculates values of contingency table marginals from its values. - >>> TrigramAssocMeasures._marginals(1, 0, 0, 0, 0, 72, 0, 1927) - (1, (1, 1, 1), (1, 73, 1), 2000) - """ - n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo = contingency - return ( - n_iii, - (n_iii + n_iio, n_iii + n_ioi, n_iii + n_oii), - ( - n_iii + n_ioi + n_iio + n_ioo, - n_iii + n_oii + n_iio + n_oio, - n_iii + n_oii + n_ioi + n_ooi, - ), - sum(contingency), - ) - - -class QuadgramAssocMeasures(NgramAssocMeasures): - """ - A collection of quadgram association measures. Each association measure - is provided as a function with five arguments:: - - trigram_score_fn(n_iiii, - (n_iiix, n_iixi, n_ixii, n_xiii), - (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix), - (n_ixxx, n_xixx, n_xxix, n_xxxi), - n_all) - - The arguments constitute the marginals of a contingency table, counting - the occurrences of particular events in a corpus. The letter i in the - suffix refers to the appearance of the word in question, while x indicates - the appearance of any word. Thus, for example: - - - n_iiii counts ``(w1, w2, w3, w4)``, i.e. the quadgram being scored - - n_ixxi counts ``(w1, *, *, w4)`` - - n_xxxx counts ``(*, *, *, *)``, i.e. any quadgram - """ - - _n = 4 - - @staticmethod - def _contingency(n_iiii, n_iiix_tuple, n_iixx_tuple, n_ixxx_tuple, n_xxxx): - """Calculates values of a quadgram contingency table from - marginal values. - """ - (n_iiix, n_iixi, n_ixii, n_xiii) = n_iiix_tuple - (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix) = n_iixx_tuple - (n_ixxx, n_xixx, n_xxix, n_xxxi) = n_ixxx_tuple - n_oiii = n_xiii - n_iiii - n_ioii = n_ixii - n_iiii - n_iioi = n_iixi - n_iiii - n_ooii = n_xxii - n_iiii - n_oiii - n_ioii - n_oioi = n_xixi - n_iiii - n_oiii - n_iioi - n_iooi = n_ixxi - n_iiii - n_ioii - n_iioi - n_oooi = n_xxxi - n_iiii - n_oiii - n_ioii - n_iioi - n_ooii - n_iooi - n_oioi - n_iiio = n_iiix - n_iiii - n_oiio = n_xiix - n_iiii - n_oiii - n_iiio - n_ioio = n_ixix - n_iiii - n_ioii - n_iiio - n_ooio = n_xxix - n_iiii - n_oiii - n_ioii - n_iiio - n_ooii - n_ioio - n_oiio - n_iioo = n_iixx - n_iiii - n_iioi - n_iiio - n_oioo = n_xixx - n_iiii - n_oiii - n_iioi - n_iiio - n_oioi - n_oiio - n_iioo - n_iooo = n_ixxx - n_iiii - n_ioii - n_iioi - n_iiio - n_iooi - n_iioo - n_ioio - n_oooo = ( - n_xxxx - - n_iiii - - n_oiii - - n_ioii - - n_iioi - - n_ooii - - n_oioi - - n_iooi - - n_oooi - - n_iiio - - n_oiio - - n_ioio - - n_ooio - - n_iioo - - n_oioo - - n_iooo - ) - - return ( - n_iiii, - n_oiii, - n_ioii, - n_ooii, - n_iioi, - n_oioi, - n_iooi, - n_oooi, - n_iiio, - n_oiio, - n_ioio, - n_ooio, - n_iioo, - n_oioo, - n_iooo, - n_oooo, - ) - - @staticmethod - def _marginals(*contingency): - """Calculates values of contingency table marginals from its values. 
- QuadgramAssocMeasures._marginals(1, 0, 2, 46, 552, 825, 2577, 34967, 1, 0, 2, 48, 7250, 9031, 28585, 356653) - (1, (2, 553, 3, 1), (7804, 6, 3132, 1378, 49, 2), (38970, 17660, 100, 38970), 440540) - """ - ( - n_iiii, - n_oiii, - n_ioii, - n_ooii, - n_iioi, - n_oioi, - n_iooi, - n_oooi, - n_iiio, - n_oiio, - n_ioio, - n_ooio, - n_iioo, - n_oioo, - n_iooo, - n_oooo, - ) = contingency - - n_iiix = n_iiii + n_iiio - n_iixi = n_iiii + n_iioi - n_ixii = n_iiii + n_ioii - n_xiii = n_iiii + n_oiii - - n_iixx = n_iiii + n_iioi + n_iiio + n_iioo - n_ixix = n_iiii + n_ioii + n_iiio + n_ioio - n_ixxi = n_iiii + n_ioii + n_iioi + n_iooi - n_xixi = n_iiii + n_oiii + n_iioi + n_oioi - n_xxii = n_iiii + n_oiii + n_ioii + n_ooii - n_xiix = n_iiii + n_oiii + n_iiio + n_oiio - - n_ixxx = n_iiii + n_ioii + n_iioi + n_iiio + n_iooi + n_iioo + n_ioio + n_iooo - n_xixx = n_iiii + n_oiii + n_iioi + n_iiio + n_oioi + n_oiio + n_iioo + n_oioo - n_xxix = n_iiii + n_oiii + n_ioii + n_iiio + n_ooii + n_ioio + n_oiio + n_ooio - n_xxxi = n_iiii + n_oiii + n_ioii + n_iioi + n_ooii + n_iooi + n_oioi + n_oooi - - n_all = sum(contingency) - - return ( - n_iiii, - (n_iiix, n_iixi, n_ixii, n_xiii), - (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix), - (n_ixxx, n_xixx, n_xxix, n_xxxi), - n_all, - ) - - -class ContingencyMeasures: - """Wraps NgramAssocMeasures classes such that the arguments of association - measures are contingency table values rather than marginals. - """ - - def __init__(self, measures): - """Constructs a ContingencyMeasures given a NgramAssocMeasures class""" - self.__class__.__name__ = "Contingency" + measures.__class__.__name__ - for k in dir(measures): - if k.startswith("__"): - continue - v = getattr(measures, k) - if not k.startswith("_"): - v = self._make_contingency_fn(measures, v) - setattr(self, k, v) - - @staticmethod - def _make_contingency_fn(measures, old_fn): - """From an association measure function, produces a new function which - accepts contingency table values as its arguments. - """ - - def res(*contingency): - return old_fn(*measures._marginals(*contingency)) - - res.__doc__ = old_fn.__doc__ - res.__name__ = old_fn.__name__ - return res diff --git a/pipeline/nltk/metrics/confusionmatrix.py b/pipeline/nltk/metrics/confusionmatrix.py deleted file mode 100644 index 3cb6ee9b2a7e1a9b2235d9268d20fc1269908fe1..0000000000000000000000000000000000000000 --- a/pipeline/nltk/metrics/confusionmatrix.py +++ /dev/null @@ -1,353 +0,0 @@ -# Natural Language Toolkit: Confusion Matrices -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird -# Tom Aarsen <> -# URL: -# For license information, see LICENSE.TXT - -from nltk.probability import FreqDist - - -class ConfusionMatrix: - """ - The confusion matrix between a list of reference values and a - corresponding list of test values. Entry *[r,t]* of this - matrix is a count of the number of times that the reference value - *r* corresponds to the test value *t*. E.g.: - - >>> from nltk.metrics import ConfusionMatrix - >>> ref = 'DET NN VB DET JJ NN NN IN DET NN'.split() - >>> test = 'DET VB VB DET NN NN NN IN DET NN'.split() - >>> cm = ConfusionMatrix(ref, test) - >>> print(cm['NN', 'NN']) - 3 - - Note that the diagonal entries *Ri=Tj* of this matrix - corresponds to correct values; and the off-diagonal entries - correspond to incorrect values. - """ - - def __init__(self, reference, test, sort_by_count=False): - """ - Construct a new confusion matrix from a list of reference - values and a corresponding list of test values. 
- - :type reference: list - :param reference: An ordered list of reference values. - :type test: list - :param test: A list of values to compare against the - corresponding reference values. - :raise ValueError: If ``reference`` and ``length`` do not have - the same length. - """ - if len(reference) != len(test): - raise ValueError("Lists must have the same length.") - - # Get a list of all values. - if sort_by_count: - ref_fdist = FreqDist(reference) - test_fdist = FreqDist(test) - - def key(v): - return -(ref_fdist[v] + test_fdist[v]) - - values = sorted(set(reference + test), key=key) - else: - values = sorted(set(reference + test)) - - # Construct a value->index dictionary - indices = {val: i for (i, val) in enumerate(values)} - - # Make a confusion matrix table. - confusion = [[0 for _ in values] for _ in values] - max_conf = 0 # Maximum confusion - for w, g in zip(reference, test): - confusion[indices[w]][indices[g]] += 1 - max_conf = max(max_conf, confusion[indices[w]][indices[g]]) - - #: A list of all values in ``reference`` or ``test``. - self._values = values - #: A dictionary mapping values in ``self._values`` to their indices. - self._indices = indices - #: The confusion matrix itself (as a list of lists of counts). - self._confusion = confusion - #: The greatest count in ``self._confusion`` (used for printing). - self._max_conf = max_conf - #: The total number of values in the confusion matrix. - self._total = len(reference) - #: The number of correct (on-diagonal) values in the matrix. - self._correct = sum(confusion[i][i] for i in range(len(values))) - - def __getitem__(self, li_lj_tuple): - """ - :return: The number of times that value ``li`` was expected and - value ``lj`` was given. - :rtype: int - """ - (li, lj) = li_lj_tuple - i = self._indices[li] - j = self._indices[lj] - return self._confusion[i][j] - - def __repr__(self): - return f"" - - def __str__(self): - return self.pretty_format() - - def pretty_format( - self, - show_percents=False, - values_in_chart=True, - truncate=None, - sort_by_count=False, - ): - """ - :return: A multi-line string representation of this confusion matrix. - :type truncate: int - :param truncate: If specified, then only show the specified - number of values. Any sorting (e.g., sort_by_count) - will be performed before truncation. - :param sort_by_count: If true, then sort by the count of each - label in the reference data. I.e., labels that occur more - frequently in the reference label will be towards the left - edge of the matrix, and labels that occur less frequently - will be towards the right edge. - - @todo: add marginals? - """ - confusion = self._confusion - - values = self._values - if sort_by_count: - values = sorted( - values, key=lambda v: -sum(self._confusion[self._indices[v]]) - ) - - if truncate: - values = values[:truncate] - - if values_in_chart: - value_strings = ["%s" % val for val in values] - else: - value_strings = [str(n + 1) for n in range(len(values))] - - # Construct a format string for row values - valuelen = max(len(val) for val in value_strings) - value_format = "%" + repr(valuelen) + "s | " - # Construct a format string for matrix entries - if show_percents: - entrylen = 6 - entry_format = "%5.1f%%" - zerostr = " ." - else: - entrylen = len(repr(self._max_conf)) - entry_format = "%" + repr(entrylen) + "d" - zerostr = " " * (entrylen - 1) + "." - - # Write the column values. 
- s = "" - for i in range(valuelen): - s += (" " * valuelen) + " |" - for val in value_strings: - if i >= valuelen - len(val): - s += val[i - valuelen + len(val)].rjust(entrylen + 1) - else: - s += " " * (entrylen + 1) - s += " |\n" - - # Write a dividing line - s += "{}-+-{}+\n".format("-" * valuelen, "-" * ((entrylen + 1) * len(values))) - - # Write the entries. - for val, li in zip(value_strings, values): - i = self._indices[li] - s += value_format % val - for lj in values: - j = self._indices[lj] - if confusion[i][j] == 0: - s += zerostr - elif show_percents: - s += entry_format % (100.0 * confusion[i][j] / self._total) - else: - s += entry_format % confusion[i][j] - if i == j: - prevspace = s.rfind(" ") - s = s[:prevspace] + "<" + s[prevspace + 1 :] + ">" - else: - s += " " - s += "|\n" - - # Write a dividing line - s += "{}-+-{}+\n".format("-" * valuelen, "-" * ((entrylen + 1) * len(values))) - - # Write a key - s += "(row = reference; col = test)\n" - if not values_in_chart: - s += "Value key:\n" - for i, value in enumerate(values): - s += "%6d: %s\n" % (i + 1, value) - - return s - - def key(self): - values = self._values - str = "Value key:\n" - indexlen = len(repr(len(values) - 1)) - key_format = " %" + repr(indexlen) + "d: %s\n" - for i in range(len(values)): - str += key_format % (i, values[i]) - - return str - - def recall(self, value): - """Given a value in the confusion matrix, return the recall - that corresponds to this value. The recall is defined as: - - - *r* = true positive / (true positive + false positive) - - and can loosely be considered the ratio of how often ``value`` - was predicted correctly relative to how often ``value`` was - the true result. - - :param value: value used in the ConfusionMatrix - :return: the recall corresponding to ``value``. - :rtype: float - """ - # Number of times `value` was correct, and also predicted - TP = self[value, value] - # Number of times `value` was correct - TP_FN = sum(self[value, pred_value] for pred_value in self._values) - if TP_FN == 0: - return 0.0 - return TP / TP_FN - - def precision(self, value): - """Given a value in the confusion matrix, return the precision - that corresponds to this value. The precision is defined as: - - - *p* = true positive / (true positive + false negative) - - and can loosely be considered the ratio of how often ``value`` - was predicted correctly relative to the number of predictions - for ``value``. - - :param value: value used in the ConfusionMatrix - :return: the precision corresponding to ``value``. - :rtype: float - """ - # Number of times `value` was correct, and also predicted - TP = self[value, value] - # Number of times `value` was predicted - TP_FP = sum(self[real_value, value] for real_value in self._values) - if TP_FP == 0: - return 0.0 - return TP / TP_FP - - def f_measure(self, value, alpha=0.5): - """ - Given a value used in the confusion matrix, return the f-measure - that corresponds to this value. The f-measure is the harmonic mean - of the ``precision`` and ``recall``, weighted by ``alpha``. - In particular, given the precision *p* and recall *r* defined by: - - - *p* = true positive / (true positive + false negative) - - *r* = true positive / (true positive + false positive) - - The f-measure is: - - - *1/(alpha/p + (1-alpha)/r)* - - With ``alpha = 0.5``, this reduces to: - - - *2pr / (p + r)* - - :param value: value used in the ConfusionMatrix - :param alpha: Ratio of the cost of false negative compared to false - positives. Defaults to 0.5, where the costs are equal. 
- :type alpha: float - :return: the F-measure corresponding to ``value``. - :rtype: float - """ - p = self.precision(value) - r = self.recall(value) - if p == 0.0 or r == 0.0: - return 0.0 - return 1.0 / (alpha / p + (1 - alpha) / r) - - def evaluate(self, alpha=0.5, truncate=None, sort_by_count=False): - """ - Tabulate the **recall**, **precision** and **f-measure** - for each value in this confusion matrix. - - >>> reference = "DET NN VB DET JJ NN NN IN DET NN".split() - >>> test = "DET VB VB DET NN NN NN IN DET NN".split() - >>> cm = ConfusionMatrix(reference, test) - >>> print(cm.evaluate()) - Tag | Prec. | Recall | F-measure - ----+--------+--------+----------- - DET | 1.0000 | 1.0000 | 1.0000 - IN | 1.0000 | 1.0000 | 1.0000 - JJ | 0.0000 | 0.0000 | 0.0000 - NN | 0.7500 | 0.7500 | 0.7500 - VB | 0.5000 | 1.0000 | 0.6667 - - - :param alpha: Ratio of the cost of false negative compared to false - positives, as used in the f-measure computation. Defaults to 0.5, - where the costs are equal. - :type alpha: float - :param truncate: If specified, then only show the specified - number of values. Any sorting (e.g., sort_by_count) - will be performed before truncation. Defaults to None - :type truncate: int, optional - :param sort_by_count: Whether to sort the outputs on frequency - in the reference label. Defaults to False. - :type sort_by_count: bool, optional - :return: A tabulated recall, precision and f-measure string - :rtype: str - """ - tags = self._values - - # Apply keyword parameters - if sort_by_count: - tags = sorted(tags, key=lambda v: -sum(self._confusion[self._indices[v]])) - if truncate: - tags = tags[:truncate] - - tag_column_len = max(max(len(tag) for tag in tags), 3) - - # Construct the header - s = ( - f"{' ' * (tag_column_len - 3)}Tag | Prec. | Recall | F-measure\n" - f"{'-' * tag_column_len}-+--------+--------+-----------\n" - ) - - # Construct the body - for tag in tags: - s += ( - f"{tag:>{tag_column_len}} | " - f"{self.precision(tag):<6.4f} | " - f"{self.recall(tag):<6.4f} | " - f"{self.f_measure(tag, alpha=alpha):.4f}\n" - ) - - return s - - -def demo(): - reference = "DET NN VB DET JJ NN NN IN DET NN".split() - test = "DET VB VB DET NN NN NN IN DET NN".split() - print("Reference =", reference) - print("Test =", test) - print("Confusion matrix:") - print(ConfusionMatrix(reference, test)) - print(ConfusionMatrix(reference, test).pretty_format(sort_by_count=True)) - - print(ConfusionMatrix(reference, test).recall("VB")) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/metrics/distance.py b/pipeline/nltk/metrics/distance.py deleted file mode 100644 index 1f115d97abd6678f7b1a3b15b2e68671d70e5ea7..0000000000000000000000000000000000000000 --- a/pipeline/nltk/metrics/distance.py +++ /dev/null @@ -1,508 +0,0 @@ -# Natural Language Toolkit: Distance Metrics -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird -# Tom Lippincott -# URL: -# For license information, see LICENSE.TXT -# - -""" -Distance Metrics. - -Compute the distance between two items (usually strings). -As metrics, they must satisfy the following three requirements: - -1. d(a, a) = 0 -2. d(a, b) >= 0 -3. d(a, c) <= d(a, b) + d(b, c) -""" - -import operator -import warnings - - -def _edit_dist_init(len1, len2): - lev = [] - for i in range(len1): - lev.append([0] * len2) # initialize 2D array to zero - for i in range(len1): - lev[i][0] = i # column 0: 0,1,2,3,4,... - for j in range(len2): - lev[0][j] = j # row 0: 0,1,2,3,4,... 
- return lev - - -def _last_left_t_init(sigma): - return {c: 0 for c in sigma} - - -def _edit_dist_step( - lev, i, j, s1, s2, last_left, last_right, substitution_cost=1, transpositions=False -): - c1 = s1[i - 1] - c2 = s2[j - 1] - - # skipping a character in s1 - a = lev[i - 1][j] + 1 - # skipping a character in s2 - b = lev[i][j - 1] + 1 - # substitution - c = lev[i - 1][j - 1] + (substitution_cost if c1 != c2 else 0) - - # transposition - d = c + 1 # never picked by default - if transpositions and last_left > 0 and last_right > 0: - d = lev[last_left - 1][last_right - 1] + i - last_left + j - last_right - 1 - - # pick the cheapest - lev[i][j] = min(a, b, c, d) - - -def edit_distance(s1, s2, substitution_cost=1, transpositions=False): - """ - Calculate the Levenshtein edit-distance between two strings. - The edit distance is the number of characters that need to be - substituted, inserted, or deleted, to transform s1 into s2. For - example, transforming "rain" to "shine" requires three steps, - consisting of two substitutions and one insertion: - "rain" -> "sain" -> "shin" -> "shine". These operations could have - been done in other orders, but at least three steps are needed. - - Allows specifying the cost of substitution edits (e.g., "a" -> "b"), - because sometimes it makes sense to assign greater penalties to - substitutions. - - This also optionally allows transposition edits (e.g., "ab" -> "ba"), - though this is disabled by default. - - :param s1, s2: The strings to be analysed - :param transpositions: Whether to allow transposition edits - :type s1: str - :type s2: str - :type substitution_cost: int - :type transpositions: bool - :rtype: int - """ - # set up a 2-D array - len1 = len(s1) - len2 = len(s2) - lev = _edit_dist_init(len1 + 1, len2 + 1) - - # retrieve alphabet - sigma = set() - sigma.update(s1) - sigma.update(s2) - - # set up table to remember positions of last seen occurrence in s1 - last_left_t = _last_left_t_init(sigma) - - # iterate over the array - # i and j start from 1 and not 0 to stay close to the wikipedia pseudo-code - # see https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance - for i in range(1, len1 + 1): - last_right_buf = 0 - for j in range(1, len2 + 1): - last_left = last_left_t[s2[j - 1]] - last_right = last_right_buf - if s1[i - 1] == s2[j - 1]: - last_right_buf = j - _edit_dist_step( - lev, - i, - j, - s1, - s2, - last_left, - last_right, - substitution_cost=substitution_cost, - transpositions=transpositions, - ) - last_left_t[s1[i - 1]] = i - return lev[len1][len2] - - -def _edit_dist_backtrace(lev): - i, j = len(lev) - 1, len(lev[0]) - 1 - alignment = [(i, j)] - - while (i, j) != (0, 0): - directions = [ - (i - 1, j - 1), # substitution - (i - 1, j), # skip s1 - (i, j - 1), # skip s2 - ] - - direction_costs = ( - (lev[i][j] if (i >= 0 and j >= 0) else float("inf"), (i, j)) - for i, j in directions - ) - _, (i, j) = min(direction_costs, key=operator.itemgetter(0)) - - alignment.append((i, j)) - return list(reversed(alignment)) - - -def edit_distance_align(s1, s2, substitution_cost=1): - """ - Calculate the minimum Levenshtein edit-distance based alignment - mapping between two strings. The alignment finds the mapping - from string s1 to s2 that minimizes the edit distance cost. 
- For example, mapping "rain" to "shine" would involve 2 - substitutions, 2 matches and an insertion resulting in - the following mapping: - [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (4, 5)] - NB: (0, 0) is the start state without any letters associated - See more: https://web.stanford.edu/class/cs124/lec/med.pdf - - In case of multiple valid minimum-distance alignments, the - backtrace has the following operation precedence: - - 1. Substitute s1 and s2 characters - 2. Skip s1 character - 3. Skip s2 character - - The backtrace is carried out in reverse string order. - - This function does not support transposition. - - :param s1, s2: The strings to be aligned - :type s1: str - :type s2: str - :type substitution_cost: int - :rtype: List[Tuple(int, int)] - """ - # set up a 2-D array - len1 = len(s1) - len2 = len(s2) - lev = _edit_dist_init(len1 + 1, len2 + 1) - - # iterate over the array - for i in range(len1): - for j in range(len2): - _edit_dist_step( - lev, - i + 1, - j + 1, - s1, - s2, - 0, - 0, - substitution_cost=substitution_cost, - transpositions=False, - ) - - # backtrace to find alignment - alignment = _edit_dist_backtrace(lev) - return alignment - - -def binary_distance(label1, label2): - """Simple equality test. - - 0.0 if the labels are identical, 1.0 if they are different. - - >>> from nltk.metrics import binary_distance - >>> binary_distance(1,1) - 0.0 - - >>> binary_distance(1,3) - 1.0 - """ - - return 0.0 if label1 == label2 else 1.0 - - -def jaccard_distance(label1, label2): - """Distance metric comparing set-similarity.""" - return (len(label1.union(label2)) - len(label1.intersection(label2))) / len( - label1.union(label2) - ) - - -def masi_distance(label1, label2): - """Distance metric that takes into account partial agreement when multiple - labels are assigned. - - >>> from nltk.metrics import masi_distance - >>> masi_distance(set([1, 2]), set([1, 2, 3, 4])) - 0.665 - - Passonneau 2006, Measuring Agreement on Set-Valued Items (MASI) - for Semantic and Pragmatic Annotation. 
- """ - - len_intersection = len(label1.intersection(label2)) - len_union = len(label1.union(label2)) - len_label1 = len(label1) - len_label2 = len(label2) - if len_label1 == len_label2 and len_label1 == len_intersection: - m = 1 - elif len_intersection == min(len_label1, len_label2): - m = 0.67 - elif len_intersection > 0: - m = 0.33 - else: - m = 0 - - return 1 - len_intersection / len_union * m - - -def interval_distance(label1, label2): - """Krippendorff's interval distance metric - - >>> from nltk.metrics import interval_distance - >>> interval_distance(1,10) - 81 - - Krippendorff 1980, Content Analysis: An Introduction to its Methodology - """ - - try: - return pow(label1 - label2, 2) - # return pow(list(label1)[0]-list(label2)[0],2) - except: - print("non-numeric labels not supported with interval distance") - - -def presence(label): - """Higher-order function to test presence of a given label""" - - return lambda x, y: 1.0 * ((label in x) == (label in y)) - - -def fractional_presence(label): - return ( - lambda x, y: abs((1.0 / len(x)) - (1.0 / len(y))) * (label in x and label in y) - or 0.0 * (label not in x and label not in y) - or abs(1.0 / len(x)) * (label in x and label not in y) - or (1.0 / len(y)) * (label not in x and label in y) - ) - - -def custom_distance(file): - data = {} - with open(file) as infile: - for l in infile: - labelA, labelB, dist = l.strip().split("\t") - labelA = frozenset([labelA]) - labelB = frozenset([labelB]) - data[frozenset([labelA, labelB])] = float(dist) - return lambda x, y: data[frozenset([x, y])] - - -def jaro_similarity(s1, s2): - """ - Computes the Jaro similarity between 2 sequences from: - - Matthew A. Jaro (1989). Advances in record linkage methodology - as applied to the 1985 census of Tampa Florida. Journal of the - American Statistical Association. 84 (406): 414-20. - - The Jaro distance between is the min no. of single-character transpositions - required to change one word into another. The Jaro similarity formula from - https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance : - - ``jaro_sim = 0 if m = 0 else 1/3 * (m/|s_1| + m/s_2 + (m-t)/m)`` - - where - - `|s_i|` is the length of string `s_i` - - `m` is the no. of matching characters - - `t` is the half no. of possible transpositions. - """ - # First, store the length of the strings - # because they will be re-used several times. - len_s1, len_s2 = len(s1), len(s2) - - # The upper bound of the distance for being a matched character. - match_bound = max(len_s1, len_s2) // 2 - 1 - - # Initialize the counts for matches and transpositions. - matches = 0 # no.of matched characters in s1 and s2 - transpositions = 0 # no. of transpositions between s1 and s2 - flagged_1 = [] # positions in s1 which are matches to some character in s2 - flagged_2 = [] # positions in s2 which are matches to some character in s1 - - # Iterate through sequences, check for matches and compute transpositions. - for i in range(len_s1): # Iterate through each character. 
- upperbound = min(i + match_bound, len_s2 - 1) - lowerbound = max(0, i - match_bound) - for j in range(lowerbound, upperbound + 1): - if s1[i] == s2[j] and j not in flagged_2: - matches += 1 - flagged_1.append(i) - flagged_2.append(j) - break - flagged_2.sort() - for i, j in zip(flagged_1, flagged_2): - if s1[i] != s2[j]: - transpositions += 1 - - if matches == 0: - return 0 - else: - return ( - 1 - / 3 - * ( - matches / len_s1 - + matches / len_s2 - + (matches - transpositions // 2) / matches - ) - ) - - -def jaro_winkler_similarity(s1, s2, p=0.1, max_l=4): - """ - The Jaro Winkler distance is an extension of the Jaro similarity in: - - William E. Winkler. 1990. String Comparator Metrics and Enhanced - Decision Rules in the Fellegi-Sunter Model of Record Linkage. - Proceedings of the Section on Survey Research Methods. - American Statistical Association: 354-359. - - such that: - - jaro_winkler_sim = jaro_sim + ( l * p * (1 - jaro_sim) ) - - where, - - - jaro_sim is the output from the Jaro Similarity, - see jaro_similarity() - - l is the length of common prefix at the start of the string - - this implementation provides an upperbound for the l value - to keep the prefixes.A common value of this upperbound is 4. - - p is the constant scaling factor to overweigh common prefixes. - The Jaro-Winkler similarity will fall within the [0, 1] bound, - given that max(p)<=0.25 , default is p=0.1 in Winkler (1990) - - - Test using outputs from https://www.census.gov/srd/papers/pdf/rr93-8.pdf - from "Table 5 Comparison of String Comparators Rescaled between 0 and 1" - - >>> winkler_examples = [("billy", "billy"), ("billy", "bill"), ("billy", "blily"), - ... ("massie", "massey"), ("yvette", "yevett"), ("billy", "bolly"), ("dwayne", "duane"), - ... ("dixon", "dickson"), ("billy", "susan")] - - >>> winkler_scores = [1.000, 0.967, 0.947, 0.944, 0.911, 0.893, 0.858, 0.853, 0.000] - >>> jaro_scores = [1.000, 0.933, 0.933, 0.889, 0.889, 0.867, 0.822, 0.790, 0.000] - - One way to match the values on the Winkler's paper is to provide a different - p scaling factor for different pairs of strings, e.g. - - >>> p_factors = [0.1, 0.125, 0.20, 0.125, 0.20, 0.20, 0.20, 0.15, 0.1] - - >>> for (s1, s2), jscore, wscore, p in zip(winkler_examples, jaro_scores, winkler_scores, p_factors): - ... assert round(jaro_similarity(s1, s2), 3) == jscore - ... assert round(jaro_winkler_similarity(s1, s2, p=p), 3) == wscore - - - Test using outputs from https://www.census.gov/srd/papers/pdf/rr94-5.pdf from - "Table 2.1. Comparison of String Comparators Using Last Names, First Names, and Street Names" - - >>> winkler_examples = [('SHACKLEFORD', 'SHACKELFORD'), ('DUNNINGHAM', 'CUNNIGHAM'), - ... ('NICHLESON', 'NICHULSON'), ('JONES', 'JOHNSON'), ('MASSEY', 'MASSIE'), - ... ('ABROMS', 'ABRAMS'), ('HARDIN', 'MARTINEZ'), ('ITMAN', 'SMITH'), - ... ('JERALDINE', 'GERALDINE'), ('MARHTA', 'MARTHA'), ('MICHELLE', 'MICHAEL'), - ... ('JULIES', 'JULIUS'), ('TANYA', 'TONYA'), ('DWAYNE', 'DUANE'), ('SEAN', 'SUSAN'), - ... ('JON', 'JOHN'), ('JON', 'JAN'), ('BROOKHAVEN', 'BRROKHAVEN'), - ... ('BROOK HALLOW', 'BROOK HLLW'), ('DECATUR', 'DECATIR'), ('FITZRUREITER', 'FITZENREITER'), - ... ('HIGBEE', 'HIGHEE'), ('HIGBEE', 'HIGVEE'), ('LACURA', 'LOCURA'), ('IOWA', 'IONA'), ('1ST', 'IST')] - - >>> jaro_scores = [0.970, 0.896, 0.926, 0.790, 0.889, 0.889, 0.722, 0.467, 0.926, - ... 0.944, 0.869, 0.889, 0.867, 0.822, 0.783, 0.917, 0.000, 0.933, 0.944, 0.905, - ... 
0.856, 0.889, 0.889, 0.889, 0.833, 0.000] - - >>> winkler_scores = [0.982, 0.896, 0.956, 0.832, 0.944, 0.922, 0.722, 0.467, 0.926, - ... 0.961, 0.921, 0.933, 0.880, 0.858, 0.805, 0.933, 0.000, 0.947, 0.967, 0.943, - ... 0.913, 0.922, 0.922, 0.900, 0.867, 0.000] - - One way to match the values on the Winkler's paper is to provide a different - p scaling factor for different pairs of strings, e.g. - - >>> p_factors = [0.1, 0.1, 0.1, 0.1, 0.125, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.20, - ... 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1] - - - >>> for (s1, s2), jscore, wscore, p in zip(winkler_examples, jaro_scores, winkler_scores, p_factors): - ... if (s1, s2) in [('JON', 'JAN'), ('1ST', 'IST')]: - ... continue # Skip bad examples from the paper. - ... assert round(jaro_similarity(s1, s2), 3) == jscore - ... assert round(jaro_winkler_similarity(s1, s2, p=p), 3) == wscore - - - - This test-case proves that the output of Jaro-Winkler similarity depends on - the product l * p and not on the product max_l * p. Here the product max_l * p > 1 - however the product l * p <= 1 - - >>> round(jaro_winkler_similarity('TANYA', 'TONYA', p=0.1, max_l=100), 3) - 0.88 - """ - # To ensure that the output of the Jaro-Winkler's similarity - # falls between [0,1], the product of l * p needs to be - # also fall between [0,1]. - if not 0 <= max_l * p <= 1: - warnings.warn( - str( - "The product `max_l * p` might not fall between [0,1]." - "Jaro-Winkler similarity might not be between 0 and 1." - ) - ) - - # Compute the Jaro similarity - jaro_sim = jaro_similarity(s1, s2) - - # Initialize the upper bound for the no. of prefixes. - # if user did not pre-define the upperbound, - # use shorter length between s1 and s2 - - # Compute the prefix matches. - l = 0 - # zip() will automatically loop until the end of shorter string. - for s1_i, s2_i in zip(s1, s2): - if s1_i == s2_i: - l += 1 - else: - break - if l == max_l: - break - # Return the similarity value as described in docstring. - return jaro_sim + (l * p * (1 - jaro_sim)) - - -def demo(): - string_distance_examples = [ - ("rain", "shine"), - ("abcdef", "acbdef"), - ("language", "lnaguaeg"), - ("language", "lnaugage"), - ("language", "lngauage"), - ] - for s1, s2 in string_distance_examples: - print(f"Edit distance btwn '{s1}' and '{s2}':", edit_distance(s1, s2)) - print( - f"Edit dist with transpositions btwn '{s1}' and '{s2}':", - edit_distance(s1, s2, transpositions=True), - ) - print(f"Jaro similarity btwn '{s1}' and '{s2}':", jaro_similarity(s1, s2)) - print( - f"Jaro-Winkler similarity btwn '{s1}' and '{s2}':", - jaro_winkler_similarity(s1, s2), - ) - print( - f"Jaro-Winkler distance btwn '{s1}' and '{s2}':", - 1 - jaro_winkler_similarity(s1, s2), - ) - s1 = {1, 2, 3, 4} - s2 = {3, 4, 5} - print("s1:", s1) - print("s2:", s2) - print("Binary distance:", binary_distance(s1, s2)) - print("Jaccard distance:", jaccard_distance(s1, s2)) - print("MASI distance:", masi_distance(s1, s2)) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/metrics/paice.py b/pipeline/nltk/metrics/paice.py deleted file mode 100644 index bf7de1930b61654f9120a2ec2cd5bf6ef090fc47..0000000000000000000000000000000000000000 --- a/pipeline/nltk/metrics/paice.py +++ /dev/null @@ -1,389 +0,0 @@ -# Natural Language Toolkit: Agreement Metrics -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Lauri Hallila -# URL: -# For license information, see LICENSE.TXT -# - -"""Counts Paice's performance statistics for evaluating stemming algorithms. 
- -What is required: - - A dictionary of words grouped by their real lemmas - - A dictionary of words grouped by stems from a stemming algorithm - -When these are given, Understemming Index (UI), Overstemming Index (OI), -Stemming Weight (SW) and Error-rate relative to truncation (ERRT) are counted. - -References: -Chris D. Paice (1994). An evaluation method for stemming algorithms. -In Proceedings of SIGIR, 42--50. -""" - -from math import sqrt - - -def get_words_from_dictionary(lemmas): - """ - Get original set of words used for analysis. - - :param lemmas: A dictionary where keys are lemmas and values are sets - or lists of words corresponding to that lemma. - :type lemmas: dict(str): list(str) - :return: Set of words that exist as values in the dictionary - :rtype: set(str) - """ - words = set() - for lemma in lemmas: - words.update(set(lemmas[lemma])) - return words - - -def _truncate(words, cutlength): - """Group words by stems defined by truncating them at given length. - - :param words: Set of words used for analysis - :param cutlength: Words are stemmed by cutting at this length. - :type words: set(str) or list(str) - :type cutlength: int - :return: Dictionary where keys are stems and values are sets of words - corresponding to that stem. - :rtype: dict(str): set(str) - """ - stems = {} - for word in words: - stem = word[:cutlength] - try: - stems[stem].update([word]) - except KeyError: - stems[stem] = {word} - return stems - - -# Reference: https://en.wikipedia.org/wiki/Line-line_intersection -def _count_intersection(l1, l2): - """Count intersection between two line segments defined by coordinate pairs. - - :param l1: Tuple of two coordinate pairs defining the first line segment - :param l2: Tuple of two coordinate pairs defining the second line segment - :type l1: tuple(float, float) - :type l2: tuple(float, float) - :return: Coordinates of the intersection - :rtype: tuple(float, float) - """ - x1, y1 = l1[0] - x2, y2 = l1[1] - x3, y3 = l2[0] - x4, y4 = l2[1] - - denominator = (x1 - x2) * (y3 - y4) - (y1 - y2) * (x3 - x4) - - if denominator == 0.0: # lines are parallel - if x1 == x2 == x3 == x4 == 0.0: - # When lines are parallel, they must be on the y-axis. - # We can ignore x-axis because we stop counting the - # truncation line when we get there. - # There are no other options as UI (x-axis) grows and - # OI (y-axis) diminishes when we go along the truncation line. - return (0.0, y4) - - x = ( - (x1 * y2 - y1 * x2) * (x3 - x4) - (x1 - x2) * (x3 * y4 - y3 * x4) - ) / denominator - y = ( - (x1 * y2 - y1 * x2) * (y3 - y4) - (y1 - y2) * (x3 * y4 - y3 * x4) - ) / denominator - return (x, y) - - -def _get_derivative(coordinates): - """Get derivative of the line from (0,0) to given coordinates. - - :param coordinates: A coordinate pair - :type coordinates: tuple(float, float) - :return: Derivative; inf if x is zero - :rtype: float - """ - try: - return coordinates[1] / coordinates[0] - except ZeroDivisionError: - return float("inf") - - -def _calculate_cut(lemmawords, stems): - """Count understemmed and overstemmed pairs for (lemma, stem) pair with common words. - - :param lemmawords: Set or list of words corresponding to certain lemma. - :param stems: A dictionary where keys are stems and values are sets - or lists of words corresponding to that stem. - :type lemmawords: set(str) or list(str) - :type stems: dict(str): set(str) - :return: Amount of understemmed and overstemmed pairs contributed by words - existing in both lemmawords and stems. 
- :rtype: tuple(float, float) - """ - umt, wmt = 0.0, 0.0 - for stem in stems: - cut = set(lemmawords) & set(stems[stem]) - if cut: - cutcount = len(cut) - stemcount = len(stems[stem]) - # Unachieved merge total - umt += cutcount * (len(lemmawords) - cutcount) - # Wrongly merged total - wmt += cutcount * (stemcount - cutcount) - return (umt, wmt) - - -def _calculate(lemmas, stems): - """Calculate actual and maximum possible amounts of understemmed and overstemmed word pairs. - - :param lemmas: A dictionary where keys are lemmas and values are sets - or lists of words corresponding to that lemma. - :param stems: A dictionary where keys are stems and values are sets - or lists of words corresponding to that stem. - :type lemmas: dict(str): list(str) - :type stems: dict(str): set(str) - :return: Global unachieved merge total (gumt), - global desired merge total (gdmt), - global wrongly merged total (gwmt) and - global desired non-merge total (gdnt). - :rtype: tuple(float, float, float, float) - """ - - n = sum(len(lemmas[word]) for word in lemmas) - - gdmt, gdnt, gumt, gwmt = (0.0, 0.0, 0.0, 0.0) - - for lemma in lemmas: - lemmacount = len(lemmas[lemma]) - - # Desired merge total - gdmt += lemmacount * (lemmacount - 1) - - # Desired non-merge total - gdnt += lemmacount * (n - lemmacount) - - # For each (lemma, stem) pair with common words, count how many - # pairs are understemmed and overstemmed. - umt, wmt = _calculate_cut(lemmas[lemma], stems) - - # Add to total undesired and wrongly-merged totals - gumt += umt - gwmt += wmt - - # Each object is counted twice, so divide by two - return (gumt / 2, gdmt / 2, gwmt / 2, gdnt / 2) - - -def _indexes(gumt, gdmt, gwmt, gdnt): - """Count Understemming Index (UI), Overstemming Index (OI) and Stemming Weight (SW). - - :param gumt, gdmt, gwmt, gdnt: Global unachieved merge total (gumt), - global desired merge total (gdmt), - global wrongly merged total (gwmt) and - global desired non-merge total (gdnt). - :type gumt, gdmt, gwmt, gdnt: float - :return: Understemming Index (UI), - Overstemming Index (OI) and - Stemming Weight (SW). - :rtype: tuple(float, float, float) - """ - # Calculate Understemming Index (UI), - # Overstemming Index (OI) and Stemming Weight (SW) - try: - ui = gumt / gdmt - except ZeroDivisionError: - # If GDMT (max merge total) is 0, define UI as 0 - ui = 0.0 - try: - oi = gwmt / gdnt - except ZeroDivisionError: - # IF GDNT (max non-merge total) is 0, define OI as 0 - oi = 0.0 - try: - sw = oi / ui - except ZeroDivisionError: - if oi == 0.0: - # OI and UI are 0, define SW as 'not a number' - sw = float("nan") - else: - # UI is 0, define SW as infinity - sw = float("inf") - return (ui, oi, sw) - - -class Paice: - """Class for storing lemmas, stems and evaluation metrics.""" - - def __init__(self, lemmas, stems): - """ - :param lemmas: A dictionary where keys are lemmas and values are sets - or lists of words corresponding to that lemma. - :param stems: A dictionary where keys are stems and values are sets - or lists of words corresponding to that stem. 
- :type lemmas: dict(str): list(str) - :type stems: dict(str): set(str) - """ - self.lemmas = lemmas - self.stems = stems - self.coords = [] - self.gumt, self.gdmt, self.gwmt, self.gdnt = (None, None, None, None) - self.ui, self.oi, self.sw = (None, None, None) - self.errt = None - self.update() - - def __str__(self): - text = ["Global Unachieved Merge Total (GUMT): %s\n" % self.gumt] - text.append("Global Desired Merge Total (GDMT): %s\n" % self.gdmt) - text.append("Global Wrongly-Merged Total (GWMT): %s\n" % self.gwmt) - text.append("Global Desired Non-merge Total (GDNT): %s\n" % self.gdnt) - text.append("Understemming Index (GUMT / GDMT): %s\n" % self.ui) - text.append("Overstemming Index (GWMT / GDNT): %s\n" % self.oi) - text.append("Stemming Weight (OI / UI): %s\n" % self.sw) - text.append("Error-Rate Relative to Truncation (ERRT): %s\r\n" % self.errt) - coordinates = " ".join(["(%s, %s)" % item for item in self.coords]) - text.append("Truncation line: %s" % coordinates) - return "".join(text) - - def _get_truncation_indexes(self, words, cutlength): - """Count (UI, OI) when stemming is done by truncating words at \'cutlength\'. - - :param words: Words used for the analysis - :param cutlength: Words are stemmed by cutting them at this length - :type words: set(str) or list(str) - :type cutlength: int - :return: Understemming and overstemming indexes - :rtype: tuple(int, int) - """ - - truncated = _truncate(words, cutlength) - gumt, gdmt, gwmt, gdnt = _calculate(self.lemmas, truncated) - ui, oi = _indexes(gumt, gdmt, gwmt, gdnt)[:2] - return (ui, oi) - - def _get_truncation_coordinates(self, cutlength=0): - """Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line. - - :param cutlength: Optional parameter to start counting from (ui, oi) - coordinates gotten by stemming at this length. Useful for speeding up - the calculations when you know the approximate location of the - intersection. - :type cutlength: int - :return: List of coordinate pairs that define the truncation line - :rtype: list(tuple(float, float)) - """ - words = get_words_from_dictionary(self.lemmas) - maxlength = max(len(word) for word in words) - - # Truncate words from different points until (0, 0) - (ui, oi) segment crosses the truncation line - coords = [] - while cutlength <= maxlength: - # Get (UI, OI) pair of current truncation point - pair = self._get_truncation_indexes(words, cutlength) - - # Store only new coordinates so we'll have an actual - # line segment when counting the intersection point - if pair not in coords: - coords.append(pair) - if pair == (0.0, 0.0): - # Stop counting if truncation line goes through origo; - # length from origo to truncation line is 0 - return coords - if len(coords) >= 2 and pair[0] > 0.0: - derivative1 = _get_derivative(coords[-2]) - derivative2 = _get_derivative(coords[-1]) - # Derivative of the truncation line is a decreasing value; - # when it passes Stemming Weight, we've found the segment - # of truncation line intersecting with (0, 0) - (ui, oi) segment - if derivative1 >= self.sw >= derivative2: - return coords - cutlength += 1 - return coords - - def _errt(self): - """Count Error-Rate Relative to Truncation (ERRT). - - :return: ERRT, length of the line from origo to (UI, OI) divided by - the length of the line from origo to the point defined by the same - line when extended until the truncation line. 
- :rtype: float - """ - # Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line - self.coords = self._get_truncation_coordinates() - if (0.0, 0.0) in self.coords: - # Truncation line goes through origo, so ERRT cannot be counted - if (self.ui, self.oi) != (0.0, 0.0): - return float("inf") - else: - return float("nan") - if (self.ui, self.oi) == (0.0, 0.0): - # (ui, oi) is origo; define errt as 0.0 - return 0.0 - # Count the intersection point - # Note that (self.ui, self.oi) cannot be (0.0, 0.0) and self.coords has different coordinates - # so we have actual line segments instead of a line segment and a point - intersection = _count_intersection( - ((0, 0), (self.ui, self.oi)), self.coords[-2:] - ) - # Count OP (length of the line from origo to (ui, oi)) - op = sqrt(self.ui**2 + self.oi**2) - # Count OT (length of the line from origo to truncation line that goes through (ui, oi)) - ot = sqrt(intersection[0] ** 2 + intersection[1] ** 2) - # OP / OT tells how well the stemming algorithm works compared to just truncating words - return op / ot - - def update(self): - """Update statistics after lemmas and stems have been set.""" - self.gumt, self.gdmt, self.gwmt, self.gdnt = _calculate(self.lemmas, self.stems) - self.ui, self.oi, self.sw = _indexes(self.gumt, self.gdmt, self.gwmt, self.gdnt) - self.errt = self._errt() - - -def demo(): - """Demonstration of the module.""" - # Some words with their real lemmas - lemmas = { - "kneel": ["kneel", "knelt"], - "range": ["range", "ranged"], - "ring": ["ring", "rang", "rung"], - } - # Same words with stems from a stemming algorithm - stems = { - "kneel": ["kneel"], - "knelt": ["knelt"], - "rang": ["rang", "range", "ranged"], - "ring": ["ring"], - "rung": ["rung"], - } - print("Words grouped by their lemmas:") - for lemma in sorted(lemmas): - print("{} => {}".format(lemma, " ".join(lemmas[lemma]))) - print() - print("Same words grouped by a stemming algorithm:") - for stem in sorted(stems): - print("{} => {}".format(stem, " ".join(stems[stem]))) - print() - p = Paice(lemmas, stems) - print(p) - print() - # Let's "change" results from a stemming algorithm - stems = { - "kneel": ["kneel"], - "knelt": ["knelt"], - "rang": ["rang"], - "range": ["range", "ranged"], - "ring": ["ring"], - "rung": ["rung"], - } - print("Counting stats after changing stemming results:") - for stem in sorted(stems): - print("{} => {}".format(stem, " ".join(stems[stem]))) - print() - p.stems = stems - p.update() - print(p) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/metrics/scores.py b/pipeline/nltk/metrics/scores.py deleted file mode 100644 index 0d6d296aa62893788de65cdd0cdf3f5480a161f2..0000000000000000000000000000000000000000 --- a/pipeline/nltk/metrics/scores.py +++ /dev/null @@ -1,228 +0,0 @@ -# Natural Language Toolkit: Evaluation -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird -# URL: -# For license information, see LICENSE.TXT - -import operator -from functools import reduce -from math import fabs -from random import shuffle - -try: - from scipy.stats.stats import betai -except ImportError: - betai = None - -from nltk.util import LazyConcatenation, LazyMap - - -def accuracy(reference, test): - """ - Given a list of reference values and a corresponding list of test - values, return the fraction of corresponding values that are - equal. 
In particular, return the fraction of indices - ``0= actual_stat: - c += 1 - - if verbose and i % 10 == 0: - print("pseudo-statistic: %f" % pseudo_stat) - print("significance: %f" % ((c + 1) / (i + 1))) - print("-" * 60) - - significance = (c + 1) / (shuffles + 1) - - if verbose: - print("significance: %f" % significance) - if betai: - for phi in [0.01, 0.05, 0.10, 0.15, 0.25, 0.50]: - print(f"prob(phi<={phi:f}): {betai(c, shuffles, phi):f}") - - return (significance, c, shuffles) - - -def demo(): - print("-" * 75) - reference = "DET NN VB DET JJ NN NN IN DET NN".split() - test = "DET VB VB DET NN NN NN IN DET NN".split() - print("Reference =", reference) - print("Test =", test) - print("Accuracy:", accuracy(reference, test)) - - print("-" * 75) - reference_set = set(reference) - test_set = set(test) - print("Reference =", reference_set) - print("Test = ", test_set) - print("Precision:", precision(reference_set, test_set)) - print(" Recall:", recall(reference_set, test_set)) - print("F-Measure:", f_measure(reference_set, test_set)) - print("-" * 75) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/metrics/segmentation.py b/pipeline/nltk/metrics/segmentation.py deleted file mode 100644 index 518197d35dff62ce5735b8e76fb5939b19ebedd2..0000000000000000000000000000000000000000 --- a/pipeline/nltk/metrics/segmentation.py +++ /dev/null @@ -1,222 +0,0 @@ -# Natural Language Toolkit: Text Segmentation Metrics -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird -# David Doukhan -# URL: -# For license information, see LICENSE.TXT - - -""" -Text Segmentation Metrics - -1. Windowdiff - -Pevzner, L., and Hearst, M., A Critique and Improvement of - an Evaluation Metric for Text Segmentation, - Computational Linguistics 28, 19-36 - - -2. Generalized Hamming Distance - -Bookstein A., Kulyukin V.A., Raita T. -Generalized Hamming Distance -Information Retrieval 5, 2002, pp 353-375 - -Baseline implementation in C++ -http://digital.cs.usu.edu/~vkulyukin/vkweb/software/ghd/ghd.html - -Study describing benefits of Generalized Hamming Distance Versus -WindowDiff for evaluating text segmentation tasks -Begsten, Y. Quel indice pour mesurer l'efficacite en segmentation de textes ? -TALN 2009 - - -3. Pk text segmentation metric - -Beeferman D., Berger A., Lafferty J. (1999) -Statistical Models for Text Segmentation -Machine Learning, 34, 177-210 -""" - -try: - import numpy as np -except ImportError: - pass - - -def windowdiff(seg1, seg2, k, boundary="1", weighted=False): - """ - Compute the windowdiff score for a pair of segmentations. A - segmentation is any sequence over a vocabulary of two items - (e.g. "0", "1"), where the specified boundary value is used to - mark the edge of a segmentation. 
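The randomization test above counts how often a shuffled pseudo-statistic meets or exceeds the actual statistic. Below is a minimal, self-contained sketch of that idea; the difference-of-means statistic and the pool-and-resplit shuffle are assumptions for illustration, not taken from the original code.

# Sketch of a shuffle-based significance test in the spirit of the counting
# above. Assumptions (not from the original): statistic = |mean(a) - mean(b)|,
# and each shuffle pools the two samples and re-splits them.
from random import shuffle

def mean_diff(xs, ys):
    return abs(sum(xs) / len(xs) - sum(ys) / len(ys))

def shuffle_test(a, b, shuffles=999):
    actual = mean_diff(a, b)
    pooled = list(a) + list(b)
    c = 0
    for _ in range(shuffles):
        shuffle(pooled)
        if mean_diff(pooled[:len(a)], pooled[len(a):]) >= actual:
            c += 1
    return (c + 1) / (shuffles + 1)   # same smoothed count as above

print(shuffle_test([1, 1, 1, 0, 1, 1], [0, 1, 0, 0, 1, 0]))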
- - >>> s1 = "000100000010" - >>> s2 = "000010000100" - >>> s3 = "100000010000" - >>> '%.2f' % windowdiff(s1, s1, 3) - '0.00' - >>> '%.2f' % windowdiff(s1, s2, 3) - '0.30' - >>> '%.2f' % windowdiff(s2, s3, 3) - '0.80' - - :param seg1: a segmentation - :type seg1: str or list - :param seg2: a segmentation - :type seg2: str or list - :param k: window width - :type k: int - :param boundary: boundary value - :type boundary: str or int or bool - :param weighted: use the weighted variant of windowdiff - :type weighted: boolean - :rtype: float - """ - - if len(seg1) != len(seg2): - raise ValueError("Segmentations have unequal length") - if k > len(seg1): - raise ValueError( - "Window width k should be smaller or equal than segmentation lengths" - ) - wd = 0 - for i in range(len(seg1) - k + 1): - ndiff = abs(seg1[i : i + k].count(boundary) - seg2[i : i + k].count(boundary)) - if weighted: - wd += ndiff - else: - wd += min(1, ndiff) - return wd / (len(seg1) - k + 1.0) - - -# Generalized Hamming Distance - - -def _init_mat(nrows, ncols, ins_cost, del_cost): - mat = np.empty((nrows, ncols)) - mat[0, :] = ins_cost * np.arange(ncols) - mat[:, 0] = del_cost * np.arange(nrows) - return mat - - -def _ghd_aux(mat, rowv, colv, ins_cost, del_cost, shift_cost_coeff): - for i, rowi in enumerate(rowv): - for j, colj in enumerate(colv): - shift_cost = shift_cost_coeff * abs(rowi - colj) + mat[i, j] - if rowi == colj: - # boundaries are at the same location, no transformation required - tcost = mat[i, j] - elif rowi > colj: - # boundary match through a deletion - tcost = del_cost + mat[i, j + 1] - else: - # boundary match through an insertion - tcost = ins_cost + mat[i + 1, j] - mat[i + 1, j + 1] = min(tcost, shift_cost) - - -def ghd(ref, hyp, ins_cost=2.0, del_cost=2.0, shift_cost_coeff=1.0, boundary="1"): - """ - Compute the Generalized Hamming Distance for a reference and a hypothetical - segmentation, corresponding to the cost related to the transformation - of the hypothetical segmentation into the reference segmentation - through boundary insertion, deletion and shift operations. - - A segmentation is any sequence over a vocabulary of two items - (e.g. "0", "1"), where the specified boundary value is used to - mark the edge of a segmentation. - - Recommended parameter values are a shift_cost_coeff of 2. - Associated with a ins_cost, and del_cost equal to the mean segment - length in the reference segmentation. - - >>> # Same examples as Kulyukin C++ implementation - >>> ghd('1100100000', '1100010000', 1.0, 1.0, 0.5) - 0.5 - >>> ghd('1100100000', '1100000001', 1.0, 1.0, 0.5) - 2.0 - >>> ghd('011', '110', 1.0, 1.0, 0.5) - 1.0 - >>> ghd('1', '0', 1.0, 1.0, 0.5) - 1.0 - >>> ghd('111', '000', 1.0, 1.0, 0.5) - 3.0 - >>> ghd('000', '111', 1.0, 2.0, 0.5) - 6.0 - - :param ref: the reference segmentation - :type ref: str or list - :param hyp: the hypothetical segmentation - :type hyp: str or list - :param ins_cost: insertion cost - :type ins_cost: float - :param del_cost: deletion cost - :type del_cost: float - :param shift_cost_coeff: constant used to compute the cost of a shift. 
- ``shift cost = shift_cost_coeff * |i - j|`` where ``i`` and ``j`` - are the positions indicating the shift - :type shift_cost_coeff: float - :param boundary: boundary value - :type boundary: str or int or bool - :rtype: float - """ - - ref_idx = [i for (i, val) in enumerate(ref) if val == boundary] - hyp_idx = [i for (i, val) in enumerate(hyp) if val == boundary] - - nref_bound = len(ref_idx) - nhyp_bound = len(hyp_idx) - - if nref_bound == 0 and nhyp_bound == 0: - return 0.0 - elif nref_bound > 0 and nhyp_bound == 0: - return nref_bound * ins_cost - elif nref_bound == 0 and nhyp_bound > 0: - return nhyp_bound * del_cost - - mat = _init_mat(nhyp_bound + 1, nref_bound + 1, ins_cost, del_cost) - _ghd_aux(mat, hyp_idx, ref_idx, ins_cost, del_cost, shift_cost_coeff) - return mat[-1, -1] - - -# Beeferman's Pk text segmentation evaluation metric - - -def pk(ref, hyp, k=None, boundary="1"): - """ - Compute the Pk metric for a pair of segmentations A segmentation - is any sequence over a vocabulary of two items (e.g. "0", "1"), - where the specified boundary value is used to mark the edge of a - segmentation. - - >>> '%.2f' % pk('0100'*100, '1'*400, 2) - '0.50' - >>> '%.2f' % pk('0100'*100, '0'*400, 2) - '0.50' - >>> '%.2f' % pk('0100'*100, '0100'*100, 2) - '0.00' - - :param ref: the reference segmentation - :type ref: str or list - :param hyp: the segmentation to evaluate - :type hyp: str or list - :param k: window size, if None, set to half of the average reference segment length - :type boundary: str or int or bool - :param boundary: boundary value - :type boundary: str or int or bool - :rtype: float - """ - - if k is None: - k = int(round(len(ref) / (ref.count(boundary) * 2.0))) - - err = 0 - for i in range(len(ref) - k + 1): - r = ref[i : i + k].count(boundary) > 0 - h = hyp[i : i + k].count(boundary) > 0 - if r != h: - err += 1 - return err / (len(ref) - k + 1.0) diff --git a/pipeline/nltk/metrics/spearman.py b/pipeline/nltk/metrics/spearman.py deleted file mode 100644 index 8b2987d371a2af218c7223e155b335315663fd2c..0000000000000000000000000000000000000000 --- a/pipeline/nltk/metrics/spearman.py +++ /dev/null @@ -1,68 +0,0 @@ -# Natural Language Toolkit: Spearman Rank Correlation -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Joel Nothman -# URL: -# For license information, see LICENSE.TXT - -""" -Tools for comparing ranked lists. -""" - - -def _rank_dists(ranks1, ranks2): - """Finds the difference between the values in ranks1 and ranks2 for keys - present in both dicts. If the arguments are not dicts, they are converted - from (key, rank) sequences. - """ - ranks1 = dict(ranks1) - ranks2 = dict(ranks2) - for k in ranks1: - try: - yield k, ranks1[k] - ranks2[k] - except KeyError: - pass - - -def spearman_correlation(ranks1, ranks2): - """Returns the Spearman correlation coefficient for two rankings, which - should be dicts or sequences of (key, rank). The coefficient ranges from - -1.0 (ranks are opposite) to 1.0 (ranks are identical), and is only - calculated for keys in both rankings (for meaningful results, remove keys - present in only one list before ranking).""" - n = 0 - res = 0 - for k, d in _rank_dists(ranks1, ranks2): - res += d * d - n += 1 - try: - return 1 - (6 * res / (n * (n * n - 1))) - except ZeroDivisionError: - # Result is undefined if only one item is ranked - return 0.0 - - -def ranks_from_sequence(seq): - """Given a sequence, yields each element with an increasing rank, suitable - for use as an argument to ``spearman_correlation``. 
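A tiny worked example of the rank-difference formula above, using two invented orderings of the same keys:

# Toy check of the Spearman formula: 1 - 6 * sum(d^2) / (n * (n^2 - 1)).
system_a = ["the", "of", "and", "to", "in"]     # hypothetical ranked lists
system_b = ["the", "and", "of", "in", "to"]

ranks_a = {word: rank for rank, word in enumerate(system_a)}
ranks_b = {word: rank for rank, word in enumerate(system_b)}

n = len(ranks_a)
d_squared = sum((ranks_a[w] - ranks_b[w]) ** 2 for w in ranks_a)
rho = 1 - (6 * d_squared / (n * (n * n - 1)))
print(rho)  # 0.8 for these two orderings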
- """ - return ((k, i) for i, k in enumerate(seq)) - - -def ranks_from_scores(scores, rank_gap=1e-15): - """Given a sequence of (key, score) tuples, yields each key with an - increasing rank, tying with previous key's rank if the difference between - their scores is less than rank_gap. Suitable for use as an argument to - ``spearman_correlation``. - """ - prev_score = None - rank = 0 - for i, (key, score) in enumerate(scores): - try: - if abs(score - prev_score) > rank_gap: - rank = i - except TypeError: - pass - - yield key, rank - prev_score = score diff --git a/pipeline/nltk/misc/__init__.py b/pipeline/nltk/misc/__init__.py deleted file mode 100644 index 8ac9e0f7125810319ed560d0cdfdc0c1f0114b18..0000000000000000000000000000000000000000 --- a/pipeline/nltk/misc/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -# Natural Language Toolkit: Miscellaneous modules -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# URL: -# For license information, see LICENSE.TXT - -from nltk.misc.babelfish import babelize_shell -from nltk.misc.chomsky import generate_chomsky -from nltk.misc.minimalset import MinimalSet -from nltk.misc.wordfinder import word_finder diff --git a/pipeline/nltk/misc/__pycache__/__init__.cpython-39.pyc b/pipeline/nltk/misc/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index 3a61cdb2905d5aa3e812284a1c70d17d9361ff5d..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/misc/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/misc/__pycache__/babelfish.cpython-39.pyc b/pipeline/nltk/misc/__pycache__/babelfish.cpython-39.pyc deleted file mode 100644 index 15e66c8778ae1242d582f8b8960cb60b117f29be..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/misc/__pycache__/babelfish.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/misc/__pycache__/chomsky.cpython-39.pyc b/pipeline/nltk/misc/__pycache__/chomsky.cpython-39.pyc deleted file mode 100644 index f2b5a016757de5055bc7d7c5263ca104cc5d0386..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/misc/__pycache__/chomsky.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/misc/__pycache__/minimalset.cpython-39.pyc b/pipeline/nltk/misc/__pycache__/minimalset.cpython-39.pyc deleted file mode 100644 index 3221c24c74cf9d231efb476d5ae85499c385c184..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/misc/__pycache__/minimalset.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/misc/__pycache__/sort.cpython-39.pyc b/pipeline/nltk/misc/__pycache__/sort.cpython-39.pyc deleted file mode 100644 index 0b51ed9dc824e1816e022292273d0be1bd6b3c88..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/misc/__pycache__/sort.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/misc/__pycache__/wordfinder.cpython-39.pyc b/pipeline/nltk/misc/__pycache__/wordfinder.cpython-39.pyc deleted file mode 100644 index b0a5b55a3fd56b30f99d3472417dbd1de76ec67f..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/misc/__pycache__/wordfinder.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/misc/babelfish.py b/pipeline/nltk/misc/babelfish.py deleted file mode 100644 index d317d65a194578e28ffad94bd53803395b5e3c58..0000000000000000000000000000000000000000 --- a/pipeline/nltk/misc/babelfish.py +++ /dev/null @@ -1,10 +0,0 @@ -""" -This module previously provided an interface to Babelfish online -translation service; this service is no longer available; this 
-module is kept in NLTK source code in order to provide better error -messages for people following the NLTK Book 2.0. -""" - - -def babelize_shell(): - print("Babelfish online translation service is no longer available.") diff --git a/pipeline/nltk/misc/chomsky.py b/pipeline/nltk/misc/chomsky.py deleted file mode 100644 index 0632bca034512041b3e0cf9a6231f8ac1c131e4b..0000000000000000000000000000000000000000 --- a/pipeline/nltk/misc/chomsky.py +++ /dev/null @@ -1,134 +0,0 @@ -# Chomsky random text generator, version 1.1, Raymond Hettinger, 2005/09/13 -# https://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/440546 - -""" -CHOMSKY is an aid to writing linguistic papers in the style -of the great master. It is based on selected phrases taken -from actual books and articles written by Noam Chomsky. -Upon request, it assembles the phrases in the elegant -stylistic patterns that Chomsky is noted for. -To generate n sentences of linguistic wisdom, type - - (CHOMSKY n) -- for example - (CHOMSKY 5) generates half a screen of linguistic truth. -""" - -leadins = """To characterize a linguistic level L, - On the other hand, - This suggests that - It appears that - Furthermore, - We will bring evidence in favor of the following thesis: - To provide a constituent structure for T(Z,K), - From C1, it follows that - For any transformation which is sufficiently diversified in \ -application to be of any interest, - Analogously, - Clearly, - Note that - Of course, - Suppose, for instance, that - Thus - With this clarification, - Conversely, - We have already seen that - By combining adjunctions and certain deformations, - I suggested that these results would follow from the assumption that - If the position of the trace in (99c) were only relatively \ -inaccessible to movement, - However, this assumption is not correct, since - Comparing these examples with their parasitic gap counterparts in \ -(96) and (97), we see that - In the discussion of resumptive pronouns following (81), - So far, - Nevertheless, - For one thing, - Summarizing, then, we assume that - A consequence of the approach just outlined is that - Presumably, - On our assumptions, - It may be, then, that - It must be emphasized, once again, that - Let us continue to suppose that - Notice, incidentally, that """ -# List of LEADINs to buy time. - -subjects = """ the notion of level of grammaticalness - a case of semigrammaticalness of a different sort - most of the methodological work in modern linguistics - a subset of English sentences interesting on quite independent grounds - the natural general principle that will subsume this case - an important property of these three types of EC - any associated supporting element - the appearance of parasitic gaps in domains relatively inaccessible \ -to ordinary extraction - the speaker-hearer's linguistic intuition - the descriptive power of the base component - the earlier discussion of deviance - this analysis of a formative as a pair of sets of features - this selectionally introduced contextual feature - a descriptively adequate grammar - the fundamental error of regarding functional notions as categorial - relational information - the systematic use of complex symbols - the theory of syntactic features developed earlier""" -# List of SUBJECTs chosen for maximum professorial macho. 
- -verbs = """can be defined in such a way as to impose - delimits - suffices to account for - cannot be arbitrary in - is not subject to - does not readily tolerate - raises serious doubts about - is not quite equivalent to - does not affect the structure of - may remedy and, at the same time, eliminate - is not to be considered in determining - is to be regarded as - is unspecified with respect to - is, apparently, determined by - is necessary to impose an interpretation on - appears to correlate rather closely with - is rather different from""" -# List of VERBs chosen for autorecursive obfuscation. - -objects = """ problems of phonemic and morphological analysis. - a corpus of utterance tokens upon which conformity has been defined \ -by the paired utterance test. - the traditional practice of grammarians. - the levels of acceptability from fairly high (e.g. (99a)) to virtual \ -gibberish (e.g. (98d)). - a stipulation to place the constructions into these various categories. - a descriptive fact. - a parasitic gap construction. - the extended c-command discussed in connection with (34). - the ultimate standard that determines the accuracy of any proposed grammar. - the system of base rules exclusive of the lexicon. - irrelevant intervening contexts in selectional rules. - nondistinctness in the sense of distinctive feature theory. - a general convention regarding the forms of the grammar. - an abstract underlying order. - an important distinction in language use. - the requirement that branching is not tolerated within the dominance \ -scope of a complex symbol. - the strong generative capacity of the theory.""" -# List of OBJECTs selected for profound sententiousness. - -import random -import textwrap -from itertools import chain, islice - - -def generate_chomsky(times=5, line_length=72): - parts = [] - for part in (leadins, subjects, verbs, objects): - phraselist = list(map(str.strip, part.splitlines())) - random.shuffle(phraselist) - parts.append(phraselist) - output = chain.from_iterable(islice(zip(*parts), 0, times)) - print(textwrap.fill(" ".join(output), line_length)) - - -if __name__ == "__main__": - generate_chomsky() diff --git a/pipeline/nltk/misc/minimalset.py b/pipeline/nltk/misc/minimalset.py deleted file mode 100644 index 50d1fa5b6e45193d15e6fa1d2aec687de503f1d2..0000000000000000000000000000000000000000 --- a/pipeline/nltk/misc/minimalset.py +++ /dev/null @@ -1,85 +0,0 @@ -# Natural Language Toolkit: Minimal Sets -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# URL: -# For license information, see LICENSE.TXT - -from collections import defaultdict - - -class MinimalSet: - """ - Find contexts where more than one possible target value can - appear. E.g. if targets are word-initial letters, and contexts - are the remainders of words, then we would like to find cases like - "fat" vs "cat", and "training" vs "draining". If targets are - parts-of-speech and contexts are words, then we would like to find - cases like wind (noun) 'air in rapid motion', vs wind (verb) - 'coil, wrap'. - """ - - def __init__(self, parameters=None): - """ - Create a new minimal set. 
- - :param parameters: The (context, target, display) tuples for the item - :type parameters: list(tuple(str, str, str)) - """ - self._targets = set() # the contrastive information - self._contexts = set() # what we are controlling for - self._seen = defaultdict(set) # to record what we have seen - self._displays = {} # what we will display - - if parameters: - for context, target, display in parameters: - self.add(context, target, display) - - def add(self, context, target, display): - """ - Add a new item to the minimal set, having the specified - context, target, and display form. - - :param context: The context in which the item of interest appears - :type context: str - :param target: The item of interest - :type target: str - :param display: The information to be reported for each item - :type display: str - """ - # Store the set of targets that occurred in this context - self._seen[context].add(target) - - # Keep track of which contexts and targets we have seen - self._contexts.add(context) - self._targets.add(target) - - # For a given context and target, store the display form - self._displays[(context, target)] = display - - def contexts(self, minimum=2): - """ - Determine which contexts occurred with enough distinct targets. - - :param minimum: the minimum number of distinct target forms - :type minimum: int - :rtype: list - """ - return [c for c in self._contexts if len(self._seen[c]) >= minimum] - - def display(self, context, target, default=""): - if (context, target) in self._displays: - return self._displays[(context, target)] - else: - return default - - def display_all(self, context): - result = [] - for target in self._targets: - x = self.display(context, target) - if x: - result.append(x) - return result - - def targets(self): - return self._targets diff --git a/pipeline/nltk/misc/sort.py b/pipeline/nltk/misc/sort.py deleted file mode 100644 index cb543d93929f45505475f9d985afea5e92f58a94..0000000000000000000000000000000000000000 --- a/pipeline/nltk/misc/sort.py +++ /dev/null @@ -1,176 +0,0 @@ -# Natural Language Toolkit: List Sorting -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# URL: -# For license information, see LICENSE.TXT - -""" -This module provides a variety of list sorting algorithms, to -illustrate the many different algorithms (recipes) for solving a -problem, and how to analyze algorithms experimentally. -""" -# These algorithms are taken from: -# Levitin (2004) The Design and Analysis of Algorithms - -################################################################## -# Selection Sort -################################################################## - - -def selection(a): - """ - Selection Sort: scan the list to find its smallest element, then - swap it with the first element. The remainder of the list is one - element smaller; apply the same method to this list, and so on. - """ - count = 0 - - for i in range(len(a) - 1): - min = i - - for j in range(i + 1, len(a)): - if a[j] < a[min]: - min = j - - count += 1 - - a[min], a[i] = a[i], a[min] - - return count - - -################################################################## -# Bubble Sort -################################################################## - - -def bubble(a): - """ - Bubble Sort: compare adjacent elements of the list left-to-right, - and swap them if they are out of order. After one pass through - the list swapping adjacent items, the largest item will be in - the rightmost position. 
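A short usage sketch for the MinimalSet class shown above, assuming an installed nltk provides it under nltk.misc.minimalset; the word list is invented:

# Usage sketch (hypothetical data): contexts are words minus their first
# letter, targets are the first letters, so contrasting contexts surface
# pairs like "cat"/"fat" and "training"/"draining".
from nltk.misc.minimalset import MinimalSet

words = ["cat", "fat", "hat", "training", "draining", "dog"]
minimal = MinimalSet((word[1:], word[0], word) for word in words)

for context in minimal.contexts(minimum=2):
    print(context, "->", " ".join(minimal.display_all(context)))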
The remainder is one element smaller; - apply the same method to this list, and so on. - """ - count = 0 - for i in range(len(a) - 1): - for j in range(len(a) - i - 1): - if a[j + 1] < a[j]: - a[j], a[j + 1] = a[j + 1], a[j] - count += 1 - return count - - -################################################################## -# Merge Sort -################################################################## - - -def _merge_lists(b, c): - count = 0 - i = j = 0 - a = [] - while i < len(b) and j < len(c): - count += 1 - if b[i] <= c[j]: - a.append(b[i]) - i += 1 - else: - a.append(c[j]) - j += 1 - if i == len(b): - a += c[j:] - else: - a += b[i:] - return a, count - - -def merge(a): - """ - Merge Sort: split the list in half, and sort each half, then - combine the sorted halves. - """ - count = 0 - if len(a) > 1: - midpoint = len(a) // 2 - b = a[:midpoint] - c = a[midpoint:] - count_b = merge(b) - count_c = merge(c) - result, count_a = _merge_lists(b, c) - a[:] = result # copy the result back into a. - count = count_a + count_b + count_c - return count - - -################################################################## -# Quick Sort -################################################################## - - -def _partition(a, l, r): - p = a[l] - i = l - j = r + 1 - count = 0 - while True: - while i < r: - i += 1 - if a[i] >= p: - break - while j > l: - j -= 1 - if j < l or a[j] <= p: - break - a[i], a[j] = a[j], a[i] # swap - count += 1 - if i >= j: - break - a[i], a[j] = a[j], a[i] # undo last swap - a[l], a[j] = a[j], a[l] - return j, count - - -def _quick(a, l, r): - count = 0 - if l < r: - s, count = _partition(a, l, r) - count += _quick(a, l, s - 1) - count += _quick(a, s + 1, r) - return count - - -def quick(a): - return _quick(a, 0, len(a) - 1) - - -################################################################## -# Demonstration -################################################################## - - -def demo(): - from random import shuffle - - for size in (10, 20, 50, 100, 200, 500, 1000): - a = list(range(size)) - - # various sort methods - shuffle(a) - count_selection = selection(a) - shuffle(a) - count_bubble = bubble(a) - shuffle(a) - count_merge = merge(a) - shuffle(a) - count_quick = quick(a) - - print( - ("size=%5d: selection=%8d, bubble=%8d, " "merge=%6d, quick=%6d") - % (size, count_selection, count_bubble, count_merge, count_quick) - ) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/misc/wordfinder.py b/pipeline/nltk/misc/wordfinder.py deleted file mode 100644 index e8ddca0dd6282e988ad38d287ae1029dadc98dfc..0000000000000000000000000000000000000000 --- a/pipeline/nltk/misc/wordfinder.py +++ /dev/null @@ -1,139 +0,0 @@ -# Natural Language Toolkit: Word Finder -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# URL: -# For license information, see LICENSE.TXT - -# Simplified from PHP version by Robert Klein -# http://fswordfinder.sourceforge.net/ - -import random - - -# reverse a word with probability 0.5 -def revword(word): - if random.randint(1, 2) == 1: - return word[::-1] - return word - - -# try to insert word at position x,y; direction encoded in xf,yf -def step(word, x, xf, y, yf, grid): - for i in range(len(word)): - if grid[xf(i)][yf(i)] != "" and grid[xf(i)][yf(i)] != word[i]: - return False - for i in range(len(word)): - grid[xf(i)][yf(i)] = word[i] - return True - - -# try to insert word at position x,y, in direction dir -def check(word, dir, x, y, grid, rows, cols): - if dir == 1: - if x - len(word) < 0 or y - len(word) < 0: - 
return False - return step(word, x, lambda i: x - i, y, lambda i: y - i, grid) - elif dir == 2: - if x - len(word) < 0: - return False - return step(word, x, lambda i: x - i, y, lambda i: y, grid) - elif dir == 3: - if x - len(word) < 0 or y + (len(word) - 1) >= cols: - return False - return step(word, x, lambda i: x - i, y, lambda i: y + i, grid) - elif dir == 4: - if y - len(word) < 0: - return False - return step(word, x, lambda i: x, y, lambda i: y - i, grid) - - -def wordfinder(words, rows=20, cols=20, attempts=50, alph="ABCDEFGHIJKLMNOPQRSTUVWXYZ"): - """ - Attempt to arrange words into a letter-grid with the specified - number of rows and columns. Try each word in several positions - and directions, until it can be fitted into the grid, or the - maximum number of allowable attempts is exceeded. Returns a tuple - consisting of the grid and the words that were successfully - placed. - - :param words: the list of words to be put into the grid - :type words: list - :param rows: the number of rows in the grid - :type rows: int - :param cols: the number of columns in the grid - :type cols: int - :param attempts: the number of times to attempt placing a word - :type attempts: int - :param alph: the alphabet, to be used for filling blank cells - :type alph: list - :rtype: tuple - """ - - # place longer words first - words = sorted(words, key=len, reverse=True) - - grid = [] # the letter grid - used = [] # the words we used - - # initialize the grid - for i in range(rows): - grid.append([""] * cols) - - # try to place each word - for word in words: - word = word.strip().upper() # normalize - save = word # keep a record of the word - word = revword(word) - for attempt in range(attempts): - r = random.randint(0, len(word)) - dir = random.choice([1, 2, 3, 4]) - x = random.randint(0, rows) - y = random.randint(0, cols) - if dir == 1: - x += r - y += r - elif dir == 2: - x += r - elif dir == 3: - x += r - y -= r - elif dir == 4: - y += r - if 0 <= x < rows and 0 <= y < cols: - if check(word, dir, x, y, grid, rows, cols): - # used.append((save, dir, x, y, word)) - used.append(save) - break - - # Fill up the remaining spaces - for i in range(rows): - for j in range(cols): - if grid[i][j] == "": - grid[i][j] = random.choice(alph) - - return grid, used - - -def word_finder(): - from nltk.corpus import words - - wordlist = words.words() - random.shuffle(wordlist) - wordlist = wordlist[:200] - wordlist = [w for w in wordlist if 3 <= len(w) <= 12] - grid, used = wordfinder(wordlist) - - print("Word Finder\n") - for i in range(len(grid)): - for j in range(len(grid[i])): - print(grid[i][j], end=" ") - print() - print() - - for i in range(len(used)): - print("%d:" % (i + 1), used[i]) - - -if __name__ == "__main__": - word_finder() diff --git a/pipeline/nltk/parse/__init__.py b/pipeline/nltk/parse/__init__.py deleted file mode 100644 index 82600563c78bd7fb762777967a43454ffd7ab226..0000000000000000000000000000000000000000 --- a/pipeline/nltk/parse/__init__.py +++ /dev/null @@ -1,102 +0,0 @@ -# Natural Language Toolkit: Parsers -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# Edward Loper -# URL: -# For license information, see LICENSE.TXT -# - -""" -NLTK Parsers - -Classes and interfaces for producing tree structures that represent -the internal organization of a text. This task is known as "parsing" -the text, and the resulting tree structures are called the text's -"parses". 
Typically, the text is a single sentence, and the tree -structure represents the syntactic structure of the sentence. -However, parsers can also be used in other domains. For example, -parsers can be used to derive the morphological structure of the -morphemes that make up a word, or to derive the discourse structure -for a set of utterances. - -Sometimes, a single piece of text can be represented by more than one -tree structure. Texts represented by more than one tree structure are -called "ambiguous" texts. Note that there are actually two ways in -which a text can be ambiguous: - - - The text has multiple correct parses. - - There is not enough information to decide which of several - candidate parses is correct. - -However, the parser module does *not* distinguish these two types of -ambiguity. - -The parser module defines ``ParserI``, a standard interface for parsing -texts; and two simple implementations of that interface, -``ShiftReduceParser`` and ``RecursiveDescentParser``. It also contains -three sub-modules for specialized kinds of parsing: - - - ``nltk.parser.chart`` defines chart parsing, which uses dynamic - programming to efficiently parse texts. - - ``nltk.parser.probabilistic`` defines probabilistic parsing, which - associates a probability with each parse. -""" - -from nltk.parse.api import ParserI -from nltk.parse.bllip import BllipParser -from nltk.parse.chart import ( - BottomUpChartParser, - BottomUpLeftCornerChartParser, - ChartParser, - LeftCornerChartParser, - SteppingChartParser, - TopDownChartParser, -) -from nltk.parse.corenlp import CoreNLPDependencyParser, CoreNLPParser -from nltk.parse.dependencygraph import DependencyGraph -from nltk.parse.earleychart import ( - EarleyChartParser, - FeatureEarleyChartParser, - FeatureIncrementalBottomUpChartParser, - FeatureIncrementalBottomUpLeftCornerChartParser, - FeatureIncrementalChartParser, - FeatureIncrementalTopDownChartParser, - IncrementalBottomUpChartParser, - IncrementalBottomUpLeftCornerChartParser, - IncrementalChartParser, - IncrementalLeftCornerChartParser, - IncrementalTopDownChartParser, -) -from nltk.parse.evaluate import DependencyEvaluator -from nltk.parse.featurechart import ( - FeatureBottomUpChartParser, - FeatureBottomUpLeftCornerChartParser, - FeatureChartParser, - FeatureTopDownChartParser, -) -from nltk.parse.malt import MaltParser -from nltk.parse.nonprojectivedependencyparser import ( - NaiveBayesDependencyScorer, - NonprojectiveDependencyParser, - ProbabilisticNonprojectiveParser, -) -from nltk.parse.pchart import ( - BottomUpProbabilisticChartParser, - InsideChartParser, - LongestChartParser, - RandomChartParser, - UnsortedChartParser, -) -from nltk.parse.projectivedependencyparser import ( - ProbabilisticProjectiveDependencyParser, - ProjectiveDependencyParser, -) -from nltk.parse.recursivedescent import ( - RecursiveDescentParser, - SteppingRecursiveDescentParser, -) -from nltk.parse.shiftreduce import ShiftReduceParser, SteppingShiftReduceParser -from nltk.parse.transitionparser import TransitionParser -from nltk.parse.util import TestGrammar, extract_test_sentences, load_parser -from nltk.parse.viterbi import ViterbiParser diff --git a/pipeline/nltk/parse/__pycache__/__init__.cpython-39.pyc b/pipeline/nltk/parse/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index 9cc6e358a011c09af0ee621ca23e85fe1d037a49..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/parse/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git 
a/pipeline/nltk/parse/__pycache__/api.cpython-39.pyc b/pipeline/nltk/parse/__pycache__/api.cpython-39.pyc deleted file mode 100644 index 51ade861d9f3c6f9dd5a1623a5c3aac5790ce1f0..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/parse/__pycache__/api.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/parse/__pycache__/bllip.cpython-39.pyc b/pipeline/nltk/parse/__pycache__/bllip.cpython-39.pyc deleted file mode 100644 index 1ad18de452a6e7610b5d6431b8eefd396ee90ff7..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/parse/__pycache__/bllip.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/parse/__pycache__/chart.cpython-39.pyc b/pipeline/nltk/parse/__pycache__/chart.cpython-39.pyc deleted file mode 100644 index a97e21009602e8453045f4f2f1a31f8b6f36b21c..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/parse/__pycache__/chart.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/parse/__pycache__/corenlp.cpython-39.pyc b/pipeline/nltk/parse/__pycache__/corenlp.cpython-39.pyc deleted file mode 100644 index a5051a64a61c88dcc473109c94785409ff832542..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/parse/__pycache__/corenlp.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/parse/__pycache__/dependencygraph.cpython-39.pyc b/pipeline/nltk/parse/__pycache__/dependencygraph.cpython-39.pyc deleted file mode 100644 index 15340f930d8ad381b50e510d3d8ec3b1bdbec2ab..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/parse/__pycache__/dependencygraph.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/parse/__pycache__/earleychart.cpython-39.pyc b/pipeline/nltk/parse/__pycache__/earleychart.cpython-39.pyc deleted file mode 100644 index cb2ccd777e2218c0fab3a9d909173b9f424427b7..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/parse/__pycache__/earleychart.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/parse/__pycache__/evaluate.cpython-39.pyc b/pipeline/nltk/parse/__pycache__/evaluate.cpython-39.pyc deleted file mode 100644 index ac53c616395cacb017c7e0cd309e6954d5ed09ab..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/parse/__pycache__/evaluate.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/parse/__pycache__/featurechart.cpython-39.pyc b/pipeline/nltk/parse/__pycache__/featurechart.cpython-39.pyc deleted file mode 100644 index f6aa10e20cbd5b74222741187ef7b4c0683b9fd6..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/parse/__pycache__/featurechart.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/parse/__pycache__/generate.cpython-39.pyc b/pipeline/nltk/parse/__pycache__/generate.cpython-39.pyc deleted file mode 100644 index f50a901bc0ef41f80d79efc409576079b1e8de36..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/parse/__pycache__/generate.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/parse/__pycache__/malt.cpython-39.pyc b/pipeline/nltk/parse/__pycache__/malt.cpython-39.pyc deleted file mode 100644 index 3277c26bc9d18df626057a412d542bc13ceae925..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/parse/__pycache__/malt.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/parse/__pycache__/nonprojectivedependencyparser.cpython-39.pyc b/pipeline/nltk/parse/__pycache__/nonprojectivedependencyparser.cpython-39.pyc deleted file mode 100644 index 
a545be86cab724ec2783a67a65606deb4681a309..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/parse/__pycache__/nonprojectivedependencyparser.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/parse/__pycache__/pchart.cpython-39.pyc b/pipeline/nltk/parse/__pycache__/pchart.cpython-39.pyc deleted file mode 100644 index 72c9516a04fffe56cbf6374d9b5f62adee938cb3..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/parse/__pycache__/pchart.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/parse/__pycache__/projectivedependencyparser.cpython-39.pyc b/pipeline/nltk/parse/__pycache__/projectivedependencyparser.cpython-39.pyc deleted file mode 100644 index 36137cc101993dd07b611d399cb70ba41b553325..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/parse/__pycache__/projectivedependencyparser.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/parse/__pycache__/recursivedescent.cpython-39.pyc b/pipeline/nltk/parse/__pycache__/recursivedescent.cpython-39.pyc deleted file mode 100644 index 30ee154c0767872b95db6db44d7859ee5156ed77..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/parse/__pycache__/recursivedescent.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/parse/__pycache__/shiftreduce.cpython-39.pyc b/pipeline/nltk/parse/__pycache__/shiftreduce.cpython-39.pyc deleted file mode 100644 index 8f55f2639c13d9d3301bd23e30ebe9ded2c183a1..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/parse/__pycache__/shiftreduce.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/parse/__pycache__/stanford.cpython-39.pyc b/pipeline/nltk/parse/__pycache__/stanford.cpython-39.pyc deleted file mode 100644 index 9c78a90139632fbe728c7cb7e7e83e8ff31c4b43..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/parse/__pycache__/stanford.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/parse/__pycache__/transitionparser.cpython-39.pyc b/pipeline/nltk/parse/__pycache__/transitionparser.cpython-39.pyc deleted file mode 100644 index dfc9bd37662b40573df7abb17e562afaa5cc6d54..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/parse/__pycache__/transitionparser.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/parse/__pycache__/util.cpython-39.pyc b/pipeline/nltk/parse/__pycache__/util.cpython-39.pyc deleted file mode 100644 index 1ba4dec47ef145d8e4418e535603ba1a5cd6edbf..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/parse/__pycache__/util.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/parse/__pycache__/viterbi.cpython-39.pyc b/pipeline/nltk/parse/__pycache__/viterbi.cpython-39.pyc deleted file mode 100644 index faabed77a9322bbb233071e3891ec8b1c4395ef7..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/parse/__pycache__/viterbi.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/parse/api.py b/pipeline/nltk/parse/api.py deleted file mode 100644 index 280c1a5a8225e7832ecb6f80e4e96feb25ca4f8d..0000000000000000000000000000000000000000 --- a/pipeline/nltk/parse/api.py +++ /dev/null @@ -1,72 +0,0 @@ -# Natural Language Toolkit: Parser API -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# Edward Loper -# URL: -# For license information, see LICENSE.TXT -# - -import itertools - -from nltk.internals import overridden - - -class ParserI: - """ - A processing class for deriving trees that represent possible - structures for a sequence 
of tokens. These tree structures are - known as "parses". Typically, parsers are used to derive syntax - trees for sentences. But parsers can also be used to derive other - kinds of tree structure, such as morphological trees and discourse - structures. - - Subclasses must define: - - at least one of: ``parse()``, ``parse_sents()``. - - Subclasses may define: - - ``grammar()`` - """ - - def grammar(self): - """ - :return: The grammar used by this parser. - """ - raise NotImplementedError() - - def parse(self, sent, *args, **kwargs): - """ - :return: An iterator that generates parse trees for the sentence. - When possible this list is sorted from most likely to least likely. - - :param sent: The sentence to be parsed - :type sent: list(str) - :rtype: iter(Tree) - """ - if overridden(self.parse_sents): - return next(self.parse_sents([sent], *args, **kwargs)) - elif overridden(self.parse_one): - return ( - tree - for tree in [self.parse_one(sent, *args, **kwargs)] - if tree is not None - ) - elif overridden(self.parse_all): - return iter(self.parse_all(sent, *args, **kwargs)) - else: - raise NotImplementedError() - - def parse_sents(self, sents, *args, **kwargs): - """ - Apply ``self.parse()`` to each element of ``sents``. - :rtype: iter(iter(Tree)) - """ - return (self.parse(sent, *args, **kwargs) for sent in sents) - - def parse_all(self, sent, *args, **kwargs): - """:rtype: list(Tree)""" - return list(self.parse(sent, *args, **kwargs)) - - def parse_one(self, sent, *args, **kwargs): - """:rtype: Tree or None""" - return next(self.parse(sent, *args, **kwargs), None) diff --git a/pipeline/nltk/parse/bllip.py b/pipeline/nltk/parse/bllip.py deleted file mode 100644 index 581ed661c256ca95ed89643516eb6edee8997300..0000000000000000000000000000000000000000 --- a/pipeline/nltk/parse/bllip.py +++ /dev/null @@ -1,299 +0,0 @@ -# Natural Language Toolkit: Interface to BLLIP Parser -# -# Author: David McClosky -# -# Copyright (C) 2001-2023 NLTK Project -# URL: -# For license information, see LICENSE.TXT - -from nltk.parse.api import ParserI -from nltk.tree import Tree - -""" -Interface for parsing with BLLIP Parser. Requires the Python -bllipparser module. BllipParser objects can be constructed with the -``BllipParser.from_unified_model_dir`` class method or manually using the -``BllipParser`` constructor. The former is generally easier if you have -a BLLIP Parser unified model directory -- a basic model can be obtained -from NLTK's downloader. More unified parsing models can be obtained with -BLLIP Parser's ModelFetcher (run ``python -m bllipparser.ModelFetcher`` -or see docs for ``bllipparser.ModelFetcher.download_and_install_model``). - -Basic usage:: - - # download and install a basic unified parsing model (Wall Street Journal) - # sudo python -m nltk.downloader bllip_wsj_no_aux - - >>> from nltk.data import find - >>> model_dir = find('models/bllip_wsj_no_aux').path - >>> bllip = BllipParser.from_unified_model_dir(model_dir) - - # 1-best parsing - >>> sentence1 = 'British left waffles on Falklands .'.split() - >>> top_parse = bllip.parse_one(sentence1) - >>> print(top_parse) - (S1 - (S - (NP (JJ British) (NN left)) - (VP (VBZ waffles) (PP (IN on) (NP (NNP Falklands)))) - (. 
.))) - - # n-best parsing - >>> sentence2 = 'Time flies'.split() - >>> all_parses = bllip.parse_all(sentence2) - >>> print(len(all_parses)) - 50 - >>> print(all_parses[0]) - (S1 (S (NP (NNP Time)) (VP (VBZ flies)))) - - # incorporating external tagging constraints (None means unconstrained tag) - >>> constrained1 = bllip.tagged_parse([('Time', 'VB'), ('flies', 'NNS')]) - >>> print(next(constrained1)) - (S1 (NP (VB Time) (NNS flies))) - >>> constrained2 = bllip.tagged_parse([('Time', 'NN'), ('flies', None)]) - >>> print(next(constrained2)) - (S1 (NP (NN Time) (VBZ flies))) - -References ----------- - -- Charniak, Eugene. "A maximum-entropy-inspired parser." Proceedings of - the 1st North American chapter of the Association for Computational - Linguistics conference. Association for Computational Linguistics, - 2000. - -- Charniak, Eugene, and Mark Johnson. "Coarse-to-fine n-best parsing - and MaxEnt discriminative reranking." Proceedings of the 43rd Annual - Meeting on Association for Computational Linguistics. Association - for Computational Linguistics, 2005. - -Known issues ------------- - -Note that BLLIP Parser is not currently threadsafe. Since this module -uses a SWIG interface, it is potentially unsafe to create multiple -``BllipParser`` objects in the same process. BLLIP Parser currently -has issues with non-ASCII text and will raise an error if given any. - -See https://pypi.python.org/pypi/bllipparser/ for more information -on BLLIP Parser's Python interface. -""" - -__all__ = ["BllipParser"] - -# this block allows this module to be imported even if bllipparser isn't -# available -try: - from bllipparser import RerankingParser - from bllipparser.RerankingParser import get_unified_model_parameters - - def _ensure_bllip_import_or_error(): - pass - -except ImportError as ie: - - def _ensure_bllip_import_or_error(ie=ie): - raise ImportError("Couldn't import bllipparser module: %s" % ie) - - -def _ensure_ascii(words): - try: - for i, word in enumerate(words): - word.encode("ascii") - except UnicodeEncodeError as e: - raise ValueError( - f"Token {i} ({word!r}) is non-ASCII. BLLIP Parser " - "currently doesn't support non-ASCII inputs." - ) from e - - -def _scored_parse_to_nltk_tree(scored_parse): - return Tree.fromstring(str(scored_parse.ptb_parse)) - - -class BllipParser(ParserI): - """ - Interface for parsing with BLLIP Parser. BllipParser objects can be - constructed with the ``BllipParser.from_unified_model_dir`` class - method or manually using the ``BllipParser`` constructor. - """ - - def __init__( - self, - parser_model=None, - reranker_features=None, - reranker_weights=None, - parser_options=None, - reranker_options=None, - ): - """ - Load a BLLIP Parser model from scratch. You'll typically want to - use the ``from_unified_model_dir()`` class method to construct - this object. - - :param parser_model: Path to parser model directory - :type parser_model: str - - :param reranker_features: Path the reranker model's features file - :type reranker_features: str - - :param reranker_weights: Path the reranker model's weights file - :type reranker_weights: str - - :param parser_options: optional dictionary of parser options, see - ``bllipparser.RerankingParser.RerankingParser.load_parser_options()`` - for more information. - :type parser_options: dict(str) - - :param reranker_options: optional - dictionary of reranker options, see - ``bllipparser.RerankingParser.RerankingParser.load_reranker_model()`` - for more information. 
- :type reranker_options: dict(str) - """ - _ensure_bllip_import_or_error() - - parser_options = parser_options or {} - reranker_options = reranker_options or {} - - self.rrp = RerankingParser() - self.rrp.load_parser_model(parser_model, **parser_options) - if reranker_features and reranker_weights: - self.rrp.load_reranker_model( - features_filename=reranker_features, - weights_filename=reranker_weights, - **reranker_options, - ) - - def parse(self, sentence): - """ - Use BLLIP Parser to parse a sentence. Takes a sentence as a list - of words; it will be automatically tagged with this BLLIP Parser - instance's tagger. - - :return: An iterator that generates parse trees for the sentence - from most likely to least likely. - - :param sentence: The sentence to be parsed - :type sentence: list(str) - :rtype: iter(Tree) - """ - _ensure_ascii(sentence) - nbest_list = self.rrp.parse(sentence) - for scored_parse in nbest_list: - yield _scored_parse_to_nltk_tree(scored_parse) - - def tagged_parse(self, word_and_tag_pairs): - """ - Use BLLIP to parse a sentence. Takes a sentence as a list of - (word, tag) tuples; the sentence must have already been tokenized - and tagged. BLLIP will attempt to use the tags provided but may - use others if it can't come up with a complete parse subject - to those constraints. You may also specify a tag as ``None`` - to leave a token's tag unconstrained. - - :return: An iterator that generates parse trees for the sentence - from most likely to least likely. - - :param sentence: Input sentence to parse as (word, tag) pairs - :type sentence: list(tuple(str, str)) - :rtype: iter(Tree) - """ - words = [] - tag_map = {} - for i, (word, tag) in enumerate(word_and_tag_pairs): - words.append(word) - if tag is not None: - tag_map[i] = tag - - _ensure_ascii(words) - nbest_list = self.rrp.parse_tagged(words, tag_map) - for scored_parse in nbest_list: - yield _scored_parse_to_nltk_tree(scored_parse) - - @classmethod - def from_unified_model_dir( - cls, model_dir, parser_options=None, reranker_options=None - ): - """ - Create a ``BllipParser`` object from a unified parsing model - directory. Unified parsing model directories are a standardized - way of storing BLLIP parser and reranker models together on disk. - See ``bllipparser.RerankingParser.get_unified_model_parameters()`` - for more information about unified model directories. - - :return: A ``BllipParser`` object using the parser and reranker - models in the model directory. - - :param model_dir: Path to the unified model directory. - :type model_dir: str - :param parser_options: optional dictionary of parser options, see - ``bllipparser.RerankingParser.RerankingParser.load_parser_options()`` - for more information. - :type parser_options: dict(str) - :param reranker_options: optional dictionary of reranker options, see - ``bllipparser.RerankingParser.RerankingParser.load_reranker_model()`` - for more information. 
- :type reranker_options: dict(str) - :rtype: BllipParser - """ - ( - parser_model_dir, - reranker_features_filename, - reranker_weights_filename, - ) = get_unified_model_parameters(model_dir) - return cls( - parser_model_dir, - reranker_features_filename, - reranker_weights_filename, - parser_options, - reranker_options, - ) - - -def demo(): - """This assumes the Python module bllipparser is installed.""" - - # download and install a basic unified parsing model (Wall Street Journal) - # sudo python -m nltk.downloader bllip_wsj_no_aux - - from nltk.data import find - - model_dir = find("models/bllip_wsj_no_aux").path - - print("Loading BLLIP Parsing models...") - # the easiest way to get started is to use a unified model - bllip = BllipParser.from_unified_model_dir(model_dir) - print("Done.") - - sentence1 = "British left waffles on Falklands .".split() - sentence2 = "I saw the man with the telescope .".split() - # this sentence is known to fail under the WSJ parsing model - fail1 = "# ! ? : -".split() - for sentence in (sentence1, sentence2, fail1): - print("Sentence: %r" % " ".join(sentence)) - try: - tree = next(bllip.parse(sentence)) - print(tree) - except StopIteration: - print("(parse failed)") - - # n-best parsing demo - for i, parse in enumerate(bllip.parse(sentence1)): - print("parse %d:\n%s" % (i, parse)) - - # using external POS tag constraints - print( - "forcing 'tree' to be 'NN':", - next(bllip.tagged_parse([("A", None), ("tree", "NN")])), - ) - print( - "forcing 'A' to be 'DT' and 'tree' to be 'NNP':", - next(bllip.tagged_parse([("A", "DT"), ("tree", "NNP")])), - ) - # constraints don't have to make sense... (though on more complicated - # sentences, they may cause the parse to fail) - print( - "forcing 'A' to be 'NNP':", - next(bllip.tagged_parse([("A", "NNP"), ("tree", None)])), - ) diff --git a/pipeline/nltk/parse/chart.py b/pipeline/nltk/parse/chart.py deleted file mode 100644 index 3f068d7d8ac61f1e46aae99a5ea915c74ac2791b..0000000000000000000000000000000000000000 --- a/pipeline/nltk/parse/chart.py +++ /dev/null @@ -1,1848 +0,0 @@ -# Natural Language Toolkit: A Chart Parser -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird -# Jean Mark Gawron -# Peter Ljunglöf -# URL: -# For license information, see LICENSE.TXT - -""" -Data classes and parser implementations for "chart parsers", which -use dynamic programming to efficiently parse a text. A chart -parser derives parse trees for a text by iteratively adding "edges" -to a "chart." Each edge represents a hypothesis about the tree -structure for a subsequence of the text. The chart is a -"blackboard" for composing and combining these hypotheses. - -When a chart parser begins parsing a text, it creates a new (empty) -chart, spanning the text. It then incrementally adds new edges to the -chart. A set of "chart rules" specifies the conditions under which -new edges should be added to the chart. Once the chart reaches a -stage where none of the chart rules adds any new edges, parsing is -complete. - -Charts are encoded with the ``Chart`` class, and edges are encoded with -the ``TreeEdge`` and ``LeafEdge`` classes. The chart parser module -defines three chart parsers: - - - ``ChartParser`` is a simple and flexible chart parser. Given a - set of chart rules, it will apply those rules to the chart until - no more edges are added. - - - ``SteppingChartParser`` is a subclass of ``ChartParser`` that can - be used to step through the parsing process. 
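The chart-parsing overview above can be exercised end to end with a toy grammar; a minimal sketch, assuming an installed nltk (the grammar and sentence are invented for illustration):

# Minimal chart-parsing sketch: build a toy CFG, let the chart parser apply
# its rules until no new edges are added, and print the resulting parses.
import nltk

grammar = nltk.CFG.fromstring("""
    S -> NP VP
    NP -> Det N
    VP -> V NP
    Det -> 'the'
    N -> 'dog' | 'cat'
    V -> 'chased'
""")

parser = nltk.ChartParser(grammar)
for tree in parser.parse("the dog chased the cat".split()):
    print(tree)  # (S (NP (Det the) (N dog)) (VP (V chased) (NP (Det the) (N cat))))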
-""" - -import itertools -import re -import warnings -from functools import total_ordering - -from nltk.grammar import PCFG, is_nonterminal, is_terminal -from nltk.internals import raise_unorderable_types -from nltk.parse.api import ParserI -from nltk.tree import Tree -from nltk.util import OrderedDict - -######################################################################## -## Edges -######################################################################## - - -@total_ordering -class EdgeI: - """ - A hypothesis about the structure of part of a sentence. - Each edge records the fact that a structure is (partially) - consistent with the sentence. An edge contains: - - - A span, indicating what part of the sentence is - consistent with the hypothesized structure. - - A left-hand side, specifying what kind of structure is - hypothesized. - - A right-hand side, specifying the contents of the - hypothesized structure. - - A dot position, indicating how much of the hypothesized - structure is consistent with the sentence. - - Every edge is either complete or incomplete: - - - An edge is complete if its structure is fully consistent - with the sentence. - - An edge is incomplete if its structure is partially - consistent with the sentence. For every incomplete edge, the - span specifies a possible prefix for the edge's structure. - - There are two kinds of edge: - - - A ``TreeEdge`` records which trees have been found to - be (partially) consistent with the text. - - A ``LeafEdge`` records the tokens occurring in the text. - - The ``EdgeI`` interface provides a common interface to both types - of edge, allowing chart parsers to treat them in a uniform manner. - """ - - def __init__(self): - if self.__class__ == EdgeI: - raise TypeError("Edge is an abstract interface") - - # //////////////////////////////////////////////////////////// - # Span - # //////////////////////////////////////////////////////////// - - def span(self): - """ - Return a tuple ``(s, e)``, where ``tokens[s:e]`` is the - portion of the sentence that is consistent with this - edge's structure. - - :rtype: tuple(int, int) - """ - raise NotImplementedError() - - def start(self): - """ - Return the start index of this edge's span. - - :rtype: int - """ - raise NotImplementedError() - - def end(self): - """ - Return the end index of this edge's span. - - :rtype: int - """ - raise NotImplementedError() - - def length(self): - """ - Return the length of this edge's span. - - :rtype: int - """ - raise NotImplementedError() - - # //////////////////////////////////////////////////////////// - # Left Hand Side - # //////////////////////////////////////////////////////////// - - def lhs(self): - """ - Return this edge's left-hand side, which specifies what kind - of structure is hypothesized by this edge. - - :see: ``TreeEdge`` and ``LeafEdge`` for a description of - the left-hand side values for each edge type. - """ - raise NotImplementedError() - - # //////////////////////////////////////////////////////////// - # Right Hand Side - # //////////////////////////////////////////////////////////// - - def rhs(self): - """ - Return this edge's right-hand side, which specifies - the content of the structure hypothesized by this edge. - - :see: ``TreeEdge`` and ``LeafEdge`` for a description of - the right-hand side values for each edge type. - """ - raise NotImplementedError() - - def dot(self): - """ - Return this edge's dot position, which indicates how much of - the hypothesized structure is consistent with the - sentence. 
In particular, ``self.rhs[:dot]`` is consistent - with ``tokens[self.start():self.end()]``. - - :rtype: int - """ - raise NotImplementedError() - - def nextsym(self): - """ - Return the element of this edge's right-hand side that - immediately follows its dot. - - :rtype: Nonterminal or terminal or None - """ - raise NotImplementedError() - - def is_complete(self): - """ - Return True if this edge's structure is fully consistent - with the text. - - :rtype: bool - """ - raise NotImplementedError() - - def is_incomplete(self): - """ - Return True if this edge's structure is partially consistent - with the text. - - :rtype: bool - """ - raise NotImplementedError() - - # //////////////////////////////////////////////////////////// - # Comparisons & hashing - # //////////////////////////////////////////////////////////// - - def __eq__(self, other): - return ( - self.__class__ is other.__class__ - and self._comparison_key == other._comparison_key - ) - - def __ne__(self, other): - return not self == other - - def __lt__(self, other): - if not isinstance(other, EdgeI): - raise_unorderable_types("<", self, other) - if self.__class__ is other.__class__: - return self._comparison_key < other._comparison_key - else: - return self.__class__.__name__ < other.__class__.__name__ - - def __hash__(self): - try: - return self._hash - except AttributeError: - self._hash = hash(self._comparison_key) - return self._hash - - -class TreeEdge(EdgeI): - """ - An edge that records the fact that a tree is (partially) - consistent with the sentence. A tree edge consists of: - - - A span, indicating what part of the sentence is - consistent with the hypothesized tree. - - A left-hand side, specifying the hypothesized tree's node - value. - - A right-hand side, specifying the hypothesized tree's - children. Each element of the right-hand side is either a - terminal, specifying a token with that terminal as its leaf - value; or a nonterminal, specifying a subtree with that - nonterminal's symbol as its node value. - - A dot position, indicating which children are consistent - with part of the sentence. In particular, if ``dot`` is the - dot position, ``rhs`` is the right-hand size, ``(start,end)`` - is the span, and ``sentence`` is the list of tokens in the - sentence, then ``tokens[start:end]`` can be spanned by the - children specified by ``rhs[:dot]``. - - For more information about edges, see the ``EdgeI`` interface. - """ - - def __init__(self, span, lhs, rhs, dot=0): - """ - Construct a new ``TreeEdge``. - - :type span: tuple(int, int) - :param span: A tuple ``(s, e)``, where ``tokens[s:e]`` is the - portion of the sentence that is consistent with the new - edge's structure. - :type lhs: Nonterminal - :param lhs: The new edge's left-hand side, specifying the - hypothesized tree's node value. - :type rhs: list(Nonterminal and str) - :param rhs: The new edge's right-hand side, specifying the - hypothesized tree's children. - :type dot: int - :param dot: The position of the new edge's dot. This position - specifies what prefix of the production's right hand side - is consistent with the text. In particular, if - ``sentence`` is the list of tokens in the sentence, then - ``okens[span[0]:span[1]]`` can be spanned by the - children specified by ``rhs[:dot]``. 
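The span and dot bookkeeping documented above can be inspected directly on a ``TreeEdge``. A minimal sketch, assuming the upstream ``nltk.parse.chart`` module, which this vendored copy mirrors:

    import nltk
    from nltk.parse.chart import TreeEdge

    prods = nltk.CFG.fromstring("""
    NP -> Det Noun
    Det -> 'a'
    Noun -> 'dog'
    """).productions()
    np_prod = prods[0]                              # NP -> Det Noun

    edge = TreeEdge.from_production(np_prod, 0)
    print(edge)                                     # [0:0] NP -> * Det Noun
    print(edge.is_incomplete(), edge.nextsym())     # True Det

    # Pretend a complete Det was found over tokens[0:1]: the span grows and
    # the dot moves past Det, so Noun becomes the expected symbol.
    edge = edge.move_dot_forward(1)
    print(edge.span(), edge.dot(), edge.nextsym())  # (0, 1) 1 Noun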
- """ - self._span = span - self._lhs = lhs - rhs = tuple(rhs) - self._rhs = rhs - self._dot = dot - self._comparison_key = (span, lhs, rhs, dot) - - @staticmethod - def from_production(production, index): - """ - Return a new ``TreeEdge`` formed from the given production. - The new edge's left-hand side and right-hand side will - be taken from ``production``; its span will be - ``(index,index)``; and its dot position will be ``0``. - - :rtype: TreeEdge - """ - return TreeEdge( - span=(index, index), lhs=production.lhs(), rhs=production.rhs(), dot=0 - ) - - def move_dot_forward(self, new_end): - """ - Return a new ``TreeEdge`` formed from this edge. - The new edge's dot position is increased by ``1``, - and its end index will be replaced by ``new_end``. - - :param new_end: The new end index. - :type new_end: int - :rtype: TreeEdge - """ - return TreeEdge( - span=(self._span[0], new_end), - lhs=self._lhs, - rhs=self._rhs, - dot=self._dot + 1, - ) - - # Accessors - def lhs(self): - return self._lhs - - def span(self): - return self._span - - def start(self): - return self._span[0] - - def end(self): - return self._span[1] - - def length(self): - return self._span[1] - self._span[0] - - def rhs(self): - return self._rhs - - def dot(self): - return self._dot - - def is_complete(self): - return self._dot == len(self._rhs) - - def is_incomplete(self): - return self._dot != len(self._rhs) - - def nextsym(self): - if self._dot >= len(self._rhs): - return None - else: - return self._rhs[self._dot] - - # String representation - def __str__(self): - str = f"[{self._span[0]}:{self._span[1]}] " - str += "%-2r ->" % (self._lhs,) - - for i in range(len(self._rhs)): - if i == self._dot: - str += " *" - str += " %s" % repr(self._rhs[i]) - if len(self._rhs) == self._dot: - str += " *" - return str - - def __repr__(self): - return "[Edge: %s]" % self - - -class LeafEdge(EdgeI): - """ - An edge that records the fact that a leaf value is consistent with - a word in the sentence. A leaf edge consists of: - - - An index, indicating the position of the word. - - A leaf, specifying the word's content. - - A leaf edge's left-hand side is its leaf value, and its right hand - side is ``()``. Its span is ``[index, index+1]``, and its dot - position is ``0``. - """ - - def __init__(self, leaf, index): - """ - Construct a new ``LeafEdge``. - - :param leaf: The new edge's leaf value, specifying the word - that is recorded by this edge. - :param index: The new edge's index, specifying the position of - the word that is recorded by this edge. - """ - self._leaf = leaf - self._index = index - self._comparison_key = (leaf, index) - - # Accessors - def lhs(self): - return self._leaf - - def span(self): - return (self._index, self._index + 1) - - def start(self): - return self._index - - def end(self): - return self._index + 1 - - def length(self): - return 1 - - def rhs(self): - return () - - def dot(self): - return 0 - - def is_complete(self): - return True - - def is_incomplete(self): - return False - - def nextsym(self): - return None - - # String representations - def __str__(self): - return f"[{self._index}:{self._index + 1}] {repr(self._leaf)}" - - def __repr__(self): - return "[Edge: %s]" % (self) - - -######################################################################## -## Chart -######################################################################## - - -class Chart: - """ - A blackboard for hypotheses about the syntactic constituents of a - sentence. 
A chart contains a set of edges, and each edge encodes - a single hypothesis about the structure of some portion of the - sentence. - - The ``select`` method can be used to select a specific collection - of edges. For example ``chart.select(is_complete=True, start=0)`` - yields all complete edges whose start indices are 0. To ensure - the efficiency of these selection operations, ``Chart`` dynamically - creates and maintains an index for each set of attributes that - have been selected on. - - In order to reconstruct the trees that are represented by an edge, - the chart associates each edge with a set of child pointer lists. - A child pointer list is a list of the edges that license an - edge's right-hand side. - - :ivar _tokens: The sentence that the chart covers. - :ivar _num_leaves: The number of tokens. - :ivar _edges: A list of the edges in the chart - :ivar _edge_to_cpls: A dictionary mapping each edge to a set - of child pointer lists that are associated with that edge. - :ivar _indexes: A dictionary mapping tuples of edge attributes - to indices, where each index maps the corresponding edge - attribute values to lists of edges. - """ - - def __init__(self, tokens): - """ - Construct a new chart. The chart is initialized with the - leaf edges corresponding to the terminal leaves. - - :type tokens: list - :param tokens: The sentence that this chart will be used to parse. - """ - # Record the sentence token and the sentence length. - self._tokens = tuple(tokens) - self._num_leaves = len(self._tokens) - - # Initialise the chart. - self.initialize() - - def initialize(self): - """ - Clear the chart. - """ - # A list of edges contained in this chart. - self._edges = [] - - # The set of child pointer lists associated with each edge. - self._edge_to_cpls = {} - - # Indexes mapping attribute values to lists of edges - # (used by select()). - self._indexes = {} - - # //////////////////////////////////////////////////////////// - # Sentence Access - # //////////////////////////////////////////////////////////// - - def num_leaves(self): - """ - Return the number of words in this chart's sentence. - - :rtype: int - """ - return self._num_leaves - - def leaf(self, index): - """ - Return the leaf value of the word at the given index. - - :rtype: str - """ - return self._tokens[index] - - def leaves(self): - """ - Return a list of the leaf values of each word in the - chart's sentence. - - :rtype: list(str) - """ - return self._tokens - - # //////////////////////////////////////////////////////////// - # Edge access - # //////////////////////////////////////////////////////////// - - def edges(self): - """ - Return a list of all edges in this chart. New edges - that are added to the chart after the call to edges() - will *not* be contained in this list. - - :rtype: list(EdgeI) - :see: ``iteredges``, ``select`` - """ - return self._edges[:] - - def iteredges(self): - """ - Return an iterator over the edges in this chart. It is - not guaranteed that new edges which are added to the - chart before the iterator is exhausted will also be generated. - - :rtype: iter(EdgeI) - :see: ``edges``, ``select`` - """ - return iter(self._edges) - - # Iterating over the chart yields its edges. - __iter__ = iteredges - - def num_edges(self): - """ - Return the number of edges contained in this chart. - - :rtype: int - """ - return len(self._edge_to_cpls) - - def select(self, **restrictions): - """ - Return an iterator over the edges in this chart. 
Any - new edges that are added to the chart before the iterator - is exahusted will also be generated. ``restrictions`` - can be used to restrict the set of edges that will be - generated. - - :param span: Only generate edges ``e`` where ``e.span()==span`` - :param start: Only generate edges ``e`` where ``e.start()==start`` - :param end: Only generate edges ``e`` where ``e.end()==end`` - :param length: Only generate edges ``e`` where ``e.length()==length`` - :param lhs: Only generate edges ``e`` where ``e.lhs()==lhs`` - :param rhs: Only generate edges ``e`` where ``e.rhs()==rhs`` - :param nextsym: Only generate edges ``e`` where - ``e.nextsym()==nextsym`` - :param dot: Only generate edges ``e`` where ``e.dot()==dot`` - :param is_complete: Only generate edges ``e`` where - ``e.is_complete()==is_complete`` - :param is_incomplete: Only generate edges ``e`` where - ``e.is_incomplete()==is_incomplete`` - :rtype: iter(EdgeI) - """ - # If there are no restrictions, then return all edges. - if restrictions == {}: - return iter(self._edges) - - # Find the index corresponding to the given restrictions. - restr_keys = sorted(restrictions.keys()) - restr_keys = tuple(restr_keys) - - # If it doesn't exist, then create it. - if restr_keys not in self._indexes: - self._add_index(restr_keys) - - vals = tuple(restrictions[key] for key in restr_keys) - return iter(self._indexes[restr_keys].get(vals, [])) - - def _add_index(self, restr_keys): - """ - A helper function for ``select``, which creates a new index for - a given set of attributes (aka restriction keys). - """ - # Make sure it's a valid index. - for key in restr_keys: - if not hasattr(EdgeI, key): - raise ValueError("Bad restriction: %s" % key) - - # Create the index. - index = self._indexes[restr_keys] = {} - - # Add all existing edges to the index. - for edge in self._edges: - vals = tuple(getattr(edge, key)() for key in restr_keys) - index.setdefault(vals, []).append(edge) - - def _register_with_indexes(self, edge): - """ - A helper function for ``insert``, which registers the new - edge with all existing indexes. - """ - for (restr_keys, index) in self._indexes.items(): - vals = tuple(getattr(edge, key)() for key in restr_keys) - index.setdefault(vals, []).append(edge) - - # //////////////////////////////////////////////////////////// - # Edge Insertion - # //////////////////////////////////////////////////////////// - - def insert_with_backpointer(self, new_edge, previous_edge, child_edge): - """ - Add a new edge to the chart, using a pointer to the previous edge. - """ - cpls = self.child_pointer_lists(previous_edge) - new_cpls = [cpl + (child_edge,) for cpl in cpls] - return self.insert(new_edge, *new_cpls) - - def insert(self, edge, *child_pointer_lists): - """ - Add a new edge to the chart, and return True if this operation - modified the chart. In particular, return true iff the chart - did not already contain ``edge``, or if it did not already associate - ``child_pointer_lists`` with ``edge``. - - :type edge: EdgeI - :param edge: The new edge - :type child_pointer_lists: sequence of tuple(EdgeI) - :param child_pointer_lists: A sequence of lists of the edges that - were used to form this edge. This list is used to reconstruct - the trees (or partial trees) that are associated with ``edge``. - :rtype: bool - """ - # Is it a new edge? - if edge not in self._edge_to_cpls: - # Add it to the list of edges. - self._append_edge(edge) - # Register with indexes. 
- self._register_with_indexes(edge) - - # Get the set of child pointer lists for this edge. - cpls = self._edge_to_cpls.setdefault(edge, OrderedDict()) - chart_was_modified = False - for child_pointer_list in child_pointer_lists: - child_pointer_list = tuple(child_pointer_list) - if child_pointer_list not in cpls: - # It's a new CPL; register it, and return true. - cpls[child_pointer_list] = True - chart_was_modified = True - return chart_was_modified - - def _append_edge(self, edge): - self._edges.append(edge) - - # //////////////////////////////////////////////////////////// - # Tree extraction & child pointer lists - # //////////////////////////////////////////////////////////// - - def parses(self, root, tree_class=Tree): - """ - Return an iterator of the complete tree structures that span - the entire chart, and whose root node is ``root``. - """ - for edge in self.select(start=0, end=self._num_leaves, lhs=root): - yield from self.trees(edge, tree_class=tree_class, complete=True) - - def trees(self, edge, tree_class=Tree, complete=False): - """ - Return an iterator of the tree structures that are associated - with ``edge``. - - If ``edge`` is incomplete, then the unexpanded children will be - encoded as childless subtrees, whose node value is the - corresponding terminal or nonterminal. - - :rtype: list(Tree) - :note: If two trees share a common subtree, then the same - Tree may be used to encode that subtree in - both trees. If you need to eliminate this subtree - sharing, then create a deep copy of each tree. - """ - return iter(self._trees(edge, complete, memo={}, tree_class=tree_class)) - - def _trees(self, edge, complete, memo, tree_class): - """ - A helper function for ``trees``. - - :param memo: A dictionary used to record the trees that we've - generated for each edge, so that when we see an edge more - than once, we can reuse the same trees. - """ - # If we've seen this edge before, then reuse our old answer. - if edge in memo: - return memo[edge] - - # when we're reading trees off the chart, don't use incomplete edges - if complete and edge.is_incomplete(): - return [] - - # Leaf edges. - if isinstance(edge, LeafEdge): - leaf = self._tokens[edge.start()] - memo[edge] = [leaf] - return [leaf] - - # Until we're done computing the trees for edge, set - # memo[edge] to be empty. This has the effect of filtering - # out any cyclic trees (i.e., trees that contain themselves as - # descendants), because if we reach this edge via a cycle, - # then it will appear that the edge doesn't generate any trees. - memo[edge] = [] - trees = [] - lhs = edge.lhs().symbol() - - # Each child pointer list can be used to form trees. - for cpl in self.child_pointer_lists(edge): - # Get the set of child choices for each child pointer. - # child_choices[i] is the set of choices for the tree's - # ith child. - child_choices = [self._trees(cp, complete, memo, tree_class) for cp in cpl] - - # For each combination of children, add a tree. - for children in itertools.product(*child_choices): - trees.append(tree_class(lhs, children)) - - # If the edge is incomplete, then extend it with "partial trees": - if edge.is_incomplete(): - unexpanded = [tree_class(elt, []) for elt in edge.rhs()[edge.dot() :]] - for tree in trees: - tree.extend(unexpanded) - - # Update the memoization dictionary. - memo[edge] = trees - - # Return the list of trees. - return trees - - def child_pointer_lists(self, edge): - """ - Return the set of child pointer lists for the given edge. 
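Once a sentence has been chart-parsed, ``select()`` and ``parses()`` behave as documented above. A short sketch against the upstream nltk package, using ``chart_parse()`` from the ``ChartParser`` defined later in this module:

    import nltk

    grammar = nltk.CFG.fromstring("""
    S -> NP VP
    NP -> 'John' | Det N
    VP -> V NP
    Det -> 'a'
    N -> 'dog'
    V -> 'saw'
    """)

    chart = nltk.ChartParser(grammar).chart_parse("John saw a dog".split())

    # Complete edges that start at token 0; the (is_complete, start) index
    # is created lazily the first time this restriction is used.
    for edge in chart.select(is_complete=True, start=0):
        print(edge)

    # Complete S edges spanning the whole input yield the full parse trees.
    for tree in chart.parses(grammar.start()):
        print(tree)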
- Each child pointer list is a list of edges that have - been used to form this edge. - - :rtype: list(list(EdgeI)) - """ - # Make a copy, in case they modify it. - return self._edge_to_cpls.get(edge, {}).keys() - - # //////////////////////////////////////////////////////////// - # Display - # //////////////////////////////////////////////////////////// - def pretty_format_edge(self, edge, width=None): - """ - Return a pretty-printed string representation of a given edge - in this chart. - - :rtype: str - :param width: The number of characters allotted to each - index in the sentence. - """ - if width is None: - width = 50 // (self.num_leaves() + 1) - (start, end) = (edge.start(), edge.end()) - - str = "|" + ("." + " " * (width - 1)) * start - - # Zero-width edges are "#" if complete, ">" if incomplete - if start == end: - if edge.is_complete(): - str += "#" - else: - str += ">" - - # Spanning complete edges are "[===]"; Other edges are - # "[---]" if complete, "[--->" if incomplete - elif edge.is_complete() and edge.span() == (0, self._num_leaves): - str += "[" + ("=" * width) * (end - start - 1) + "=" * (width - 1) + "]" - elif edge.is_complete(): - str += "[" + ("-" * width) * (end - start - 1) + "-" * (width - 1) + "]" - else: - str += "[" + ("-" * width) * (end - start - 1) + "-" * (width - 1) + ">" - - str += (" " * (width - 1) + ".") * (self._num_leaves - end) - return str + "| %s" % edge - - def pretty_format_leaves(self, width=None): - """ - Return a pretty-printed string representation of this - chart's leaves. This string can be used as a header - for calls to ``pretty_format_edge``. - """ - if width is None: - width = 50 // (self.num_leaves() + 1) - - if self._tokens is not None and width > 1: - header = "|." - for tok in self._tokens: - header += tok[: width - 1].center(width - 1) + "." - header += "|" - else: - header = "" - - return header - - def pretty_format(self, width=None): - """ - Return a pretty-printed string representation of this chart. - - :param width: The number of characters allotted to each - index in the sentence. - :rtype: str - """ - if width is None: - width = 50 // (self.num_leaves() + 1) - # sort edges: primary key=length, secondary key=start index. - # (and filter out the token edges) - edges = sorted((e.length(), e.start(), e) for e in self) - edges = [e for (_, _, e) in edges] - - return ( - self.pretty_format_leaves(width) - + "\n" - + "\n".join(self.pretty_format_edge(edge, width) for edge in edges) - ) - - # //////////////////////////////////////////////////////////// - # Display: Dot (AT&T Graphviz) - # //////////////////////////////////////////////////////////// - - def dot_digraph(self): - # Header - s = "digraph nltk_chart {\n" - # s += ' size="5,5";\n' - s += " rankdir=LR;\n" - s += " node [height=0.1,width=0.1];\n" - s += ' node [style=filled, color="lightgray"];\n' - - # Set up the nodes - for y in range(self.num_edges(), -1, -1): - if y == 0: - s += ' node [style=filled, color="black"];\n' - for x in range(self.num_leaves() + 1): - if y == 0 or ( - x <= self._edges[y - 1].start() or x >= self._edges[y - 1].end() - ): - s += ' %04d.%04d [label=""];\n' % (x, y) - - # Add a spacer - s += " x [style=invis]; x->0000.0000 [style=invis];\n" - - # Declare ranks. 
- for x in range(self.num_leaves() + 1): - s += " {rank=same;" - for y in range(self.num_edges() + 1): - if y == 0 or ( - x <= self._edges[y - 1].start() or x >= self._edges[y - 1].end() - ): - s += " %04d.%04d" % (x, y) - s += "}\n" - - # Add the leaves - s += " edge [style=invis, weight=100];\n" - s += " node [shape=plaintext]\n" - s += " 0000.0000" - for x in range(self.num_leaves()): - s += "->%s->%04d.0000" % (self.leaf(x), x + 1) - s += ";\n\n" - - # Add the edges - s += " edge [style=solid, weight=1];\n" - for y, edge in enumerate(self): - for x in range(edge.start()): - s += ' %04d.%04d -> %04d.%04d [style="invis"];\n' % ( - x, - y + 1, - x + 1, - y + 1, - ) - s += ' %04d.%04d -> %04d.%04d [label="%s"];\n' % ( - edge.start(), - y + 1, - edge.end(), - y + 1, - edge, - ) - for x in range(edge.end(), self.num_leaves()): - s += ' %04d.%04d -> %04d.%04d [style="invis"];\n' % ( - x, - y + 1, - x + 1, - y + 1, - ) - s += "}\n" - return s - - -######################################################################## -## Chart Rules -######################################################################## - - -class ChartRuleI: - """ - A rule that specifies what new edges are licensed by any given set - of existing edges. Each chart rule expects a fixed number of - edges, as indicated by the class variable ``NUM_EDGES``. In - particular: - - - A chart rule with ``NUM_EDGES=0`` specifies what new edges are - licensed, regardless of existing edges. - - A chart rule with ``NUM_EDGES=1`` specifies what new edges are - licensed by a single existing edge. - - A chart rule with ``NUM_EDGES=2`` specifies what new edges are - licensed by a pair of existing edges. - - :type NUM_EDGES: int - :cvar NUM_EDGES: The number of existing edges that this rule uses - to license new edges. Typically, this number ranges from zero - to two. - """ - - def apply(self, chart, grammar, *edges): - """ - Return a generator that will add edges licensed by this rule - and the given edges to the chart, one at a time. Each - time the generator is resumed, it will either add a new - edge and yield that edge; or return. - - :type edges: list(EdgeI) - :param edges: A set of existing edges. The number of edges - that should be passed to ``apply()`` is specified by the - ``NUM_EDGES`` class variable. - :rtype: iter(EdgeI) - """ - raise NotImplementedError() - - def apply_everywhere(self, chart, grammar): - """ - Return a generator that will add all edges licensed by - this rule, given the edges that are currently in the - chart, one at a time. Each time the generator is resumed, - it will either add a new edge and yield that edge; or return. - - :rtype: iter(EdgeI) - """ - raise NotImplementedError() - - -class AbstractChartRule(ChartRuleI): - """ - An abstract base class for chart rules. ``AbstractChartRule`` - provides: - - - A default implementation for ``apply``. - - A default implementation for ``apply_everywhere``, - (Currently, this implementation assumes that ``NUM_EDGES <= 3``.) - - A default implementation for ``__str__``, which returns a - name based on the rule's class name. - """ - - # Subclasses must define apply. - def apply(self, chart, grammar, *edges): - raise NotImplementedError() - - # Default: loop through the given number of edges, and call - # self.apply() for each set of edges. 
- def apply_everywhere(self, chart, grammar): - if self.NUM_EDGES == 0: - yield from self.apply(chart, grammar) - - elif self.NUM_EDGES == 1: - for e1 in chart: - yield from self.apply(chart, grammar, e1) - - elif self.NUM_EDGES == 2: - for e1 in chart: - for e2 in chart: - yield from self.apply(chart, grammar, e1, e2) - - elif self.NUM_EDGES == 3: - for e1 in chart: - for e2 in chart: - for e3 in chart: - yield from self.apply(chart, grammar, e1, e2, e3) - - else: - raise AssertionError("NUM_EDGES>3 is not currently supported") - - # Default: return a name based on the class name. - def __str__(self): - # Add spaces between InitialCapsWords. - return re.sub("([a-z])([A-Z])", r"\1 \2", self.__class__.__name__) - - -# //////////////////////////////////////////////////////////// -# Fundamental Rule -# //////////////////////////////////////////////////////////// - - -class FundamentalRule(AbstractChartRule): - r""" - A rule that joins two adjacent edges to form a single combined - edge. In particular, this rule specifies that any pair of edges - - - ``[A -> alpha \* B beta][i:j]`` - - ``[B -> gamma \*][j:k]`` - - licenses the edge: - - - ``[A -> alpha B * beta][i:j]`` - """ - - NUM_EDGES = 2 - - def apply(self, chart, grammar, left_edge, right_edge): - # Make sure the rule is applicable. - if not ( - left_edge.is_incomplete() - and right_edge.is_complete() - and left_edge.end() == right_edge.start() - and left_edge.nextsym() == right_edge.lhs() - ): - return - - # Construct the new edge. - new_edge = left_edge.move_dot_forward(right_edge.end()) - - # Insert it into the chart. - if chart.insert_with_backpointer(new_edge, left_edge, right_edge): - yield new_edge - - -class SingleEdgeFundamentalRule(FundamentalRule): - r""" - A rule that joins a given edge with adjacent edges in the chart, - to form combined edges. In particular, this rule specifies that - either of the edges: - - - ``[A -> alpha \* B beta][i:j]`` - - ``[B -> gamma \*][j:k]`` - - licenses the edge: - - - ``[A -> alpha B * beta][i:j]`` - - if the other edge is already in the chart. - - :note: This is basically ``FundamentalRule``, with one edge left - unspecified. 
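The combination step performed by ``FundamentalRule`` can be reproduced by hand on a two-token chart. A sketch assuming the upstream ``nltk.parse.chart`` classes; the grammar and edges are illustrative:

    import nltk
    from nltk.parse.chart import Chart, TreeEdge, FundamentalRule

    grammar = nltk.CFG.fromstring("""
    NP -> Det Noun
    Det -> 'a'
    Noun -> 'dog'
    """)
    np_prod, det_prod, _ = grammar.productions()

    chart = Chart("a dog".split())
    left = TreeEdge.from_production(np_prod, 0)                      # [0:0] NP -> * Det Noun
    right = TreeEdge((0, 1), det_prod.lhs(), det_prod.rhs(), dot=1)  # complete Det over 'a'
    chart.insert(left, ())
    chart.insert(right, ())

    # left is incomplete, right is complete, they are adjacent, and the
    # symbol after left's dot matches right's lhs, so the rule fires.
    for new_edge in FundamentalRule().apply(chart, grammar, left, right):
        print(new_edge)                                              # [0:1] NP -> Det * Noun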
- """ - - NUM_EDGES = 1 - - def apply(self, chart, grammar, edge): - if edge.is_incomplete(): - yield from self._apply_incomplete(chart, grammar, edge) - else: - yield from self._apply_complete(chart, grammar, edge) - - def _apply_complete(self, chart, grammar, right_edge): - for left_edge in chart.select( - end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs() - ): - new_edge = left_edge.move_dot_forward(right_edge.end()) - if chart.insert_with_backpointer(new_edge, left_edge, right_edge): - yield new_edge - - def _apply_incomplete(self, chart, grammar, left_edge): - for right_edge in chart.select( - start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym() - ): - new_edge = left_edge.move_dot_forward(right_edge.end()) - if chart.insert_with_backpointer(new_edge, left_edge, right_edge): - yield new_edge - - -# //////////////////////////////////////////////////////////// -# Inserting Terminal Leafs -# //////////////////////////////////////////////////////////// - - -class LeafInitRule(AbstractChartRule): - NUM_EDGES = 0 - - def apply(self, chart, grammar): - for index in range(chart.num_leaves()): - new_edge = LeafEdge(chart.leaf(index), index) - if chart.insert(new_edge, ()): - yield new_edge - - -# //////////////////////////////////////////////////////////// -# Top-Down Prediction -# //////////////////////////////////////////////////////////// - - -class TopDownInitRule(AbstractChartRule): - r""" - A rule licensing edges corresponding to the grammar productions for - the grammar's start symbol. In particular, this rule specifies that - ``[S -> \* alpha][0:i]`` is licensed for each grammar production - ``S -> alpha``, where ``S`` is the grammar's start symbol. - """ - - NUM_EDGES = 0 - - def apply(self, chart, grammar): - for prod in grammar.productions(lhs=grammar.start()): - new_edge = TreeEdge.from_production(prod, 0) - if chart.insert(new_edge, ()): - yield new_edge - - -class TopDownPredictRule(AbstractChartRule): - r""" - A rule licensing edges corresponding to the grammar productions - for the nonterminal following an incomplete edge's dot. In - particular, this rule specifies that - ``[A -> alpha \* B beta][i:j]`` licenses the edge - ``[B -> \* gamma][j:j]`` for each grammar production ``B -> gamma``. - - :note: This rule corresponds to the Predictor Rule in Earley parsing. - """ - - NUM_EDGES = 1 - - def apply(self, chart, grammar, edge): - if edge.is_complete(): - return - for prod in grammar.productions(lhs=edge.nextsym()): - new_edge = TreeEdge.from_production(prod, edge.end()) - if chart.insert(new_edge, ()): - yield new_edge - - -class CachedTopDownPredictRule(TopDownPredictRule): - r""" - A cached version of ``TopDownPredictRule``. After the first time - this rule is applied to an edge with a given ``end`` and ``next``, - it will not generate any more edges for edges with that ``end`` and - ``next``. - - If ``chart`` or ``grammar`` are changed, then the cache is flushed. - """ - - def __init__(self): - TopDownPredictRule.__init__(self) - self._done = {} - - def apply(self, chart, grammar, edge): - if edge.is_complete(): - return - nextsym, index = edge.nextsym(), edge.end() - if not is_nonterminal(nextsym): - return - - # If we've already applied this rule to an edge with the same - # next & end, and the chart & grammar have not changed, then - # just return (no new edges to add). - done = self._done.get((nextsym, index), (None, None)) - if done[0] is chart and done[1] is grammar: - return - - # Add all the edges indicated by the top down expand rule. 
- for prod in grammar.productions(lhs=nextsym): - # If the left corner in the predicted production is - # leaf, it must match with the input. - if prod.rhs(): - first = prod.rhs()[0] - if is_terminal(first): - if index >= chart.num_leaves() or first != chart.leaf(index): - continue - - new_edge = TreeEdge.from_production(prod, index) - if chart.insert(new_edge, ()): - yield new_edge - - # Record the fact that we've applied this rule. - self._done[nextsym, index] = (chart, grammar) - - -# //////////////////////////////////////////////////////////// -# Bottom-Up Prediction -# //////////////////////////////////////////////////////////// - - -class BottomUpPredictRule(AbstractChartRule): - r""" - A rule licensing any edge corresponding to a production whose - right-hand side begins with a complete edge's left-hand side. In - particular, this rule specifies that ``[A -> alpha \*]`` licenses - the edge ``[B -> \* A beta]`` for each grammar production ``B -> A beta``. - """ - - NUM_EDGES = 1 - - def apply(self, chart, grammar, edge): - if edge.is_incomplete(): - return - for prod in grammar.productions(rhs=edge.lhs()): - new_edge = TreeEdge.from_production(prod, edge.start()) - if chart.insert(new_edge, ()): - yield new_edge - - -class BottomUpPredictCombineRule(BottomUpPredictRule): - r""" - A rule licensing any edge corresponding to a production whose - right-hand side begins with a complete edge's left-hand side. In - particular, this rule specifies that ``[A -> alpha \*]`` - licenses the edge ``[B -> A \* beta]`` for each grammar - production ``B -> A beta``. - - :note: This is like ``BottomUpPredictRule``, but it also applies - the ``FundamentalRule`` to the resulting edge. - """ - - NUM_EDGES = 1 - - def apply(self, chart, grammar, edge): - if edge.is_incomplete(): - return - for prod in grammar.productions(rhs=edge.lhs()): - new_edge = TreeEdge(edge.span(), prod.lhs(), prod.rhs(), 1) - if chart.insert(new_edge, (edge,)): - yield new_edge - - -class EmptyPredictRule(AbstractChartRule): - """ - A rule that inserts all empty productions as passive edges, - in every position in the chart. 
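Bottom-up prediction works in the opposite direction from the top-down rules above: a complete edge proposes the productions it could begin. A small sketch with ``BottomUpPredictCombineRule``, again assuming the upstream classes and an illustrative grammar:

    import nltk
    from nltk.parse.chart import Chart, TreeEdge, BottomUpPredictCombineRule

    grammar = nltk.CFG.fromstring("""
    NP -> Det Noun
    Det -> 'a'
    Noun -> 'dog'
    """)
    det_prod = grammar.productions()[1]        # Det -> 'a'

    chart = Chart("a dog".split())
    det_edge = TreeEdge((0, 1), det_prod.lhs(), det_prod.rhs(), dot=1)  # complete Det
    chart.insert(det_edge, ())

    # A complete Det edge licenses [NP -> Det * Noun][0:1], because NP's
    # right-hand side begins with Det; the grammar is consulted for
    # productions whose rhs starts with the edge's lhs.
    for new_edge in BottomUpPredictCombineRule().apply(chart, grammar, det_edge):
        print(new_edge)                        # [0:1] NP -> Det * Noun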
- """ - - NUM_EDGES = 0 - - def apply(self, chart, grammar): - for prod in grammar.productions(empty=True): - for index in range(chart.num_leaves() + 1): - new_edge = TreeEdge.from_production(prod, index) - if chart.insert(new_edge, ()): - yield new_edge - - -######################################################################## -## Filtered Bottom Up -######################################################################## - - -class FilteredSingleEdgeFundamentalRule(SingleEdgeFundamentalRule): - def _apply_complete(self, chart, grammar, right_edge): - end = right_edge.end() - nexttoken = end < chart.num_leaves() and chart.leaf(end) - for left_edge in chart.select( - end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs() - ): - if _bottomup_filter(grammar, nexttoken, left_edge.rhs(), left_edge.dot()): - new_edge = left_edge.move_dot_forward(right_edge.end()) - if chart.insert_with_backpointer(new_edge, left_edge, right_edge): - yield new_edge - - def _apply_incomplete(self, chart, grammar, left_edge): - for right_edge in chart.select( - start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym() - ): - end = right_edge.end() - nexttoken = end < chart.num_leaves() and chart.leaf(end) - if _bottomup_filter(grammar, nexttoken, left_edge.rhs(), left_edge.dot()): - new_edge = left_edge.move_dot_forward(right_edge.end()) - if chart.insert_with_backpointer(new_edge, left_edge, right_edge): - yield new_edge - - -class FilteredBottomUpPredictCombineRule(BottomUpPredictCombineRule): - def apply(self, chart, grammar, edge): - if edge.is_incomplete(): - return - - end = edge.end() - nexttoken = end < chart.num_leaves() and chart.leaf(end) - for prod in grammar.productions(rhs=edge.lhs()): - if _bottomup_filter(grammar, nexttoken, prod.rhs()): - new_edge = TreeEdge(edge.span(), prod.lhs(), prod.rhs(), 1) - if chart.insert(new_edge, (edge,)): - yield new_edge - - -def _bottomup_filter(grammar, nexttoken, rhs, dot=0): - if len(rhs) <= dot + 1: - return True - _next = rhs[dot + 1] - if is_terminal(_next): - return nexttoken == _next - else: - return grammar.is_leftcorner(_next, nexttoken) - - -######################################################################## -## Generic Chart Parser -######################################################################## - -TD_STRATEGY = [ - LeafInitRule(), - TopDownInitRule(), - CachedTopDownPredictRule(), - SingleEdgeFundamentalRule(), -] -BU_STRATEGY = [ - LeafInitRule(), - EmptyPredictRule(), - BottomUpPredictRule(), - SingleEdgeFundamentalRule(), -] -BU_LC_STRATEGY = [ - LeafInitRule(), - EmptyPredictRule(), - BottomUpPredictCombineRule(), - SingleEdgeFundamentalRule(), -] - -LC_STRATEGY = [ - LeafInitRule(), - FilteredBottomUpPredictCombineRule(), - FilteredSingleEdgeFundamentalRule(), -] - - -class ChartParser(ParserI): - """ - A generic chart parser. A "strategy", or list of - ``ChartRuleI`` instances, is used to decide what edges to add to - the chart. In particular, ``ChartParser`` uses the following - algorithm to parse texts: - - | Until no new edges are added: - | For each *rule* in *strategy*: - | Apply *rule* to any applicable edges in the chart. - | Return any complete parses in the chart - """ - - def __init__( - self, - grammar, - strategy=BU_LC_STRATEGY, - trace=0, - trace_chart_width=50, - use_agenda=True, - chart_class=Chart, - ): - """ - Create a new chart parser, that uses ``grammar`` to parse - texts. - - :type grammar: CFG - :param grammar: The grammar used to parse texts. 
- :type strategy: list(ChartRuleI) - :param strategy: A list of rules that should be used to decide - what edges to add to the chart (top-down strategy by default). - :type trace: int - :param trace: The level of tracing that should be used when - parsing a text. ``0`` will generate no tracing output; - and higher numbers will produce more verbose tracing - output. - :type trace_chart_width: int - :param trace_chart_width: The default total width reserved for - the chart in trace output. The remainder of each line will - be used to display edges. - :type use_agenda: bool - :param use_agenda: Use an optimized agenda-based algorithm, - if possible. - :param chart_class: The class that should be used to create - the parse charts. - """ - self._grammar = grammar - self._strategy = strategy - self._trace = trace - self._trace_chart_width = trace_chart_width - # If the strategy only consists of axioms (NUM_EDGES==0) and - # inference rules (NUM_EDGES==1), we can use an agenda-based algorithm: - self._use_agenda = use_agenda - self._chart_class = chart_class - - self._axioms = [] - self._inference_rules = [] - for rule in strategy: - if rule.NUM_EDGES == 0: - self._axioms.append(rule) - elif rule.NUM_EDGES == 1: - self._inference_rules.append(rule) - else: - self._use_agenda = False - - def grammar(self): - return self._grammar - - def _trace_new_edges(self, chart, rule, new_edges, trace, edge_width): - if not trace: - return - print_rule_header = trace > 1 - for edge in new_edges: - if print_rule_header: - print("%s:" % rule) - print_rule_header = False - print(chart.pretty_format_edge(edge, edge_width)) - - def chart_parse(self, tokens, trace=None): - """ - Return the final parse ``Chart`` from which all possible - parse trees can be extracted. - - :param tokens: The sentence to be parsed - :type tokens: list(str) - :rtype: Chart - """ - if trace is None: - trace = self._trace - trace_new_edges = self._trace_new_edges - - tokens = list(tokens) - self._grammar.check_coverage(tokens) - chart = self._chart_class(tokens) - grammar = self._grammar - - # Width, for printing trace edges. - trace_edge_width = self._trace_chart_width // (chart.num_leaves() + 1) - if trace: - print(chart.pretty_format_leaves(trace_edge_width)) - - if self._use_agenda: - # Use an agenda-based algorithm. - for axiom in self._axioms: - new_edges = list(axiom.apply(chart, grammar)) - trace_new_edges(chart, axiom, new_edges, trace, trace_edge_width) - - inference_rules = self._inference_rules - agenda = chart.edges() - # We reverse the initial agenda, since it is a stack - # but chart.edges() functions as a queue. - agenda.reverse() - while agenda: - edge = agenda.pop() - for rule in inference_rules: - new_edges = list(rule.apply(chart, grammar, edge)) - if trace: - trace_new_edges(chart, rule, new_edges, trace, trace_edge_width) - agenda += new_edges - - else: - # Do not use an agenda-based algorithm. - edges_added = True - while edges_added: - edges_added = False - for rule in self._strategy: - new_edges = list(rule.apply_everywhere(chart, grammar)) - edges_added = len(new_edges) - trace_new_edges(chart, rule, new_edges, trace, trace_edge_width) - - # Return the final chart. - return chart - - def parse(self, tokens, tree_class=Tree): - chart = self.chart_parse(tokens) - return iter(chart.parses(self._grammar.start(), tree_class=tree_class)) - - -class TopDownChartParser(ChartParser): - """ - A ``ChartParser`` using a top-down parsing strategy. - See ``ChartParser`` for more information. 
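The prebuilt strategies mostly differ in how many edges they add before the parses are found. A rough comparison sketch, assuming the upstream parser classes; the grammar is illustrative and the resulting edge counts depend on it:

    import nltk
    from nltk.parse.chart import (
        TopDownChartParser,
        BottomUpChartParser,
        BottomUpLeftCornerChartParser,
    )

    grammar = nltk.CFG.fromstring("""
    S -> NP VP
    PP -> 'with' NP
    NP -> NP PP | Det Noun | 'I'
    VP -> Verb NP | VP PP
    Det -> 'a' | 'my'
    Noun -> 'dog' | 'cookie'
    Verb -> 'saw'
    """)
    tokens = "I saw a dog with my cookie".split()

    # Same parses, different amounts of work: compare chart sizes.
    for cls in (TopDownChartParser, BottomUpChartParser, BottomUpLeftCornerChartParser):
        chart = cls(grammar).chart_parse(tokens)
        print(cls.__name__, "edges:", chart.num_edges(),
              "parses:", len(list(chart.parses(grammar.start()))))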
- """ - - def __init__(self, grammar, **parser_args): - ChartParser.__init__(self, grammar, TD_STRATEGY, **parser_args) - - -class BottomUpChartParser(ChartParser): - """ - A ``ChartParser`` using a bottom-up parsing strategy. - See ``ChartParser`` for more information. - """ - - def __init__(self, grammar, **parser_args): - if isinstance(grammar, PCFG): - warnings.warn( - "BottomUpChartParser only works for CFG, " - "use BottomUpProbabilisticChartParser instead", - category=DeprecationWarning, - ) - ChartParser.__init__(self, grammar, BU_STRATEGY, **parser_args) - - -class BottomUpLeftCornerChartParser(ChartParser): - """ - A ``ChartParser`` using a bottom-up left-corner parsing strategy. - This strategy is often more efficient than standard bottom-up. - See ``ChartParser`` for more information. - """ - - def __init__(self, grammar, **parser_args): - ChartParser.__init__(self, grammar, BU_LC_STRATEGY, **parser_args) - - -class LeftCornerChartParser(ChartParser): - def __init__(self, grammar, **parser_args): - if not grammar.is_nonempty(): - raise ValueError( - "LeftCornerParser only works for grammars " "without empty productions." - ) - ChartParser.__init__(self, grammar, LC_STRATEGY, **parser_args) - - -######################################################################## -## Stepping Chart Parser -######################################################################## - - -class SteppingChartParser(ChartParser): - """ - A ``ChartParser`` that allows you to step through the parsing - process, adding a single edge at a time. It also allows you to - change the parser's strategy or grammar midway through parsing a - text. - - The ``initialize`` method is used to start parsing a text. ``step`` - adds a single edge to the chart. ``set_strategy`` changes the - strategy used by the chart parser. ``parses`` returns the set of - parses that has been found by the chart parser. - - :ivar _restart: Records whether the parser's strategy, grammar, - or chart has been changed. If so, then ``step`` must restart - the parsing algorithm. - """ - - def __init__(self, grammar, strategy=[], trace=0): - self._chart = None - self._current_chartrule = None - self._restart = False - ChartParser.__init__(self, grammar, strategy, trace) - - # //////////////////////////////////////////////////////////// - # Initialization - # //////////////////////////////////////////////////////////// - - def initialize(self, tokens): - "Begin parsing the given tokens." - self._chart = Chart(list(tokens)) - self._restart = True - - # //////////////////////////////////////////////////////////// - # Stepping - # //////////////////////////////////////////////////////////// - - def step(self): - """ - Return a generator that adds edges to the chart, one at a - time. Each time the generator is resumed, it adds a single - edge and yields that edge. If no more edges can be added, - then it yields None. - - If the parser's strategy, grammar, or chart is changed, then - the generator will continue adding edges using the new - strategy, grammar, or chart. - - Note that this generator never terminates, since the grammar - or strategy might be changed to values that would add new - edges. Instead, it yields None when no more edges can be - added with the current strategy and grammar. 
- """ - if self._chart is None: - raise ValueError("Parser must be initialized first") - while True: - self._restart = False - w = 50 // (self._chart.num_leaves() + 1) - - for e in self._parse(): - if self._trace > 1: - print(self._current_chartrule) - if self._trace > 0: - print(self._chart.pretty_format_edge(e, w)) - yield e - if self._restart: - break - else: - yield None # No more edges. - - def _parse(self): - """ - A generator that implements the actual parsing algorithm. - ``step`` iterates through this generator, and restarts it - whenever the parser's strategy, grammar, or chart is modified. - """ - chart = self._chart - grammar = self._grammar - edges_added = 1 - while edges_added > 0: - edges_added = 0 - for rule in self._strategy: - self._current_chartrule = rule - for e in rule.apply_everywhere(chart, grammar): - edges_added += 1 - yield e - - # //////////////////////////////////////////////////////////// - # Accessors - # //////////////////////////////////////////////////////////// - - def strategy(self): - "Return the strategy used by this parser." - return self._strategy - - def grammar(self): - "Return the grammar used by this parser." - return self._grammar - - def chart(self): - "Return the chart that is used by this parser." - return self._chart - - def current_chartrule(self): - "Return the chart rule used to generate the most recent edge." - return self._current_chartrule - - def parses(self, tree_class=Tree): - "Return the parse trees currently contained in the chart." - return self._chart.parses(self._grammar.start(), tree_class) - - # //////////////////////////////////////////////////////////// - # Parser modification - # //////////////////////////////////////////////////////////// - - def set_strategy(self, strategy): - """ - Change the strategy that the parser uses to decide which edges - to add to the chart. - - :type strategy: list(ChartRuleI) - :param strategy: A list of rules that should be used to decide - what edges to add to the chart. - """ - if strategy == self._strategy: - return - self._strategy = strategy[:] # Make a copy. - self._restart = True - - def set_grammar(self, grammar): - "Change the grammar used by the parser." - if grammar is self._grammar: - return - self._grammar = grammar - self._restart = True - - def set_chart(self, chart): - "Load a given chart into the chart parser." - if chart is self._chart: - return - self._chart = chart - self._restart = True - - # //////////////////////////////////////////////////////////// - # Standard parser methods - # //////////////////////////////////////////////////////////// - - def parse(self, tokens, tree_class=Tree): - tokens = list(tokens) - self._grammar.check_coverage(tokens) - - # Initialize ourselves. - self.initialize(tokens) - - # Step until no more edges are generated. - for e in self.step(): - if e is None: - break - - # Return an iterator of complete parses. 
- return self.parses(tree_class=tree_class) - - -######################################################################## -## Demo Code -######################################################################## - - -def demo_grammar(): - from nltk.grammar import CFG - - return CFG.fromstring( - """ -S -> NP VP -PP -> "with" NP -NP -> NP PP -VP -> VP PP -VP -> Verb NP -VP -> Verb -NP -> Det Noun -NP -> "John" -NP -> "I" -Det -> "the" -Det -> "my" -Det -> "a" -Noun -> "dog" -Noun -> "cookie" -Verb -> "ate" -Verb -> "saw" -Prep -> "with" -Prep -> "under" -""" - ) - - -def demo( - choice=None, - print_times=True, - print_grammar=False, - print_trees=True, - trace=2, - sent="I saw John with a dog with my cookie", - numparses=5, -): - """ - A demonstration of the chart parsers. - """ - import sys - import time - - from nltk import CFG, Production, nonterminals - - # The grammar for ChartParser and SteppingChartParser: - grammar = demo_grammar() - if print_grammar: - print("* Grammar") - print(grammar) - - # Tokenize the sample sentence. - print("* Sentence:") - print(sent) - tokens = sent.split() - print(tokens) - print() - - # Ask the user which parser to test, - # if the parser wasn't provided as an argument - if choice is None: - print(" 1: Top-down chart parser") - print(" 2: Bottom-up chart parser") - print(" 3: Bottom-up left-corner chart parser") - print(" 4: Left-corner chart parser with bottom-up filter") - print(" 5: Stepping chart parser (alternating top-down & bottom-up)") - print(" 6: All parsers") - print("\nWhich parser (1-6)? ", end=" ") - choice = sys.stdin.readline().strip() - print() - - choice = str(choice) - if choice not in "123456": - print("Bad parser number") - return - - # Keep track of how long each parser takes. - times = {} - - strategies = { - "1": ("Top-down", TD_STRATEGY), - "2": ("Bottom-up", BU_STRATEGY), - "3": ("Bottom-up left-corner", BU_LC_STRATEGY), - "4": ("Filtered left-corner", LC_STRATEGY), - } - choices = [] - if choice in strategies: - choices = [choice] - if choice == "6": - choices = "1234" - - # Run the requested chart parser(s), except the stepping parser. - for strategy in choices: - print("* Strategy: " + strategies[strategy][0]) - print() - cp = ChartParser(grammar, strategies[strategy][1], trace=trace) - t = time.time() - chart = cp.chart_parse(tokens) - parses = list(chart.parses(grammar.start())) - - times[strategies[strategy][0]] = time.time() - t - print("Nr edges in chart:", len(chart.edges())) - if numparses: - assert len(parses) == numparses, "Not all parses found" - if print_trees: - for tree in parses: - print(tree) - else: - print("Nr trees:", len(parses)) - print() - - # Run the stepping parser, if requested. 
- if choice in "56": - print("* Strategy: Stepping (top-down vs bottom-up)") - print() - t = time.time() - cp = SteppingChartParser(grammar, trace=trace) - cp.initialize(tokens) - for i in range(5): - print("*** SWITCH TO TOP DOWN") - cp.set_strategy(TD_STRATEGY) - for j, e in enumerate(cp.step()): - if j > 20 or e is None: - break - print("*** SWITCH TO BOTTOM UP") - cp.set_strategy(BU_STRATEGY) - for j, e in enumerate(cp.step()): - if j > 20 or e is None: - break - times["Stepping"] = time.time() - t - print("Nr edges in chart:", len(cp.chart().edges())) - if numparses: - assert len(list(cp.parses())) == numparses, "Not all parses found" - if print_trees: - for tree in cp.parses(): - print(tree) - else: - print("Nr trees:", len(list(cp.parses()))) - print() - - # Print the times of all parsers: - if not (print_times and times): - return - print("* Parsing times") - print() - maxlen = max(len(key) for key in times) - format = "%" + repr(maxlen) + "s parser: %6.3fsec" - times_items = times.items() - for (parser, t) in sorted(times_items, key=lambda a: a[1]): - print(format % (parser, t)) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/parse/corenlp.py b/pipeline/nltk/parse/corenlp.py deleted file mode 100644 index 5c3146d1a086d4e49a0eaae585e09cab4a267834..0000000000000000000000000000000000000000 --- a/pipeline/nltk/parse/corenlp.py +++ /dev/null @@ -1,800 +0,0 @@ -# Natural Language Toolkit: Interface to the CoreNLP REST API. -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Dmitrijs Milajevs -# -# URL: -# For license information, see LICENSE.TXT - -import json -import os # required for doctests -import re -import socket -import time -from typing import List, Tuple - -from nltk.internals import _java_options, config_java, find_jar_iter, java -from nltk.parse.api import ParserI -from nltk.parse.dependencygraph import DependencyGraph -from nltk.tag.api import TaggerI -from nltk.tokenize.api import TokenizerI -from nltk.tree import Tree - -_stanford_url = "https://stanfordnlp.github.io/CoreNLP/" - - -class CoreNLPServerError(EnvironmentError): - """Exceptions associated with the Core NLP server.""" - - -def try_port(port=0): - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.bind(("", port)) - - p = sock.getsockname()[1] - sock.close() - - return p - - -class CoreNLPServer: - - _MODEL_JAR_PATTERN = r"stanford-corenlp-(\d+)\.(\d+)\.(\d+)-models\.jar" - _JAR = r"stanford-corenlp-(\d+)\.(\d+)\.(\d+)\.jar" - - def __init__( - self, - path_to_jar=None, - path_to_models_jar=None, - verbose=False, - java_options=None, - corenlp_options=None, - port=None, - ): - - if corenlp_options is None: - corenlp_options = ["-preload", "tokenize,ssplit,pos,lemma,parse,depparse"] - - jars = list( - find_jar_iter( - self._JAR, - path_to_jar, - env_vars=("CORENLP",), - searchpath=(), - url=_stanford_url, - verbose=verbose, - is_regex=True, - ) - ) - - # find the most recent code and model jar - stanford_jar = max(jars, key=lambda model_name: re.match(self._JAR, model_name)) - - if port is None: - try: - port = try_port(9000) - except OSError: - port = try_port() - corenlp_options.extend(["-port", str(port)]) - else: - try_port(port) - corenlp_options.extend(["-port", str(port)]) - - self.url = f"http://localhost:{port}" - - model_jar = max( - find_jar_iter( - self._MODEL_JAR_PATTERN, - path_to_models_jar, - env_vars=("CORENLP_MODELS",), - searchpath=(), - url=_stanford_url, - verbose=verbose, - is_regex=True, - ), - key=lambda model_name: re.match(self._MODEL_JAR_PATTERN, 
model_name), - ) - - self.verbose = verbose - - self._classpath = stanford_jar, model_jar - - self.corenlp_options = corenlp_options - self.java_options = java_options or ["-mx2g"] - - def start(self, stdout="devnull", stderr="devnull"): - """Starts the CoreNLP server - - :param stdout, stderr: Specifies where CoreNLP output is redirected. Valid values are 'devnull', 'stdout', 'pipe' - """ - import requests - - cmd = ["edu.stanford.nlp.pipeline.StanfordCoreNLPServer"] - - if self.corenlp_options: - cmd.extend(self.corenlp_options) - - # Configure java. - default_options = " ".join(_java_options) - config_java(options=self.java_options, verbose=self.verbose) - - try: - self.popen = java( - cmd, - classpath=self._classpath, - blocking=False, - stdout=stdout, - stderr=stderr, - ) - finally: - # Return java configurations to their default values. - config_java(options=default_options, verbose=self.verbose) - - # Check that the server is istill running. - returncode = self.popen.poll() - if returncode is not None: - _, stderrdata = self.popen.communicate() - raise CoreNLPServerError( - returncode, - "Could not start the server. " - "The error was: {}".format(stderrdata.decode("ascii")), - ) - - for i in range(30): - try: - response = requests.get(requests.compat.urljoin(self.url, "live")) - except requests.exceptions.ConnectionError: - time.sleep(1) - else: - if response.ok: - break - else: - raise CoreNLPServerError("Could not connect to the server.") - - for i in range(60): - try: - response = requests.get(requests.compat.urljoin(self.url, "ready")) - except requests.exceptions.ConnectionError: - time.sleep(1) - else: - if response.ok: - break - else: - raise CoreNLPServerError("The server is not ready.") - - def stop(self): - self.popen.terminate() - self.popen.wait() - - def __enter__(self): - self.start() - - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - self.stop() - return False - - -class GenericCoreNLPParser(ParserI, TokenizerI, TaggerI): - """Interface to the CoreNLP Parser.""" - - def __init__( - self, - url="http://localhost:9000", - encoding="utf8", - tagtype=None, - strict_json=True, - ): - import requests - - self.url = url - self.encoding = encoding - - if tagtype not in ["pos", "ner", None]: - raise ValueError("tagtype must be either 'pos', 'ner' or None") - - self.tagtype = tagtype - self.strict_json = strict_json - - self.session = requests.Session() - - def parse_sents(self, sentences, *args, **kwargs): - """Parse multiple sentences. - - Takes multiple sentences as a list where each sentence is a list of - words. Each sentence will be automatically tagged with this - CoreNLPParser instance's tagger. - - If a whitespace exists inside a token, then the token will be treated as - several tokens. - - :param sentences: Input sentences to parse - :type sentences: list(list(str)) - :rtype: iter(iter(Tree)) - """ - # Converting list(list(str)) -> list(str) - sentences = (" ".join(words) for words in sentences) - return self.raw_parse_sents(sentences, *args, **kwargs) - - def raw_parse(self, sentence, properties=None, *args, **kwargs): - """Parse a sentence. - - Takes a sentence as a string; before parsing, it will be automatically - tokenized and tagged by the CoreNLP Parser. 
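The server and parser are normally paired through the context manager, as the doctests below show. A compact sketch, assuming a local CoreNLP distribution is discoverable via the CORENLP and CORENLP_MODELS environment variables:

    from nltk.parse.corenlp import CoreNLPServer, CoreNLPParser

    # Starts the Java server on a free port (9000 if available) and stops it
    # again when the block exits.
    with CoreNLPServer() as server:
        parser = CoreNLPParser(url=server.url)
        tree = next(parser.raw_parse("The quick brown fox jumps over the lazy dog."))
        print(tree)

        # The same endpoint can tag, given a tagtype.
        tagger = CoreNLPParser(url=server.url, tagtype="pos")
        print(tagger.tag("What is the airspeed of an unladen swallow ?".split()))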
- - :param sentence: Input sentence to parse - :type sentence: str - :rtype: iter(Tree) - """ - default_properties = {"tokenize.whitespace": "false"} - default_properties.update(properties or {}) - - return next( - self.raw_parse_sents( - [sentence], properties=default_properties, *args, **kwargs - ) - ) - - def api_call(self, data, properties=None, timeout=60): - default_properties = { - "outputFormat": "json", - "annotators": "tokenize,pos,lemma,ssplit,{parser_annotator}".format( - parser_annotator=self.parser_annotator - ), - } - - default_properties.update(properties or {}) - - response = self.session.post( - self.url, - params={"properties": json.dumps(default_properties)}, - data=data.encode(self.encoding), - headers={"Content-Type": f"text/plain; charset={self.encoding}"}, - timeout=timeout, - ) - - response.raise_for_status() - - return response.json(strict=self.strict_json) - - def raw_parse_sents( - self, sentences, verbose=False, properties=None, *args, **kwargs - ): - """Parse multiple sentences. - - Takes multiple sentences as a list of strings. Each sentence will be - automatically tokenized and tagged. - - :param sentences: Input sentences to parse. - :type sentences: list(str) - :rtype: iter(iter(Tree)) - - """ - default_properties = { - # Only splits on '\n', never inside the sentence. - "ssplit.eolonly": "true" - } - - default_properties.update(properties or {}) - - """ - for sentence in sentences: - parsed_data = self.api_call(sentence, properties=default_properties) - - assert len(parsed_data['sentences']) == 1 - - for parse in parsed_data['sentences']: - tree = self.make_tree(parse) - yield iter([tree]) - """ - parsed_data = self.api_call("\n".join(sentences), properties=default_properties) - for parsed_sent in parsed_data["sentences"]: - tree = self.make_tree(parsed_sent) - yield iter([tree]) - - def parse_text(self, text, *args, **kwargs): - """Parse a piece of text. - - The text might contain several sentences which will be split by CoreNLP. - - :param str text: text to be split. - :returns: an iterable of syntactic structures. # TODO: should it be an iterable of iterables? - - """ - parsed_data = self.api_call(text, *args, **kwargs) - - for parse in parsed_data["sentences"]: - yield self.make_tree(parse) - - def tokenize(self, text, properties=None): - """Tokenize a string of text. - - Skip these tests if CoreNLP is likely not ready. - >>> from nltk.test.setup_fixt import check_jar - >>> check_jar(CoreNLPServer._JAR, env_vars=("CORENLP",), is_regex=True) - - The CoreNLP server can be started using the following notation, although - we recommend the `with CoreNLPServer() as server:` context manager notation - to ensure that the server is always stopped. - >>> server = CoreNLPServer() - >>> server.start() - >>> parser = CoreNLPParser(url=server.url) - - >>> text = 'Good muffins cost $3.88\\nin New York. Please buy me\\ntwo of them.\\nThanks.' - >>> list(parser.tokenize(text)) - ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] - - >>> s = "The colour of the wall is blue." - >>> list( - ... parser.tokenize( - ... 'The colour of the wall is blue.', - ... properties={'tokenize.options': 'americanize=true'}, - ... ) - ... 
) - ['The', 'colour', 'of', 'the', 'wall', 'is', 'blue', '.'] - >>> server.stop() - - """ - default_properties = {"annotators": "tokenize,ssplit"} - - default_properties.update(properties or {}) - - result = self.api_call(text, properties=default_properties) - - for sentence in result["sentences"]: - for token in sentence["tokens"]: - yield token["originalText"] or token["word"] - - def tag_sents(self, sentences): - """ - Tag multiple sentences. - - Takes multiple sentences as a list where each sentence is a list of - tokens. - - :param sentences: Input sentences to tag - :type sentences: list(list(str)) - :rtype: list(list(tuple(str, str)) - """ - # Converting list(list(str)) -> list(str) - sentences = (" ".join(words) for words in sentences) - return [sentences[0] for sentences in self.raw_tag_sents(sentences)] - - def tag(self, sentence: str) -> List[Tuple[str, str]]: - """ - Tag a list of tokens. - - :rtype: list(tuple(str, str)) - - Skip these tests if CoreNLP is likely not ready. - >>> from nltk.test.setup_fixt import check_jar - >>> check_jar(CoreNLPServer._JAR, env_vars=("CORENLP",), is_regex=True) - - The CoreNLP server can be started using the following notation, although - we recommend the `with CoreNLPServer() as server:` context manager notation - to ensure that the server is always stopped. - >>> server = CoreNLPServer() - >>> server.start() - >>> parser = CoreNLPParser(url=server.url, tagtype='ner') - >>> tokens = 'Rami Eid is studying at Stony Brook University in NY'.split() - >>> parser.tag(tokens) # doctest: +NORMALIZE_WHITESPACE - [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'), - ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'STATE_OR_PROVINCE')] - - >>> parser = CoreNLPParser(url=server.url, tagtype='pos') - >>> tokens = "What is the airspeed of an unladen swallow ?".split() - >>> parser.tag(tokens) # doctest: +NORMALIZE_WHITESPACE - [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), - ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), - ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')] - >>> server.stop() - """ - return self.tag_sents([sentence])[0] - - def raw_tag_sents(self, sentences): - """ - Tag multiple sentences. - - Takes multiple sentences as a list where each sentence is a string. - - :param sentences: Input sentences to tag - :type sentences: list(str) - :rtype: list(list(list(tuple(str, str))) - """ - default_properties = { - "ssplit.isOneSentence": "true", - "annotators": "tokenize,ssplit,", - } - - # Supports only 'pos' or 'ner' tags. - assert self.tagtype in ["pos", "ner"] - default_properties["annotators"] += self.tagtype - for sentence in sentences: - tagged_data = self.api_call(sentence, properties=default_properties) - yield [ - [ - (token["word"], token[self.tagtype]) - for token in tagged_sentence["tokens"] - ] - for tagged_sentence in tagged_data["sentences"] - ] - - -class CoreNLPParser(GenericCoreNLPParser): - """ - Skip these tests if CoreNLP is likely not ready. - >>> from nltk.test.setup_fixt import check_jar - >>> check_jar(CoreNLPServer._JAR, env_vars=("CORENLP",), is_regex=True) - - The recommended usage of `CoreNLPParser` is using the context manager notation: - >>> with CoreNLPServer() as server: - ... parser = CoreNLPParser(url=server.url) - ... next( - ... parser.raw_parse('The quick brown fox jumps over the lazy dog.') - ... 
).pretty_print() # doctest: +NORMALIZE_WHITESPACE - ROOT - | - S - _______________|__________________________ - | VP | - | _________|___ | - | | PP | - | | ________|___ | - NP | | NP | - ____|__________ | | _______|____ | - DT JJ JJ NN VBZ IN DT JJ NN . - | | | | | | | | | | - The quick brown fox jumps over the lazy dog . - - Alternatively, the server can be started using the following notation. - Note that `CoreNLPServer` does not need to be used if the CoreNLP server is started - outside of Python. - >>> server = CoreNLPServer() - >>> server.start() - >>> parser = CoreNLPParser(url=server.url) - - >>> (parse_fox, ), (parse_wolf, ) = parser.raw_parse_sents( - ... [ - ... 'The quick brown fox jumps over the lazy dog.', - ... 'The quick grey wolf jumps over the lazy fox.', - ... ] - ... ) - - >>> parse_fox.pretty_print() # doctest: +NORMALIZE_WHITESPACE - ROOT - | - S - _______________|__________________________ - | VP | - | _________|___ | - | | PP | - | | ________|___ | - NP | | NP | - ____|__________ | | _______|____ | - DT JJ JJ NN VBZ IN DT JJ NN . - | | | | | | | | | | - The quick brown fox jumps over the lazy dog . - - >>> parse_wolf.pretty_print() # doctest: +NORMALIZE_WHITESPACE - ROOT - | - S - _______________|__________________________ - | VP | - | _________|___ | - | | PP | - | | ________|___ | - NP | | NP | - ____|_________ | | _______|____ | - DT JJ JJ NN VBZ IN DT JJ NN . - | | | | | | | | | | - The quick grey wolf jumps over the lazy fox . - - >>> (parse_dog, ), (parse_friends, ) = parser.parse_sents( - ... [ - ... "I 'm a dog".split(), - ... "This is my friends ' cat ( the tabby )".split(), - ... ] - ... ) - - >>> parse_dog.pretty_print() # doctest: +NORMALIZE_WHITESPACE - ROOT - | - S - _______|____ - | VP - | ________|___ - NP | NP - | | ___|___ - PRP VBP DT NN - | | | | - I 'm a dog - - >>> parse_friends.pretty_print() # doctest: +NORMALIZE_WHITESPACE - ROOT - | - S - ____|___________ - | VP - | ___________|_____________ - | | NP - | | _______|________________________ - | | NP | | | - | | _____|_______ | | | - NP | NP | | NP | - | | ______|_________ | | ___|____ | - DT VBZ PRP$ NNS POS NN -LRB- DT NN -RRB- - | | | | | | | | | | - This is my friends ' cat -LRB- the tabby -RRB- - - >>> parse_john, parse_mary, = parser.parse_text( - ... 'John loves Mary. Mary walks.' - ... ) - - >>> parse_john.pretty_print() # doctest: +NORMALIZE_WHITESPACE - ROOT - | - S - _____|_____________ - | VP | - | ____|___ | - NP | NP | - | | | | - NNP VBZ NNP . - | | | | - John loves Mary . - - >>> parse_mary.pretty_print() # doctest: +NORMALIZE_WHITESPACE - ROOT - | - S - _____|____ - NP VP | - | | | - NNP VBZ . - | | | - Mary walks . - - Special cases - - >>> next( - ... parser.raw_parse( - ... 'NASIRIYA, Iraq—Iraqi doctors who treated former prisoner of war ' - ... 'Jessica Lynch have angrily dismissed claims made in her biography ' - ... 'that she was raped by her Iraqi captors.' - ... ) - ... ).height() - 14 - - >>> next( - ... parser.raw_parse( - ... "The broader Standard & Poor's 500 Index <.SPX> was 0.46 points lower, or " - ... '0.05 percent, at 997.02.' - ... ) - ... ).height() - 11 - - >>> server.stop() - """ - - _OUTPUT_FORMAT = "penn" - parser_annotator = "parse" - - def make_tree(self, result): - return Tree.fromstring(result["parse"]) - - -class CoreNLPDependencyParser(GenericCoreNLPParser): - """Dependency parser. - - Skip these tests if CoreNLP is likely not ready. 
- >>> from nltk.test.setup_fixt import check_jar - >>> check_jar(CoreNLPServer._JAR, env_vars=("CORENLP",), is_regex=True) - - The recommended usage of `CoreNLPParser` is using the context manager notation: - >>> with CoreNLPServer() as server: - ... dep_parser = CoreNLPDependencyParser(url=server.url) - ... parse, = dep_parser.raw_parse( - ... 'The quick brown fox jumps over the lazy dog.' - ... ) - ... print(parse.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE - The DT 4 det - quick JJ 4 amod - brown JJ 4 amod - fox NN 5 nsubj - jumps VBZ 0 ROOT - over IN 9 case - the DT 9 det - lazy JJ 9 amod - dog NN 5 obl - . . 5 punct - - Alternatively, the server can be started using the following notation. - Note that `CoreNLPServer` does not need to be used if the CoreNLP server is started - outside of Python. - >>> server = CoreNLPServer() - >>> server.start() - >>> dep_parser = CoreNLPDependencyParser(url=server.url) - >>> parse, = dep_parser.raw_parse('The quick brown fox jumps over the lazy dog.') - >>> print(parse.tree()) # doctest: +NORMALIZE_WHITESPACE - (jumps (fox The quick brown) (dog over the lazy) .) - - >>> for governor, dep, dependent in parse.triples(): - ... print(governor, dep, dependent) # doctest: +NORMALIZE_WHITESPACE - ('jumps', 'VBZ') nsubj ('fox', 'NN') - ('fox', 'NN') det ('The', 'DT') - ('fox', 'NN') amod ('quick', 'JJ') - ('fox', 'NN') amod ('brown', 'JJ') - ('jumps', 'VBZ') obl ('dog', 'NN') - ('dog', 'NN') case ('over', 'IN') - ('dog', 'NN') det ('the', 'DT') - ('dog', 'NN') amod ('lazy', 'JJ') - ('jumps', 'VBZ') punct ('.', '.') - - >>> (parse_fox, ), (parse_dog, ) = dep_parser.raw_parse_sents( - ... [ - ... 'The quick brown fox jumps over the lazy dog.', - ... 'The quick grey wolf jumps over the lazy fox.', - ... ] - ... ) - >>> print(parse_fox.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE - The DT 4 det - quick JJ 4 amod - brown JJ 4 amod - fox NN 5 nsubj - jumps VBZ 0 ROOT - over IN 9 case - the DT 9 det - lazy JJ 9 amod - dog NN 5 obl - . . 5 punct - - >>> print(parse_dog.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE - The DT 4 det - quick JJ 4 amod - grey JJ 4 amod - wolf NN 5 nsubj - jumps VBZ 0 ROOT - over IN 9 case - the DT 9 det - lazy JJ 9 amod - fox NN 5 obl - . . 5 punct - - >>> (parse_dog, ), (parse_friends, ) = dep_parser.parse_sents( - ... [ - ... "I 'm a dog".split(), - ... "This is my friends ' cat ( the tabby )".split(), - ... ] - ... ) - >>> print(parse_dog.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE - I PRP 4 nsubj - 'm VBP 4 cop - a DT 4 det - dog NN 0 ROOT - - >>> print(parse_friends.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE - This DT 6 nsubj - is VBZ 6 cop - my PRP$ 4 nmod:poss - friends NNS 6 nmod:poss - ' POS 4 case - cat NN 0 ROOT - ( -LRB- 9 punct - the DT 9 det - tabby NN 6 dep - ) -RRB- 9 punct - - >>> parse_john, parse_mary, = dep_parser.parse_text( - ... 'John loves Mary. Mary walks.' - ... ) - - >>> print(parse_john.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE - John NNP 2 nsubj - loves VBZ 0 ROOT - Mary NNP 2 obj - . . 2 punct - - >>> print(parse_mary.to_conll(4)) # doctest: +NORMALIZE_WHITESPACE - Mary NNP 2 nsubj - walks VBZ 0 ROOT - . . 2 punct - - Special cases - - Non-breaking space inside of a token. - - >>> len( - ... next( - ... dep_parser.raw_parse( - ... 'Anhalt said children typically treat a 20-ounce soda bottle as one ' - ... 'serving, while it actually contains 2 1/2 servings.' - ... ) - ... ).nodes - ... ) - 23 - - Phone numbers. - - >>> len( - ... next( - ... 
dep_parser.raw_parse('This is not going to crash: 01 111 555.') - ... ).nodes - ... ) - 10 - - >>> print( - ... next( - ... dep_parser.raw_parse('The underscore _ should not simply disappear.') - ... ).to_conll(4) - ... ) # doctest: +NORMALIZE_WHITESPACE - The DT 2 det - underscore NN 7 nsubj - _ NFP 7 punct - should MD 7 aux - not RB 7 advmod - simply RB 7 advmod - disappear VB 0 ROOT - . . 7 punct - - >>> print( - ... next( - ... dep_parser.raw_parse( - ... 'for all of its insights into the dream world of teen life , and its electronic expression through ' - ... 'cyber culture , the film gives no quarter to anyone seeking to pull a cohesive story out of its 2 ' - ... '1/2-hour running time .' - ... ) - ... ).to_conll(4) - ... ) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS - for IN 2 case - all DT 24 obl - of IN 5 case - its PRP$ 5 nmod:poss - insights NNS 2 nmod - into IN 9 case - the DT 9 det - dream NN 9 compound - world NN 5 nmod - of IN 12 case - teen NN 12 compound - ... - - >>> server.stop() - """ - - _OUTPUT_FORMAT = "conll2007" - parser_annotator = "depparse" - - def make_tree(self, result): - - return DependencyGraph( - ( - " ".join(n_items[1:]) # NLTK expects an iterable of strings... - for n_items in sorted(transform(result)) - ), - cell_separator=" ", # To make sure that a non-breaking space is kept inside of a token. - ) - - -def transform(sentence): - for dependency in sentence["basicDependencies"]: - - dependent_index = dependency["dependent"] - token = sentence["tokens"][dependent_index - 1] - - # Return values that we don't know as '_'. Also, consider tag and ctag - # to be equal. - yield ( - dependent_index, - "_", - token["word"], - token["lemma"], - token["pos"], - token["pos"], - "_", - str(dependency["governor"]), - dependency["dep"], - "_", - "_", - ) diff --git a/pipeline/nltk/parse/dependencygraph.py b/pipeline/nltk/parse/dependencygraph.py deleted file mode 100644 index 7300f0596baf3a6fdae5a4183aca862c1aa09ea8..0000000000000000000000000000000000000000 --- a/pipeline/nltk/parse/dependencygraph.py +++ /dev/null @@ -1,799 +0,0 @@ -# Natural Language Toolkit: Dependency Grammars -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Jason Narad -# Steven Bird (modifications) -# -# URL: -# For license information, see LICENSE.TXT -# - -""" -Tools for reading and writing dependency trees. -The input is assumed to be in Malt-TAB format -(https://stp.lingfil.uu.se/~nivre/research/MaltXML.html). -""" - -import subprocess -import warnings -from collections import defaultdict -from itertools import chain -from pprint import pformat - -from nltk.internals import find_binary -from nltk.tree import Tree - -################################################################# -# DependencyGraph Class -################################################################# - - -class DependencyGraph: - """ - A container for the nodes and labelled edges of a dependency structure. - """ - - def __init__( - self, - tree_str=None, - cell_extractor=None, - zero_based=False, - cell_separator=None, - top_relation_label="ROOT", - ): - """Dependency graph. - - We place a dummy `TOP` node with the index 0, since the root node is - often assigned 0 as its head. This also means that the indexing of the - nodes corresponds directly to the Malt-TAB format, which starts at 1. - - If zero-based is True, then Malt-TAB-like input with node numbers - starting at 0 and the root node assigned -1 (as produced by, e.g., - zpar). - - :param str cell_separator: the cell separator. 
If not provided, cells - are split by whitespace. - - :param str top_relation_label: the label by which the top relation is - identified, for examlple, `ROOT`, `null` or `TOP`. - """ - self.nodes = defaultdict( - lambda: { - "address": None, - "word": None, - "lemma": None, - "ctag": None, - "tag": None, - "feats": None, - "head": None, - "deps": defaultdict(list), - "rel": None, - } - ) - - self.nodes[0].update({"ctag": "TOP", "tag": "TOP", "address": 0}) - - self.root = None - - if tree_str: - self._parse( - tree_str, - cell_extractor=cell_extractor, - zero_based=zero_based, - cell_separator=cell_separator, - top_relation_label=top_relation_label, - ) - - def remove_by_address(self, address): - """ - Removes the node with the given address. References - to this node in others will still exist. - """ - del self.nodes[address] - - def redirect_arcs(self, originals, redirect): - """ - Redirects arcs to any of the nodes in the originals list - to the redirect node address. - """ - for node in self.nodes.values(): - new_deps = [] - for dep in node["deps"]: - if dep in originals: - new_deps.append(redirect) - else: - new_deps.append(dep) - node["deps"] = new_deps - - def add_arc(self, head_address, mod_address): - """ - Adds an arc from the node specified by head_address to the - node specified by the mod address. - """ - relation = self.nodes[mod_address]["rel"] - self.nodes[head_address]["deps"].setdefault(relation, []) - self.nodes[head_address]["deps"][relation].append(mod_address) - # self.nodes[head_address]['deps'].append(mod_address) - - def connect_graph(self): - """ - Fully connects all non-root nodes. All nodes are set to be dependents - of the root node. - """ - for node1 in self.nodes.values(): - for node2 in self.nodes.values(): - if node1["address"] != node2["address"] and node2["rel"] != "TOP": - relation = node2["rel"] - node1["deps"].setdefault(relation, []) - node1["deps"][relation].append(node2["address"]) - # node1['deps'].append(node2['address']) - - def get_by_address(self, node_address): - """Return the node with the given address.""" - return self.nodes[node_address] - - def contains_address(self, node_address): - """ - Returns true if the graph contains a node with the given node - address, false otherwise. - """ - return node_address in self.nodes - - def to_dot(self): - """Return a dot representation suitable for using with Graphviz. - - >>> dg = DependencyGraph( - ... 'John N 2\\n' - ... 'loves V 0\\n' - ... 'Mary N 2' - ... ) - >>> print(dg.to_dot()) - digraph G{ - edge [dir=forward] - node [shape=plaintext] - - 0 [label="0 (None)"] - 0 -> 2 [label="ROOT"] - 1 [label="1 (John)"] - 2 [label="2 (loves)"] - 2 -> 1 [label=""] - 2 -> 3 [label=""] - 3 [label="3 (Mary)"] - } - - """ - # Start the digraph specification - s = "digraph G{\n" - s += "edge [dir=forward]\n" - s += "node [shape=plaintext]\n" - - # Draw the remaining nodes - for node in sorted(self.nodes.values(), key=lambda v: v["address"]): - s += '\n{} [label="{} ({})"]'.format( - node["address"], - node["address"], - node["word"], - ) - for rel, deps in node["deps"].items(): - for dep in deps: - if rel is not None: - s += '\n{} -> {} [label="{}"]'.format(node["address"], dep, rel) - else: - s += "\n{} -> {} ".format(node["address"], dep) - s += "\n}" - - return s - - def _repr_svg_(self): - """Show SVG representation of the transducer (IPython magic). - >>> from nltk.test.setup_fixt import check_binary - >>> check_binary('dot') - >>> dg = DependencyGraph( - ... 'John N 2\\n' - ... 'loves V 0\\n' - ... 
'Mary N 2' - ... ) - >>> dg._repr_svg_().split('\\n')[0] - '' - - """ - dot_string = self.to_dot() - return dot2img(dot_string) - - def __str__(self): - return pformat(self.nodes) - - def __repr__(self): - return f"" - - @staticmethod - def load( - filename, zero_based=False, cell_separator=None, top_relation_label="ROOT" - ): - """ - :param filename: a name of a file in Malt-TAB format - :param zero_based: nodes in the input file are numbered starting from 0 - rather than 1 (as produced by, e.g., zpar) - :param str cell_separator: the cell separator. If not provided, cells - are split by whitespace. - :param str top_relation_label: the label by which the top relation is - identified, for examlple, `ROOT`, `null` or `TOP`. - - :return: a list of DependencyGraphs - - """ - with open(filename) as infile: - return [ - DependencyGraph( - tree_str, - zero_based=zero_based, - cell_separator=cell_separator, - top_relation_label=top_relation_label, - ) - for tree_str in infile.read().split("\n\n") - ] - - def left_children(self, node_index): - """ - Returns the number of left children under the node specified - by the given address. - """ - children = chain.from_iterable(self.nodes[node_index]["deps"].values()) - index = self.nodes[node_index]["address"] - return sum(1 for c in children if c < index) - - def right_children(self, node_index): - """ - Returns the number of right children under the node specified - by the given address. - """ - children = chain.from_iterable(self.nodes[node_index]["deps"].values()) - index = self.nodes[node_index]["address"] - return sum(1 for c in children if c > index) - - def add_node(self, node): - if not self.contains_address(node["address"]): - self.nodes[node["address"]].update(node) - - def _parse( - self, - input_, - cell_extractor=None, - zero_based=False, - cell_separator=None, - top_relation_label="ROOT", - ): - """Parse a sentence. - - :param extractor: a function that given a tuple of cells returns a - 7-tuple, where the values are ``word, lemma, ctag, tag, feats, head, - rel``. - - :param str cell_separator: the cell separator. If not provided, cells - are split by whitespace. - - :param str top_relation_label: the label by which the top relation is - identified, for examlple, `ROOT`, `null` or `TOP`. 
- - """ - - def extract_3_cells(cells, index): - word, tag, head = cells - return index, word, word, tag, tag, "", head, "" - - def extract_4_cells(cells, index): - word, tag, head, rel = cells - return index, word, word, tag, tag, "", head, rel - - def extract_7_cells(cells, index): - line_index, word, lemma, tag, _, head, rel = cells - try: - index = int(line_index) - except ValueError: - # index can't be parsed as an integer, use default - pass - return index, word, lemma, tag, tag, "", head, rel - - def extract_10_cells(cells, index): - line_index, word, lemma, ctag, tag, feats, head, rel, _, _ = cells - try: - index = int(line_index) - except ValueError: - # index can't be parsed as an integer, use default - pass - return index, word, lemma, ctag, tag, feats, head, rel - - extractors = { - 3: extract_3_cells, - 4: extract_4_cells, - 7: extract_7_cells, - 10: extract_10_cells, - } - - if isinstance(input_, str): - input_ = (line for line in input_.split("\n")) - - lines = (l.rstrip() for l in input_) - lines = (l for l in lines if l) - - cell_number = None - for index, line in enumerate(lines, start=1): - cells = line.split(cell_separator) - if cell_number is None: - cell_number = len(cells) - else: - assert cell_number == len(cells) - - if cell_extractor is None: - try: - cell_extractor = extractors[cell_number] - except KeyError as e: - raise ValueError( - "Number of tab-delimited fields ({}) not supported by " - "CoNLL(10) or Malt-Tab(4) format".format(cell_number) - ) from e - - try: - index, word, lemma, ctag, tag, feats, head, rel = cell_extractor( - cells, index - ) - except (TypeError, ValueError): - # cell_extractor doesn't take 2 arguments or doesn't return 8 - # values; assume the cell_extractor is an older external - # extractor and doesn't accept or return an index. - word, lemma, ctag, tag, feats, head, rel = cell_extractor(cells) - - if head == "_": - continue - - head = int(head) - if zero_based: - head += 1 - - self.nodes[index].update( - { - "address": index, - "word": word, - "lemma": lemma, - "ctag": ctag, - "tag": tag, - "feats": feats, - "head": head, - "rel": rel, - } - ) - - # Make sure that the fake root node has labeled dependencies. - if (cell_number == 3) and (head == 0): - rel = top_relation_label - self.nodes[head]["deps"][rel].append(index) - - if self.nodes[0]["deps"][top_relation_label]: - root_address = self.nodes[0]["deps"][top_relation_label][0] - self.root = self.nodes[root_address] - self.top_relation_label = top_relation_label - else: - warnings.warn( - "The graph doesn't contain a node " "that depends on the root element." - ) - - def _word(self, node, filter=True): - w = node["word"] - if filter: - if w != ",": - return w - return w - - def _tree(self, i): - """Turn dependency graphs into NLTK trees. - - :param int i: index of a node - :return: either a word (if the indexed node is a leaf) or a ``Tree``. - """ - node = self.get_by_address(i) - word = node["word"] - deps = sorted(chain.from_iterable(node["deps"].values())) - - if deps: - return Tree(word, [self._tree(dep) for dep in deps]) - else: - return word - - def tree(self): - """ - Starting with the ``root`` node, build a dependency tree using the NLTK - ``Tree`` constructor. Dependency labels are omitted. 
- """ - node = self.root - - word = node["word"] - deps = sorted(chain.from_iterable(node["deps"].values())) - return Tree(word, [self._tree(dep) for dep in deps]) - - def triples(self, node=None): - """ - Extract dependency triples of the form: - ((head word, head tag), rel, (dep word, dep tag)) - """ - - if not node: - node = self.root - - head = (node["word"], node["ctag"]) - for i in sorted(chain.from_iterable(node["deps"].values())): - dep = self.get_by_address(i) - yield (head, dep["rel"], (dep["word"], dep["ctag"])) - yield from self.triples(node=dep) - - def _hd(self, i): - try: - return self.nodes[i]["head"] - except IndexError: - return None - - def _rel(self, i): - try: - return self.nodes[i]["rel"] - except IndexError: - return None - - # what's the return type? Boolean or list? - def contains_cycle(self): - """Check whether there are cycles. - - >>> dg = DependencyGraph(treebank_data) - >>> dg.contains_cycle() - False - - >>> cyclic_dg = DependencyGraph() - >>> top = {'word': None, 'deps': [1], 'rel': 'TOP', 'address': 0} - >>> child1 = {'word': None, 'deps': [2], 'rel': 'NTOP', 'address': 1} - >>> child2 = {'word': None, 'deps': [4], 'rel': 'NTOP', 'address': 2} - >>> child3 = {'word': None, 'deps': [1], 'rel': 'NTOP', 'address': 3} - >>> child4 = {'word': None, 'deps': [3], 'rel': 'NTOP', 'address': 4} - >>> cyclic_dg.nodes = { - ... 0: top, - ... 1: child1, - ... 2: child2, - ... 3: child3, - ... 4: child4, - ... } - >>> cyclic_dg.root = top - - >>> cyclic_dg.contains_cycle() - [1, 2, 4, 3] - - """ - distances = {} - - for node in self.nodes.values(): - for dep in node["deps"]: - key = tuple([node["address"], dep]) - distances[key] = 1 - - for _ in self.nodes: - new_entries = {} - - for pair1 in distances: - for pair2 in distances: - if pair1[1] == pair2[0]: - key = tuple([pair1[0], pair2[1]]) - new_entries[key] = distances[pair1] + distances[pair2] - - for pair in new_entries: - distances[pair] = new_entries[pair] - if pair[0] == pair[1]: - path = self.get_cycle_path(self.get_by_address(pair[0]), pair[0]) - return path - - return False # return []? - - def get_cycle_path(self, curr_node, goal_node_index): - for dep in curr_node["deps"]: - if dep == goal_node_index: - return [curr_node["address"]] - for dep in curr_node["deps"]: - path = self.get_cycle_path(self.get_by_address(dep), goal_node_index) - if len(path) > 0: - path.insert(0, curr_node["address"]) - return path - return [] - - def to_conll(self, style): - """ - The dependency graph in CoNLL format. 
- - :param style: the style to use for the format (3, 4, 10 columns) - :type style: int - :rtype: str - """ - - if style == 3: - template = "{word}\t{tag}\t{head}\n" - elif style == 4: - template = "{word}\t{tag}\t{head}\t{rel}\n" - elif style == 10: - template = ( - "{i}\t{word}\t{lemma}\t{ctag}\t{tag}\t{feats}\t{head}\t{rel}\t_\t_\n" - ) - else: - raise ValueError( - "Number of tab-delimited fields ({}) not supported by " - "CoNLL(10) or Malt-Tab(4) format".format(style) - ) - - return "".join( - template.format(i=i, **node) - for i, node in sorted(self.nodes.items()) - if node["tag"] != "TOP" - ) - - def nx_graph(self): - """Convert the data in a ``nodelist`` into a networkx labeled directed graph.""" - import networkx - - nx_nodelist = list(range(1, len(self.nodes))) - nx_edgelist = [ - (n, self._hd(n), self._rel(n)) for n in nx_nodelist if self._hd(n) - ] - self.nx_labels = {} - for n in nx_nodelist: - self.nx_labels[n] = self.nodes[n]["word"] - - g = networkx.MultiDiGraph() - g.add_nodes_from(nx_nodelist) - g.add_edges_from(nx_edgelist) - - return g - - -def dot2img(dot_string, t="svg"): - """ - Create image representation fom dot_string, using the 'dot' program - from the Graphviz package. - - Use the 't' argument to specify the image file format, for ex. 'jpeg', 'eps', - 'json', 'png' or 'webp' (Running 'dot -T:' lists all available formats). - - Note that the "capture_output" option of subprocess.run() is only available - with text formats (like svg), but not with binary image formats (like png). - """ - - try: - find_binary("dot") - try: - if t in ["dot", "dot_json", "json", "svg"]: - proc = subprocess.run( - ["dot", "-T%s" % t], - capture_output=True, - input=dot_string, - text=True, - ) - else: - proc = subprocess.run( - ["dot", "-T%s" % t], - input=bytes(dot_string, encoding="utf8"), - ) - return proc.stdout - except: - raise Exception( - "Cannot create image representation by running dot from string: {}" - "".format(dot_string) - ) - except OSError as e: - raise Exception("Cannot find the dot binary from Graphviz package") from e - - -class DependencyGraphError(Exception): - """Dependency graph exception.""" - - -def demo(): - malt_demo() - conll_demo() - conll_file_demo() - cycle_finding_demo() - - -def malt_demo(nx=False): - """ - A demonstration of the result of reading a dependency - version of the first sentence of the Penn Treebank. - """ - dg = DependencyGraph( - """Pierre NNP 2 NMOD -Vinken NNP 8 SUB -, , 2 P -61 CD 5 NMOD -years NNS 6 AMOD -old JJ 2 NMOD -, , 2 P -will MD 0 ROOT -join VB 8 VC -the DT 11 NMOD -board NN 9 OBJ -as IN 9 VMOD -a DT 15 NMOD -nonexecutive JJ 15 NMOD -director NN 12 PMOD -Nov. NNP 9 VMOD -29 CD 16 NMOD -. . 9 VMOD -""" - ) - tree = dg.tree() - tree.pprint() - if nx: - # currently doesn't work - import networkx - from matplotlib import pylab - - g = dg.nx_graph() - g.info() - pos = networkx.spring_layout(g, dim=1) - networkx.draw_networkx_nodes(g, pos, node_size=50) - # networkx.draw_networkx_edges(g, pos, edge_color='k', width=8) - networkx.draw_networkx_labels(g, pos, dg.nx_labels) - pylab.xticks([]) - pylab.yticks([]) - pylab.savefig("tree.png") - pylab.show() - - -def conll_demo(): - """ - A demonstration of how to read a string representation of - a CoNLL format dependency tree. 
- """ - dg = DependencyGraph(conll_data1) - tree = dg.tree() - tree.pprint() - print(dg) - print(dg.to_conll(4)) - - -def conll_file_demo(): - print("Mass conll_read demo...") - graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry] - for graph in graphs: - tree = graph.tree() - print("\n") - tree.pprint() - - -def cycle_finding_demo(): - dg = DependencyGraph(treebank_data) - print(dg.contains_cycle()) - cyclic_dg = DependencyGraph() - cyclic_dg.add_node({"word": None, "deps": [1], "rel": "TOP", "address": 0}) - cyclic_dg.add_node({"word": None, "deps": [2], "rel": "NTOP", "address": 1}) - cyclic_dg.add_node({"word": None, "deps": [4], "rel": "NTOP", "address": 2}) - cyclic_dg.add_node({"word": None, "deps": [1], "rel": "NTOP", "address": 3}) - cyclic_dg.add_node({"word": None, "deps": [3], "rel": "NTOP", "address": 4}) - print(cyclic_dg.contains_cycle()) - - -treebank_data = """Pierre NNP 2 NMOD -Vinken NNP 8 SUB -, , 2 P -61 CD 5 NMOD -years NNS 6 AMOD -old JJ 2 NMOD -, , 2 P -will MD 0 ROOT -join VB 8 VC -the DT 11 NMOD -board NN 9 OBJ -as IN 9 VMOD -a DT 15 NMOD -nonexecutive JJ 15 NMOD -director NN 12 PMOD -Nov. NNP 9 VMOD -29 CD 16 NMOD -. . 9 VMOD -""" - -conll_data1 = """ -1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _ -2 had heb V V trans|ovt|1of2of3|ev 0 ROOT _ _ -3 met met Prep Prep voor 8 mod _ _ -4 haar haar Pron Pron bez|3|ev|neut|attr 5 det _ _ -5 moeder moeder N N soort|ev|neut 3 obj1 _ _ -6 kunnen kan V V hulp|ott|1of2of3|mv 2 vc _ _ -7 gaan ga V V hulp|inf 6 vc _ _ -8 winkelen winkel V V intrans|inf 11 cnj _ _ -9 , , Punc Punc komma 8 punct _ _ -10 zwemmen zwem V V intrans|inf 11 cnj _ _ -11 of of Conj Conj neven 7 vc _ _ -12 terrassen terras N N soort|mv|neut 11 cnj _ _ -13 . . Punc Punc punt 12 punct _ _ -""" - -conll_data2 = """1 Cathy Cathy N N eigen|ev|neut 2 su _ _ -2 zag zie V V trans|ovt|1of2of3|ev 0 ROOT _ _ -3 hen hen Pron Pron per|3|mv|datofacc 2 obj1 _ _ -4 wild wild Adj Adj attr|stell|onverv 5 mod _ _ -5 zwaaien zwaai N N soort|mv|neut 2 vc _ _ -6 . . Punc Punc punt 5 punct _ _ - -1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _ -2 had heb V V trans|ovt|1of2of3|ev 0 ROOT _ _ -3 met met Prep Prep voor 8 mod _ _ -4 haar haar Pron Pron bez|3|ev|neut|attr 5 det _ _ -5 moeder moeder N N soort|ev|neut 3 obj1 _ _ -6 kunnen kan V V hulp|ott|1of2of3|mv 2 vc _ _ -7 gaan ga V V hulp|inf 6 vc _ _ -8 winkelen winkel V V intrans|inf 11 cnj _ _ -9 , , Punc Punc komma 8 punct _ _ -10 zwemmen zwem V V intrans|inf 11 cnj _ _ -11 of of Conj Conj neven 7 vc _ _ -12 terrassen terras N N soort|mv|neut 11 cnj _ _ -13 . . Punc Punc punt 12 punct _ _ - -1 Dat dat Pron Pron aanw|neut|attr 2 det _ _ -2 werkwoord werkwoord N N soort|ev|neut 6 obj1 _ _ -3 had heb V V hulp|ovt|1of2of3|ev 0 ROOT _ _ -4 ze ze Pron Pron per|3|evofmv|nom 6 su _ _ -5 zelf zelf Pron Pron aanw|neut|attr|wzelf 3 predm _ _ -6 uitgevonden vind V V trans|verldw|onverv 3 vc _ _ -7 . . Punc Punc punt 6 punct _ _ - -1 Het het Pron Pron onbep|neut|zelfst 2 su _ _ -2 hoorde hoor V V trans|ovt|1of2of3|ev 0 ROOT _ _ -3 bij bij Prep Prep voor 2 ld _ _ -4 de de Art Art bep|zijdofmv|neut 6 det _ _ -5 warme warm Adj Adj attr|stell|vervneut 6 mod _ _ -6 zomerdag zomerdag N N soort|ev|neut 3 obj1 _ _ -7 die die Pron Pron betr|neut|zelfst 6 mod _ _ -8 ze ze Pron Pron per|3|evofmv|nom 12 su _ _ -9 ginds ginds Adv Adv gew|aanw 12 mod _ _ -10 achter achter Adv Adv gew|geenfunc|stell|onverv 12 svp _ _ -11 had heb V V hulp|ovt|1of2of3|ev 7 body _ _ -12 gelaten laat V V trans|verldw|onverv 11 vc _ _ -13 . . 
Punc Punc punt 12 punct _ _ - -1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _ -2 hadden heb V V trans|ovt|1of2of3|mv 0 ROOT _ _ -3 languit languit Adv Adv gew|geenfunc|stell|onverv 11 mod _ _ -4 naast naast Prep Prep voor 11 mod _ _ -5 elkaar elkaar Pron Pron rec|neut 4 obj1 _ _ -6 op op Prep Prep voor 11 ld _ _ -7 de de Art Art bep|zijdofmv|neut 8 det _ _ -8 strandstoelen strandstoel N N soort|mv|neut 6 obj1 _ _ -9 kunnen kan V V hulp|inf 2 vc _ _ -10 gaan ga V V hulp|inf 9 vc _ _ -11 liggen lig V V intrans|inf 10 vc _ _ -12 . . Punc Punc punt 11 punct _ _ - -1 Zij zij Pron Pron per|3|evofmv|nom 2 su _ _ -2 zou zal V V hulp|ovt|1of2of3|ev 7 cnj _ _ -3 mams mams N N soort|ev|neut 4 det _ _ -4 rug rug N N soort|ev|neut 5 obj1 _ _ -5 ingewreven wrijf V V trans|verldw|onverv 6 vc _ _ -6 hebben heb V V hulp|inf 2 vc _ _ -7 en en Conj Conj neven 0 ROOT _ _ -8 mam mam V V trans|ovt|1of2of3|ev 7 cnj _ _ -9 de de Art Art bep|zijdofmv|neut 10 det _ _ -10 hare hare Pron Pron bez|3|ev|neut|attr 8 obj1 _ _ -11 . . Punc Punc punt 10 punct _ _ - -1 Of of Conj Conj onder|metfin 0 ROOT _ _ -2 ze ze Pron Pron per|3|evofmv|nom 3 su _ _ -3 had heb V V hulp|ovt|1of2of3|ev 0 ROOT _ _ -4 gewoon gewoon Adj Adj adv|stell|onverv 10 mod _ _ -5 met met Prep Prep voor 10 mod _ _ -6 haar haar Pron Pron bez|3|ev|neut|attr 7 det _ _ -7 vriendinnen vriendin N N soort|mv|neut 5 obj1 _ _ -8 rond rond Adv Adv deelv 10 svp _ _ -9 kunnen kan V V hulp|inf 3 vc _ _ -10 slenteren slenter V V intrans|inf 9 vc _ _ -11 in in Prep Prep voor 10 mod _ _ -12 de de Art Art bep|zijdofmv|neut 13 det _ _ -13 buurt buurt N N soort|ev|neut 11 obj1 _ _ -14 van van Prep Prep voor 13 mod _ _ -15 Trafalgar_Square Trafalgar_Square MWU N_N eigen|ev|neut_eigen|ev|neut 14 obj1 _ _ -16 . . Punc Punc punt 15 punct _ _ -""" - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/parse/earleychart.py b/pipeline/nltk/parse/earleychart.py deleted file mode 100644 index 1054e114c8e3177754ed895b67ac2b2f4d39cc21..0000000000000000000000000000000000000000 --- a/pipeline/nltk/parse/earleychart.py +++ /dev/null @@ -1,552 +0,0 @@ -# Natural Language Toolkit: An Incremental Earley Chart Parser -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Peter Ljunglöf -# Rob Speer -# Edward Loper -# Steven Bird -# Jean Mark Gawron -# URL: -# For license information, see LICENSE.TXT - -""" -Data classes and parser implementations for *incremental* chart -parsers, which use dynamic programming to efficiently parse a text. -A "chart parser" derives parse trees for a text by iteratively adding -\"edges\" to a \"chart\". Each "edge" represents a hypothesis about the tree -structure for a subsequence of the text. The "chart" is a -\"blackboard\" for composing and combining these hypotheses. - -A parser is "incremental", if it guarantees that for all i, j where i < j, -all edges ending at i are built before any edges ending at j. -This is appealing for, say, speech recognizer hypothesis filtering. - -The main parser class is ``EarleyChartParser``, which is a top-down -algorithm, originally formulated by Jay Earley (1970). 
-""" - -from time import perf_counter - -from nltk.parse.chart import ( - BottomUpPredictCombineRule, - BottomUpPredictRule, - CachedTopDownPredictRule, - Chart, - ChartParser, - EdgeI, - EmptyPredictRule, - FilteredBottomUpPredictCombineRule, - FilteredSingleEdgeFundamentalRule, - LeafEdge, - LeafInitRule, - SingleEdgeFundamentalRule, - TopDownInitRule, -) -from nltk.parse.featurechart import ( - FeatureBottomUpPredictCombineRule, - FeatureBottomUpPredictRule, - FeatureChart, - FeatureChartParser, - FeatureEmptyPredictRule, - FeatureSingleEdgeFundamentalRule, - FeatureTopDownInitRule, - FeatureTopDownPredictRule, -) - -# //////////////////////////////////////////////////////////// -# Incremental Chart -# //////////////////////////////////////////////////////////// - - -class IncrementalChart(Chart): - def initialize(self): - # A sequence of edge lists contained in this chart. - self._edgelists = tuple([] for x in self._positions()) - - # The set of child pointer lists associated with each edge. - self._edge_to_cpls = {} - - # Indexes mapping attribute values to lists of edges - # (used by select()). - self._indexes = {} - - def edges(self): - return list(self.iteredges()) - - def iteredges(self): - return (edge for edgelist in self._edgelists for edge in edgelist) - - def select(self, end, **restrictions): - edgelist = self._edgelists[end] - - # If there are no restrictions, then return all edges. - if restrictions == {}: - return iter(edgelist) - - # Find the index corresponding to the given restrictions. - restr_keys = sorted(restrictions.keys()) - restr_keys = tuple(restr_keys) - - # If it doesn't exist, then create it. - if restr_keys not in self._indexes: - self._add_index(restr_keys) - - vals = tuple(restrictions[key] for key in restr_keys) - return iter(self._indexes[restr_keys][end].get(vals, [])) - - def _add_index(self, restr_keys): - # Make sure it's a valid index. - for key in restr_keys: - if not hasattr(EdgeI, key): - raise ValueError("Bad restriction: %s" % key) - - # Create the index. - index = self._indexes[restr_keys] = tuple({} for x in self._positions()) - - # Add all existing edges to the index. - for end, edgelist in enumerate(self._edgelists): - this_index = index[end] - for edge in edgelist: - vals = tuple(getattr(edge, key)() for key in restr_keys) - this_index.setdefault(vals, []).append(edge) - - def _register_with_indexes(self, edge): - end = edge.end() - for (restr_keys, index) in self._indexes.items(): - vals = tuple(getattr(edge, key)() for key in restr_keys) - index[end].setdefault(vals, []).append(edge) - - def _append_edge(self, edge): - self._edgelists[edge.end()].append(edge) - - def _positions(self): - return range(self.num_leaves() + 1) - - -class FeatureIncrementalChart(IncrementalChart, FeatureChart): - def select(self, end, **restrictions): - edgelist = self._edgelists[end] - - # If there are no restrictions, then return all edges. - if restrictions == {}: - return iter(edgelist) - - # Find the index corresponding to the given restrictions. - restr_keys = sorted(restrictions.keys()) - restr_keys = tuple(restr_keys) - - # If it doesn't exist, then create it. - if restr_keys not in self._indexes: - self._add_index(restr_keys) - - vals = tuple( - self._get_type_if_possible(restrictions[key]) for key in restr_keys - ) - return iter(self._indexes[restr_keys][end].get(vals, [])) - - def _add_index(self, restr_keys): - # Make sure it's a valid index. 
- for key in restr_keys: - if not hasattr(EdgeI, key): - raise ValueError("Bad restriction: %s" % key) - - # Create the index. - index = self._indexes[restr_keys] = tuple({} for x in self._positions()) - - # Add all existing edges to the index. - for end, edgelist in enumerate(self._edgelists): - this_index = index[end] - for edge in edgelist: - vals = tuple( - self._get_type_if_possible(getattr(edge, key)()) - for key in restr_keys - ) - this_index.setdefault(vals, []).append(edge) - - def _register_with_indexes(self, edge): - end = edge.end() - for (restr_keys, index) in self._indexes.items(): - vals = tuple( - self._get_type_if_possible(getattr(edge, key)()) for key in restr_keys - ) - index[end].setdefault(vals, []).append(edge) - - -# //////////////////////////////////////////////////////////// -# Incremental CFG Rules -# //////////////////////////////////////////////////////////// - - -class CompleteFundamentalRule(SingleEdgeFundamentalRule): - def _apply_incomplete(self, chart, grammar, left_edge): - end = left_edge.end() - # When the chart is incremental, we only have to look for - # empty complete edges here. - for right_edge in chart.select( - start=end, end=end, is_complete=True, lhs=left_edge.nextsym() - ): - new_edge = left_edge.move_dot_forward(right_edge.end()) - if chart.insert_with_backpointer(new_edge, left_edge, right_edge): - yield new_edge - - -class CompleterRule(CompleteFundamentalRule): - _fundamental_rule = CompleteFundamentalRule() - - def apply(self, chart, grammar, edge): - if not isinstance(edge, LeafEdge): - yield from self._fundamental_rule.apply(chart, grammar, edge) - - -class ScannerRule(CompleteFundamentalRule): - _fundamental_rule = CompleteFundamentalRule() - - def apply(self, chart, grammar, edge): - if isinstance(edge, LeafEdge): - yield from self._fundamental_rule.apply(chart, grammar, edge) - - -class PredictorRule(CachedTopDownPredictRule): - pass - - -class FilteredCompleteFundamentalRule(FilteredSingleEdgeFundamentalRule): - def apply(self, chart, grammar, edge): - # Since the Filtered rule only works for grammars without empty productions, - # we only have to bother with complete edges here. - if edge.is_complete(): - yield from self._apply_complete(chart, grammar, edge) - - -# //////////////////////////////////////////////////////////// -# Incremental FCFG Rules -# //////////////////////////////////////////////////////////// - - -class FeatureCompleteFundamentalRule(FeatureSingleEdgeFundamentalRule): - def _apply_incomplete(self, chart, grammar, left_edge): - fr = self._fundamental_rule - end = left_edge.end() - # When the chart is incremental, we only have to look for - # empty complete edges here. 
- for right_edge in chart.select( - start=end, end=end, is_complete=True, lhs=left_edge.nextsym() - ): - yield from fr.apply(chart, grammar, left_edge, right_edge) - - -class FeatureCompleterRule(CompleterRule): - _fundamental_rule = FeatureCompleteFundamentalRule() - - -class FeatureScannerRule(ScannerRule): - _fundamental_rule = FeatureCompleteFundamentalRule() - - -class FeaturePredictorRule(FeatureTopDownPredictRule): - pass - - -# //////////////////////////////////////////////////////////// -# Incremental CFG Chart Parsers -# //////////////////////////////////////////////////////////// - -EARLEY_STRATEGY = [ - LeafInitRule(), - TopDownInitRule(), - CompleterRule(), - ScannerRule(), - PredictorRule(), -] -TD_INCREMENTAL_STRATEGY = [ - LeafInitRule(), - TopDownInitRule(), - CachedTopDownPredictRule(), - CompleteFundamentalRule(), -] -BU_INCREMENTAL_STRATEGY = [ - LeafInitRule(), - EmptyPredictRule(), - BottomUpPredictRule(), - CompleteFundamentalRule(), -] -BU_LC_INCREMENTAL_STRATEGY = [ - LeafInitRule(), - EmptyPredictRule(), - BottomUpPredictCombineRule(), - CompleteFundamentalRule(), -] - -LC_INCREMENTAL_STRATEGY = [ - LeafInitRule(), - FilteredBottomUpPredictCombineRule(), - FilteredCompleteFundamentalRule(), -] - - -class IncrementalChartParser(ChartParser): - """ - An *incremental* chart parser implementing Jay Earley's - parsing algorithm: - - | For each index end in [0, 1, ..., N]: - | For each edge such that edge.end = end: - | If edge is incomplete and edge.next is not a part of speech: - | Apply PredictorRule to edge - | If edge is incomplete and edge.next is a part of speech: - | Apply ScannerRule to edge - | If edge is complete: - | Apply CompleterRule to edge - | Return any complete parses in the chart - """ - - def __init__( - self, - grammar, - strategy=BU_LC_INCREMENTAL_STRATEGY, - trace=0, - trace_chart_width=50, - chart_class=IncrementalChart, - ): - """ - Create a new Earley chart parser, that uses ``grammar`` to - parse texts. - - :type grammar: CFG - :param grammar: The grammar used to parse texts. - :type trace: int - :param trace: The level of tracing that should be used when - parsing a text. ``0`` will generate no tracing output; - and higher numbers will produce more verbose tracing - output. - :type trace_chart_width: int - :param trace_chart_width: The default total width reserved for - the chart in trace output. The remainder of each line will - be used to display edges. - :param chart_class: The class that should be used to create - the charts used by this parser. - """ - self._grammar = grammar - self._trace = trace - self._trace_chart_width = trace_chart_width - self._chart_class = chart_class - - self._axioms = [] - self._inference_rules = [] - for rule in strategy: - if rule.NUM_EDGES == 0: - self._axioms.append(rule) - elif rule.NUM_EDGES == 1: - self._inference_rules.append(rule) - else: - raise ValueError( - "Incremental inference rules must have " "NUM_EDGES == 0 or 1" - ) - - def chart_parse(self, tokens, trace=None): - if trace is None: - trace = self._trace - trace_new_edges = self._trace_new_edges - - tokens = list(tokens) - self._grammar.check_coverage(tokens) - chart = self._chart_class(tokens) - grammar = self._grammar - - # Width, for printing trace edges. 
- trace_edge_width = self._trace_chart_width // (chart.num_leaves() + 1) - if trace: - print(chart.pretty_format_leaves(trace_edge_width)) - - for axiom in self._axioms: - new_edges = list(axiom.apply(chart, grammar)) - trace_new_edges(chart, axiom, new_edges, trace, trace_edge_width) - - inference_rules = self._inference_rules - for end in range(chart.num_leaves() + 1): - if trace > 1: - print("\n* Processing queue:", end, "\n") - agenda = list(chart.select(end=end)) - while agenda: - edge = agenda.pop() - for rule in inference_rules: - new_edges = list(rule.apply(chart, grammar, edge)) - trace_new_edges(chart, rule, new_edges, trace, trace_edge_width) - for new_edge in new_edges: - if new_edge.end() == end: - agenda.append(new_edge) - - return chart - - -class EarleyChartParser(IncrementalChartParser): - def __init__(self, grammar, **parser_args): - IncrementalChartParser.__init__(self, grammar, EARLEY_STRATEGY, **parser_args) - - -class IncrementalTopDownChartParser(IncrementalChartParser): - def __init__(self, grammar, **parser_args): - IncrementalChartParser.__init__( - self, grammar, TD_INCREMENTAL_STRATEGY, **parser_args - ) - - -class IncrementalBottomUpChartParser(IncrementalChartParser): - def __init__(self, grammar, **parser_args): - IncrementalChartParser.__init__( - self, grammar, BU_INCREMENTAL_STRATEGY, **parser_args - ) - - -class IncrementalBottomUpLeftCornerChartParser(IncrementalChartParser): - def __init__(self, grammar, **parser_args): - IncrementalChartParser.__init__( - self, grammar, BU_LC_INCREMENTAL_STRATEGY, **parser_args - ) - - -class IncrementalLeftCornerChartParser(IncrementalChartParser): - def __init__(self, grammar, **parser_args): - if not grammar.is_nonempty(): - raise ValueError( - "IncrementalLeftCornerParser only works for grammars " - "without empty productions." 
- ) - IncrementalChartParser.__init__( - self, grammar, LC_INCREMENTAL_STRATEGY, **parser_args - ) - - -# //////////////////////////////////////////////////////////// -# Incremental FCFG Chart Parsers -# //////////////////////////////////////////////////////////// - -EARLEY_FEATURE_STRATEGY = [ - LeafInitRule(), - FeatureTopDownInitRule(), - FeatureCompleterRule(), - FeatureScannerRule(), - FeaturePredictorRule(), -] -TD_INCREMENTAL_FEATURE_STRATEGY = [ - LeafInitRule(), - FeatureTopDownInitRule(), - FeatureTopDownPredictRule(), - FeatureCompleteFundamentalRule(), -] -BU_INCREMENTAL_FEATURE_STRATEGY = [ - LeafInitRule(), - FeatureEmptyPredictRule(), - FeatureBottomUpPredictRule(), - FeatureCompleteFundamentalRule(), -] -BU_LC_INCREMENTAL_FEATURE_STRATEGY = [ - LeafInitRule(), - FeatureEmptyPredictRule(), - FeatureBottomUpPredictCombineRule(), - FeatureCompleteFundamentalRule(), -] - - -class FeatureIncrementalChartParser(IncrementalChartParser, FeatureChartParser): - def __init__( - self, - grammar, - strategy=BU_LC_INCREMENTAL_FEATURE_STRATEGY, - trace_chart_width=20, - chart_class=FeatureIncrementalChart, - **parser_args - ): - IncrementalChartParser.__init__( - self, - grammar, - strategy=strategy, - trace_chart_width=trace_chart_width, - chart_class=chart_class, - **parser_args - ) - - -class FeatureEarleyChartParser(FeatureIncrementalChartParser): - def __init__(self, grammar, **parser_args): - FeatureIncrementalChartParser.__init__( - self, grammar, EARLEY_FEATURE_STRATEGY, **parser_args - ) - - -class FeatureIncrementalTopDownChartParser(FeatureIncrementalChartParser): - def __init__(self, grammar, **parser_args): - FeatureIncrementalChartParser.__init__( - self, grammar, TD_INCREMENTAL_FEATURE_STRATEGY, **parser_args - ) - - -class FeatureIncrementalBottomUpChartParser(FeatureIncrementalChartParser): - def __init__(self, grammar, **parser_args): - FeatureIncrementalChartParser.__init__( - self, grammar, BU_INCREMENTAL_FEATURE_STRATEGY, **parser_args - ) - - -class FeatureIncrementalBottomUpLeftCornerChartParser(FeatureIncrementalChartParser): - def __init__(self, grammar, **parser_args): - FeatureIncrementalChartParser.__init__( - self, grammar, BU_LC_INCREMENTAL_FEATURE_STRATEGY, **parser_args - ) - - -# //////////////////////////////////////////////////////////// -# Demonstration -# //////////////////////////////////////////////////////////// - - -def demo( - print_times=True, - print_grammar=False, - print_trees=True, - trace=2, - sent="I saw John with a dog with my cookie", - numparses=5, -): - """ - A demonstration of the Earley parsers. - """ - import sys - import time - - from nltk.parse.chart import demo_grammar - - # The grammar for ChartParser and SteppingChartParser: - grammar = demo_grammar() - if print_grammar: - print("* Grammar") - print(grammar) - - # Tokenize the sample sentence. - print("* Sentence:") - print(sent) - tokens = sent.split() - print(tokens) - print() - - # Do the parsing. - earley = EarleyChartParser(grammar, trace=trace) - t = perf_counter() - chart = earley.chart_parse(tokens) - parses = list(chart.parses(grammar.start())) - t = perf_counter() - t - - # Print results. 
- if numparses: - assert len(parses) == numparses, "Not all parses found" - if print_trees: - for tree in parses: - print(tree) - else: - print("Nr trees:", len(parses)) - if print_times: - print("Time:", t) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/parse/evaluate.py b/pipeline/nltk/parse/evaluate.py deleted file mode 100644 index 07ab1c9832b42be2e655663cacf87d84db5ea3a9..0000000000000000000000000000000000000000 --- a/pipeline/nltk/parse/evaluate.py +++ /dev/null @@ -1,129 +0,0 @@ -# Natural Language Toolkit: evaluation of dependency parser -# -# Author: Long Duong -# -# Copyright (C) 2001-2023 NLTK Project -# URL: -# For license information, see LICENSE.TXT - -import unicodedata - - -class DependencyEvaluator: - """ - Class for measuring labelled and unlabelled attachment score for - dependency parsing. Note that the evaluation ignores punctuation. - - >>> from nltk.parse import DependencyGraph, DependencyEvaluator - - >>> gold_sent = DependencyGraph(\""" - ... Pierre NNP 2 NMOD - ... Vinken NNP 8 SUB - ... , , 2 P - ... 61 CD 5 NMOD - ... years NNS 6 AMOD - ... old JJ 2 NMOD - ... , , 2 P - ... will MD 0 ROOT - ... join VB 8 VC - ... the DT 11 NMOD - ... board NN 9 OBJ - ... as IN 9 VMOD - ... a DT 15 NMOD - ... nonexecutive JJ 15 NMOD - ... director NN 12 PMOD - ... Nov. NNP 9 VMOD - ... 29 CD 16 NMOD - ... . . 9 VMOD - ... \""") - - >>> parsed_sent = DependencyGraph(\""" - ... Pierre NNP 8 NMOD - ... Vinken NNP 1 SUB - ... , , 3 P - ... 61 CD 6 NMOD - ... years NNS 6 AMOD - ... old JJ 2 NMOD - ... , , 3 AMOD - ... will MD 0 ROOT - ... join VB 8 VC - ... the DT 11 AMOD - ... board NN 9 OBJECT - ... as IN 9 NMOD - ... a DT 15 NMOD - ... nonexecutive JJ 15 NMOD - ... director NN 12 PMOD - ... Nov. NNP 9 VMOD - ... 29 CD 16 NMOD - ... . . 9 VMOD - ... \""") - - >>> de = DependencyEvaluator([parsed_sent],[gold_sent]) - >>> las, uas = de.eval() - >>> las - 0.6 - >>> uas - 0.8 - >>> abs(uas - 0.8) < 0.00001 - True - """ - - def __init__(self, parsed_sents, gold_sents): - """ - :param parsed_sents: the list of parsed_sents as the output of parser - :type parsed_sents: list(DependencyGraph) - """ - self._parsed_sents = parsed_sents - self._gold_sents = gold_sents - - def _remove_punct(self, inStr): - """ - Function to remove punctuation from Unicode string. - :param input: the input string - :return: Unicode string after remove all punctuation - """ - punc_cat = {"Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"} - return "".join(x for x in inStr if unicodedata.category(x) not in punc_cat) - - def eval(self): - """ - Return the Labeled Attachment Score (LAS) and Unlabeled Attachment Score (UAS) - - :return : tuple(float,float) - """ - if len(self._parsed_sents) != len(self._gold_sents): - raise ValueError( - " Number of parsed sentence is different with number of gold sentence." 
- ) - - corr = 0 - corrL = 0 - total = 0 - - for i in range(len(self._parsed_sents)): - parsed_sent_nodes = self._parsed_sents[i].nodes - gold_sent_nodes = self._gold_sents[i].nodes - - if len(parsed_sent_nodes) != len(gold_sent_nodes): - raise ValueError("Sentences must have equal length.") - - for parsed_node_address, parsed_node in parsed_sent_nodes.items(): - gold_node = gold_sent_nodes[parsed_node_address] - - if parsed_node["word"] is None: - continue - if parsed_node["word"] != gold_node["word"]: - raise ValueError("Sentence sequence is not matched.") - - # Ignore if word is punctuation by default - # if (parsed_sent[j]["word"] in string.punctuation): - if self._remove_punct(parsed_node["word"]) == "": - continue - - total += 1 - if parsed_node["head"] == gold_node["head"]: - corr += 1 - if parsed_node["rel"] == gold_node["rel"]: - corrL += 1 - - return corrL / total, corr / total diff --git a/pipeline/nltk/parse/featurechart.py b/pipeline/nltk/parse/featurechart.py deleted file mode 100644 index 0a981001e4f9ad301d4c564ac45c6a0bdcbd310e..0000000000000000000000000000000000000000 --- a/pipeline/nltk/parse/featurechart.py +++ /dev/null @@ -1,674 +0,0 @@ -# Natural Language Toolkit: Chart Parser for Feature-Based Grammars -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Rob Speer -# Peter Ljunglöf -# URL: -# For license information, see LICENSE.TXT - -""" -Extension of chart parsing implementation to handle grammars with -feature structures as nodes. -""" -from time import perf_counter - -from nltk.featstruct import TYPE, FeatStruct, find_variables, unify -from nltk.grammar import ( - CFG, - FeatStructNonterminal, - Nonterminal, - Production, - is_nonterminal, - is_terminal, -) -from nltk.parse.chart import ( - BottomUpPredictCombineRule, - BottomUpPredictRule, - CachedTopDownPredictRule, - Chart, - ChartParser, - EdgeI, - EmptyPredictRule, - FundamentalRule, - LeafInitRule, - SingleEdgeFundamentalRule, - TopDownInitRule, - TreeEdge, -) -from nltk.sem import logic -from nltk.tree import Tree - -# //////////////////////////////////////////////////////////// -# Tree Edge -# //////////////////////////////////////////////////////////// - - -class FeatureTreeEdge(TreeEdge): - """ - A specialized tree edge that allows shared variable bindings - between nonterminals on the left-hand side and right-hand side. - - Each ``FeatureTreeEdge`` contains a set of ``bindings``, i.e., a - dictionary mapping from variables to values. If the edge is not - complete, then these bindings are simply stored. However, if the - edge is complete, then the constructor applies these bindings to - every nonterminal in the edge whose symbol implements the - interface ``SubstituteBindingsI``. - """ - - def __init__(self, span, lhs, rhs, dot=0, bindings=None): - """ - Construct a new edge. If the edge is incomplete (i.e., if - ``dot alpha \* B1 beta][i:j]`` - - ``[B2 -> gamma \*][j:k]`` - - licenses the edge: - - - ``[A -> alpha B3 \* beta][i:j]`` - - assuming that B1 and B2 can be unified to generate B3. - """ - - def apply(self, chart, grammar, left_edge, right_edge): - # Make sure the rule is applicable. 
- if not ( - left_edge.end() == right_edge.start() - and left_edge.is_incomplete() - and right_edge.is_complete() - and isinstance(left_edge, FeatureTreeEdge) - ): - return - found = right_edge.lhs() - nextsym = left_edge.nextsym() - if isinstance(right_edge, FeatureTreeEdge): - if not is_nonterminal(nextsym): - return - if left_edge.nextsym()[TYPE] != right_edge.lhs()[TYPE]: - return - # Create a copy of the bindings. - bindings = left_edge.bindings() - # We rename vars here, because we don't want variables - # from the two different productions to match. - found = found.rename_variables(used_vars=left_edge.variables()) - # Unify B1 (left_edge.nextsym) with B2 (right_edge.lhs) to - # generate B3 (result). - result = unify(nextsym, found, bindings, rename_vars=False) - if result is None: - return - else: - if nextsym != found: - return - # Create a copy of the bindings. - bindings = left_edge.bindings() - - # Construct the new edge. - new_edge = left_edge.move_dot_forward(right_edge.end(), bindings) - - # Add it to the chart, with appropriate child pointers. - if chart.insert_with_backpointer(new_edge, left_edge, right_edge): - yield new_edge - - -class FeatureSingleEdgeFundamentalRule(SingleEdgeFundamentalRule): - """ - A specialized version of the completer / single edge fundamental rule - that operates on nonterminals whose symbols are ``FeatStructNonterminal``. - Rather than simply comparing the nonterminals for equality, they are - unified. - """ - - _fundamental_rule = FeatureFundamentalRule() - - def _apply_complete(self, chart, grammar, right_edge): - fr = self._fundamental_rule - for left_edge in chart.select( - end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs() - ): - yield from fr.apply(chart, grammar, left_edge, right_edge) - - def _apply_incomplete(self, chart, grammar, left_edge): - fr = self._fundamental_rule - for right_edge in chart.select( - start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym() - ): - yield from fr.apply(chart, grammar, left_edge, right_edge) - - -# //////////////////////////////////////////////////////////// -# Top-Down Prediction -# //////////////////////////////////////////////////////////// - - -class FeatureTopDownInitRule(TopDownInitRule): - def apply(self, chart, grammar): - for prod in grammar.productions(lhs=grammar.start()): - new_edge = FeatureTreeEdge.from_production(prod, 0) - if chart.insert(new_edge, ()): - yield new_edge - - -class FeatureTopDownPredictRule(CachedTopDownPredictRule): - r""" - A specialized version of the (cached) top down predict rule that operates - on nonterminals whose symbols are ``FeatStructNonterminal``. Rather - than simply comparing the nonterminals for equality, they are - unified. - - The top down expand rule states that: - - - ``[A -> alpha \* B1 beta][i:j]`` - - licenses the edge: - - - ``[B2 -> \* gamma][j:j]`` - - for each grammar production ``B2 -> gamma``, assuming that B1 - and B2 can be unified. - """ - - def apply(self, chart, grammar, edge): - if edge.is_complete(): - return - nextsym, index = edge.nextsym(), edge.end() - if not is_nonterminal(nextsym): - return - - # If we've already applied this rule to an edge with the same - # next & end, and the chart & grammar have not changed, then - # just return (no new edges to add). 
- nextsym_with_bindings = edge.next_with_bindings() - done = self._done.get((nextsym_with_bindings, index), (None, None)) - if done[0] is chart and done[1] is grammar: - return - - for prod in grammar.productions(lhs=nextsym): - # If the left corner in the predicted production is - # leaf, it must match with the input. - if prod.rhs(): - first = prod.rhs()[0] - if is_terminal(first): - if index >= chart.num_leaves(): - continue - if first != chart.leaf(index): - continue - - # We rename vars here, because we don't want variables - # from the two different productions to match. - if unify(prod.lhs(), nextsym_with_bindings, rename_vars=True): - new_edge = FeatureTreeEdge.from_production(prod, edge.end()) - if chart.insert(new_edge, ()): - yield new_edge - - # Record the fact that we've applied this rule. - self._done[nextsym_with_bindings, index] = (chart, grammar) - - -# //////////////////////////////////////////////////////////// -# Bottom-Up Prediction -# //////////////////////////////////////////////////////////// - - -class FeatureBottomUpPredictRule(BottomUpPredictRule): - def apply(self, chart, grammar, edge): - if edge.is_incomplete(): - return - for prod in grammar.productions(rhs=edge.lhs()): - if isinstance(edge, FeatureTreeEdge): - _next = prod.rhs()[0] - if not is_nonterminal(_next): - continue - - new_edge = FeatureTreeEdge.from_production(prod, edge.start()) - if chart.insert(new_edge, ()): - yield new_edge - - -class FeatureBottomUpPredictCombineRule(BottomUpPredictCombineRule): - def apply(self, chart, grammar, edge): - if edge.is_incomplete(): - return - found = edge.lhs() - for prod in grammar.productions(rhs=found): - bindings = {} - if isinstance(edge, FeatureTreeEdge): - _next = prod.rhs()[0] - if not is_nonterminal(_next): - continue - - # We rename vars here, because we don't want variables - # from the two different productions to match. 
- used_vars = find_variables( - (prod.lhs(),) + prod.rhs(), fs_class=FeatStruct - ) - found = found.rename_variables(used_vars=used_vars) - - result = unify(_next, found, bindings, rename_vars=False) - if result is None: - continue - - new_edge = FeatureTreeEdge.from_production( - prod, edge.start() - ).move_dot_forward(edge.end(), bindings) - if chart.insert(new_edge, (edge,)): - yield new_edge - - -class FeatureEmptyPredictRule(EmptyPredictRule): - def apply(self, chart, grammar): - for prod in grammar.productions(empty=True): - for index in range(chart.num_leaves() + 1): - new_edge = FeatureTreeEdge.from_production(prod, index) - if chart.insert(new_edge, ()): - yield new_edge - - -# //////////////////////////////////////////////////////////// -# Feature Chart Parser -# //////////////////////////////////////////////////////////// - -TD_FEATURE_STRATEGY = [ - LeafInitRule(), - FeatureTopDownInitRule(), - FeatureTopDownPredictRule(), - FeatureSingleEdgeFundamentalRule(), -] -BU_FEATURE_STRATEGY = [ - LeafInitRule(), - FeatureEmptyPredictRule(), - FeatureBottomUpPredictRule(), - FeatureSingleEdgeFundamentalRule(), -] -BU_LC_FEATURE_STRATEGY = [ - LeafInitRule(), - FeatureEmptyPredictRule(), - FeatureBottomUpPredictCombineRule(), - FeatureSingleEdgeFundamentalRule(), -] - - -class FeatureChartParser(ChartParser): - def __init__( - self, - grammar, - strategy=BU_LC_FEATURE_STRATEGY, - trace_chart_width=20, - chart_class=FeatureChart, - **parser_args, - ): - ChartParser.__init__( - self, - grammar, - strategy=strategy, - trace_chart_width=trace_chart_width, - chart_class=chart_class, - **parser_args, - ) - - -class FeatureTopDownChartParser(FeatureChartParser): - def __init__(self, grammar, **parser_args): - FeatureChartParser.__init__(self, grammar, TD_FEATURE_STRATEGY, **parser_args) - - -class FeatureBottomUpChartParser(FeatureChartParser): - def __init__(self, grammar, **parser_args): - FeatureChartParser.__init__(self, grammar, BU_FEATURE_STRATEGY, **parser_args) - - -class FeatureBottomUpLeftCornerChartParser(FeatureChartParser): - def __init__(self, grammar, **parser_args): - FeatureChartParser.__init__( - self, grammar, BU_LC_FEATURE_STRATEGY, **parser_args - ) - - -# //////////////////////////////////////////////////////////// -# Instantiate Variable Chart -# //////////////////////////////////////////////////////////// - - -class InstantiateVarsChart(FeatureChart): - """ - A specialized chart that 'instantiates' variables whose names - start with '@', by replacing them with unique new variables. - In particular, whenever a complete edge is added to the chart, any - variables in the edge's ``lhs`` whose names start with '@' will be - replaced by unique new ``Variable``. - """ - - def __init__(self, tokens): - FeatureChart.__init__(self, tokens) - - def initialize(self): - self._instantiated = set() - FeatureChart.initialize(self) - - def insert(self, edge, child_pointer_list): - if edge in self._instantiated: - return False - self.instantiate_edge(edge) - return FeatureChart.insert(self, edge, child_pointer_list) - - def instantiate_edge(self, edge): - """ - If the edge is a ``FeatureTreeEdge``, and it is complete, - then instantiate all variables whose names start with '@', - by replacing them with unique new variables. - - Note that instantiation is done in-place, since the - parsing algorithms might already hold a reference to - the edge for future use. - """ - # If the edge is a leaf, or is not complete, or is - # already in the chart, then just return it as-is. 
- if not isinstance(edge, FeatureTreeEdge): - return - if not edge.is_complete(): - return - if edge in self._edge_to_cpls: - return - - # Get a list of variables that need to be instantiated. - # If there are none, then return as-is. - inst_vars = self.inst_vars(edge) - if not inst_vars: - return - - # Instantiate the edge! - self._instantiated.add(edge) - edge._lhs = edge.lhs().substitute_bindings(inst_vars) - - def inst_vars(self, edge): - return { - var: logic.unique_variable() - for var in edge.lhs().variables() - if var.name.startswith("@") - } - - -# //////////////////////////////////////////////////////////// -# Demo -# //////////////////////////////////////////////////////////// - - -def demo_grammar(): - from nltk.grammar import FeatureGrammar - - return FeatureGrammar.fromstring( - """ -S -> NP VP -PP -> Prep NP -NP -> NP PP -VP -> VP PP -VP -> Verb NP -VP -> Verb -NP -> Det[pl=?x] Noun[pl=?x] -NP -> "John" -NP -> "I" -Det -> "the" -Det -> "my" -Det[-pl] -> "a" -Noun[-pl] -> "dog" -Noun[-pl] -> "cookie" -Verb -> "ate" -Verb -> "saw" -Prep -> "with" -Prep -> "under" -""" - ) - - -def demo( - print_times=True, - print_grammar=True, - print_trees=True, - print_sentence=True, - trace=1, - parser=FeatureChartParser, - sent="I saw John with a dog with my cookie", -): - import sys - import time - - print() - grammar = demo_grammar() - if print_grammar: - print(grammar) - print() - print("*", parser.__name__) - if print_sentence: - print("Sentence:", sent) - tokens = sent.split() - t = perf_counter() - cp = parser(grammar, trace=trace) - chart = cp.chart_parse(tokens) - trees = list(chart.parses(grammar.start())) - if print_times: - print("Time: %s" % (perf_counter() - t)) - if print_trees: - for tree in trees: - print(tree) - else: - print("Nr trees:", len(trees)) - - -def run_profile(): - import profile - - profile.run("for i in range(1): demo()", "/tmp/profile.out") - import pstats - - p = pstats.Stats("/tmp/profile.out") - p.strip_dirs().sort_stats("time", "cum").print_stats(60) - p.strip_dirs().sort_stats("cum", "time").print_stats(60) - - -if __name__ == "__main__": - from nltk.data import load - - demo() - print() - grammar = load("grammars/book_grammars/feat0.fcfg") - cp = FeatureChartParser(grammar, trace=2) - sent = "Kim likes children" - tokens = sent.split() - trees = cp.parse(tokens) - for tree in trees: - print(tree) diff --git a/pipeline/nltk/parse/generate.py b/pipeline/nltk/parse/generate.py deleted file mode 100644 index fb2f4e9fa03ee09d5de2c25bf15d728033b577e2..0000000000000000000000000000000000000000 --- a/pipeline/nltk/parse/generate.py +++ /dev/null @@ -1,85 +0,0 @@ -# Natural Language Toolkit: Generating from a CFG -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# Peter Ljunglöf -# URL: -# For license information, see LICENSE.TXT -# - -import itertools -import sys - -from nltk.grammar import Nonterminal - - -def generate(grammar, start=None, depth=None, n=None): - """ - Generates an iterator of all sentences from a CFG. - - :param grammar: The Grammar used to generate sentences. - :param start: The Nonterminal from which to start generate sentences. - :param depth: The maximal depth of the generated tree. - :param n: The maximum number of sentences to return. - :return: An iterator of lists of terminal tokens. 
- """ - if not start: - start = grammar.start() - if depth is None: - depth = sys.maxsize - - iter = _generate_all(grammar, [start], depth) - - if n: - iter = itertools.islice(iter, n) - - return iter - - -def _generate_all(grammar, items, depth): - if items: - try: - for frag1 in _generate_one(grammar, items[0], depth): - for frag2 in _generate_all(grammar, items[1:], depth): - yield frag1 + frag2 - except RecursionError as error: - # Helpful error message while still showing the recursion stack. - raise RuntimeError( - "The grammar has rule(s) that yield infinite recursion!" - ) from error - else: - yield [] - - -def _generate_one(grammar, item, depth): - if depth > 0: - if isinstance(item, Nonterminal): - for prod in grammar.productions(lhs=item): - yield from _generate_all(grammar, prod.rhs(), depth - 1) - else: - yield [item] - - -demo_grammar = """ - S -> NP VP - NP -> Det N - PP -> P NP - VP -> 'slept' | 'saw' NP | 'walked' PP - Det -> 'the' | 'a' - N -> 'man' | 'park' | 'dog' - P -> 'in' | 'with' -""" - - -def demo(N=23): - from nltk.grammar import CFG - - print("Generating the first %d sentences for demo grammar:" % (N,)) - print(demo_grammar) - grammar = CFG.fromstring(demo_grammar) - for n, sent in enumerate(generate(grammar, n=N), 1): - print("%3d. %s" % (n, " ".join(sent))) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/parse/malt.py b/pipeline/nltk/parse/malt.py deleted file mode 100644 index 229e8242719dc4645763706b58363b546bc7e6ae..0000000000000000000000000000000000000000 --- a/pipeline/nltk/parse/malt.py +++ /dev/null @@ -1,393 +0,0 @@ -# Natural Language Toolkit: Interface to MaltParser -# -# Author: Dan Garrette -# Contributor: Liling Tan, Mustufain, osamamukhtar11 -# -# Copyright (C) 2001-2023 NLTK Project -# URL: -# For license information, see LICENSE.TXT - -import inspect -import os -import subprocess -import sys -import tempfile - -from nltk.data import ZipFilePathPointer -from nltk.internals import find_dir, find_file, find_jars_within_path -from nltk.parse.api import ParserI -from nltk.parse.dependencygraph import DependencyGraph -from nltk.parse.util import taggedsents_to_conll - - -def malt_regex_tagger(): - from nltk.tag import RegexpTagger - - _tagger = RegexpTagger( - [ - (r"\.$", "."), - (r"\,$", ","), - (r"\?$", "?"), # fullstop, comma, Qmark - (r"\($", "("), - (r"\)$", ")"), # round brackets - (r"\[$", "["), - (r"\]$", "]"), # square brackets - (r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers - (r"(The|the|A|a|An|an)$", "DT"), # articles - (r"(He|he|She|she|It|it|I|me|Me|You|you)$", "PRP"), # pronouns - (r"(His|his|Her|her|Its|its)$", "PRP$"), # possessive - (r"(my|Your|your|Yours|yours)$", "PRP$"), # possessive - (r"(on|On|in|In|at|At|since|Since)$", "IN"), # time prepopsitions - (r"(for|For|ago|Ago|before|Before)$", "IN"), # time prepopsitions - (r"(till|Till|until|Until)$", "IN"), # time prepopsitions - (r"(by|By|beside|Beside)$", "IN"), # space prepopsitions - (r"(under|Under|below|Below)$", "IN"), # space prepopsitions - (r"(over|Over|above|Above)$", "IN"), # space prepopsitions - (r"(across|Across|through|Through)$", "IN"), # space prepopsitions - (r"(into|Into|towards|Towards)$", "IN"), # space prepopsitions - (r"(onto|Onto|from|From)$", "IN"), # space prepopsitions - (r".*able$", "JJ"), # adjectives - (r".*ness$", "NN"), # nouns formed from adjectives - (r".*ly$", "RB"), # adverbs - (r".*s$", "NNS"), # plural nouns - (r".*ing$", "VBG"), # gerunds - (r".*ed$", "VBD"), # past tense verbs - (r".*", "NN"), # nouns (default) - ] 
- ) - return _tagger.tag - - -def find_maltparser(parser_dirname): - """ - A module to find MaltParser .jar file and its dependencies. - """ - if os.path.exists(parser_dirname): # If a full path is given. - _malt_dir = parser_dirname - else: # Try to find path to maltparser directory in environment variables. - _malt_dir = find_dir(parser_dirname, env_vars=("MALT_PARSER",)) - # Checks that that the found directory contains all the necessary .jar - malt_dependencies = ["", "", ""] - _malt_jars = set(find_jars_within_path(_malt_dir)) - _jars = {os.path.split(jar)[1] for jar in _malt_jars} - malt_dependencies = {"log4j.jar", "libsvm.jar", "liblinear-1.8.jar"} - - assert malt_dependencies.issubset(_jars) - assert any( - filter(lambda i: i.startswith("maltparser-") and i.endswith(".jar"), _jars) - ) - return list(_malt_jars) - - -def find_malt_model(model_filename): - """ - A module to find pre-trained MaltParser model. - """ - if model_filename is None: - return "malt_temp.mco" - elif os.path.exists(model_filename): # If a full path is given. - return model_filename - else: # Try to find path to malt model in environment variables. - return find_file(model_filename, env_vars=("MALT_MODEL",), verbose=False) - - -class MaltParser(ParserI): - """ - A class for dependency parsing with MaltParser. The input is the paths to: - - (optionally) a maltparser directory - - (optionally) the path to a pre-trained MaltParser .mco model file - - (optionally) the tagger to use for POS tagging before parsing - - (optionally) additional Java arguments - - Example: - >>> from nltk.parse import malt - >>> # With MALT_PARSER and MALT_MODEL environment set. - >>> mp = malt.MaltParser(model_filename='engmalt.linear-1.7.mco') # doctest: +SKIP - >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP - (shot I (elephant an) (in (pajamas my)) .) - >>> # Without MALT_PARSER and MALT_MODEL environment. - >>> mp = malt.MaltParser('/home/user/maltparser-1.9.2/', '/home/user/engmalt.linear-1.7.mco') # doctest: +SKIP - >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP - (shot I (elephant an) (in (pajamas my)) .) - """ - - def __init__( - self, - parser_dirname="", - model_filename=None, - tagger=None, - additional_java_args=None, - ): - """ - An interface for parsing with the Malt Parser. - - :param parser_dirname: The path to the maltparser directory that - contains the maltparser-1.x.jar - :type parser_dirname: str - :param model_filename: The name of the pre-trained model with .mco file - extension. If provided, training will not be required. - (see http://www.maltparser.org/mco/mco.html and - see http://www.patful.com/chalk/node/185) - :type model_filename: str - :param tagger: The tagger used to POS tag the raw string before - formatting to CONLL format. It should behave like `nltk.pos_tag` - :type tagger: function - :param additional_java_args: This is the additional Java arguments that - one can use when calling Maltparser, usually this is the heapsize - limits, e.g. `additional_java_args=['-Xmx1024m']` - (see https://goo.gl/mpDBvQ) - :type additional_java_args: list - """ - - # Find all the necessary jar files for MaltParser. - self.malt_jars = find_maltparser(parser_dirname) - # Initialize additional java arguments. - self.additional_java_args = ( - additional_java_args if additional_java_args is not None else [] - ) - # Initialize model. 
- self.model = find_malt_model(model_filename) - self._trained = self.model != "malt_temp.mco" - # Set the working_dir parameters i.e. `-w` from MaltParser's option. - self.working_dir = tempfile.gettempdir() - # Initialize POS tagger. - self.tagger = tagger if tagger is not None else malt_regex_tagger() - - def parse_tagged_sents(self, sentences, verbose=False, top_relation_label="null"): - """ - Use MaltParser to parse multiple POS tagged sentences. Takes multiple - sentences where each sentence is a list of (word, tag) tuples. - The sentences must have already been tokenized and tagged. - - :param sentences: Input sentences to parse - :type sentence: list(list(tuple(str, str))) - :return: iter(iter(``DependencyGraph``)) the dependency graph - representation of each sentence - """ - if not self._trained: - raise Exception("Parser has not been trained. Call train() first.") - - with tempfile.NamedTemporaryFile( - prefix="malt_input.conll.", dir=self.working_dir, mode="w", delete=False - ) as input_file: - with tempfile.NamedTemporaryFile( - prefix="malt_output.conll.", - dir=self.working_dir, - mode="w", - delete=False, - ) as output_file: - # Convert list of sentences to CONLL format. - for line in taggedsents_to_conll(sentences): - input_file.write(str(line)) - input_file.close() - - # Generate command to run maltparser. - cmd = self.generate_malt_command( - input_file.name, output_file.name, mode="parse" - ) - - # This is a maltparser quirk, it needs to be run - # where the model file is. otherwise it goes into an awkward - # missing .jars or strange -w working_dir problem. - _current_path = os.getcwd() # Remembers the current path. - try: # Change to modelfile path - os.chdir(os.path.split(self.model)[0]) - except: - pass - ret = self._execute(cmd, verbose) # Run command. - os.chdir(_current_path) # Change back to current path. - - if ret != 0: - raise Exception( - "MaltParser parsing (%s) failed with exit " - "code %d" % (" ".join(cmd), ret) - ) - - # Must return iter(iter(Tree)) - with open(output_file.name) as infile: - for tree_str in infile.read().split("\n\n"): - yield ( - iter( - [ - DependencyGraph( - tree_str, top_relation_label=top_relation_label - ) - ] - ) - ) - - os.remove(input_file.name) - os.remove(output_file.name) - - def parse_sents(self, sentences, verbose=False, top_relation_label="null"): - """ - Use MaltParser to parse multiple sentences. - Takes a list of sentences, where each sentence is a list of words. - Each sentence will be automatically tagged with this - MaltParser instance's tagger. - - :param sentences: Input sentences to parse - :type sentence: list(list(str)) - :return: iter(DependencyGraph) - """ - tagged_sentences = (self.tagger(sentence) for sentence in sentences) - return self.parse_tagged_sents( - tagged_sentences, verbose, top_relation_label=top_relation_label - ) - - def generate_malt_command(self, inputfilename, outputfilename=None, mode=None): - """ - This function generates the maltparser command use at the terminal. 
- - :param inputfilename: path to the input file - :type inputfilename: str - :param outputfilename: path to the output file - :type outputfilename: str - """ - - cmd = ["java"] - cmd += self.additional_java_args # Adds additional java arguments - # Joins classpaths with ";" if on Windows and on Linux/Mac use ":" - classpaths_separator = ";" if sys.platform.startswith("win") else ":" - cmd += [ - "-cp", - classpaths_separator.join(self.malt_jars), - ] # Adds classpaths for jars - cmd += ["org.maltparser.Malt"] # Adds the main function. - - # Adds the model file. - if os.path.exists(self.model): # when parsing - cmd += ["-c", os.path.split(self.model)[-1]] - else: # when learning - cmd += ["-c", self.model] - - cmd += ["-i", inputfilename] - if mode == "parse": - cmd += ["-o", outputfilename] - cmd += ["-m", mode] # mode use to generate parses. - return cmd - - @staticmethod - def _execute(cmd, verbose=False): - output = None if verbose else subprocess.PIPE - p = subprocess.Popen(cmd, stdout=output, stderr=output) - return p.wait() - - def train(self, depgraphs, verbose=False): - """ - Train MaltParser from a list of ``DependencyGraph`` objects - - :param depgraphs: list of ``DependencyGraph`` objects for training input data - :type depgraphs: DependencyGraph - """ - - # Write the conll_str to malt_train.conll file in /tmp/ - with tempfile.NamedTemporaryFile( - prefix="malt_train.conll.", dir=self.working_dir, mode="w", delete=False - ) as input_file: - input_str = "\n".join(dg.to_conll(10) for dg in depgraphs) - input_file.write(str(input_str)) - # Trains the model with the malt_train.conll - self.train_from_file(input_file.name, verbose=verbose) - # Removes the malt_train.conll once training finishes. - os.remove(input_file.name) - - def train_from_file(self, conll_file, verbose=False): - """ - Train MaltParser from a file - :param conll_file: str for the filename of the training input data - :type conll_file: str - """ - - # If conll_file is a ZipFilePathPointer, - # then we need to do some extra massaging - if isinstance(conll_file, ZipFilePathPointer): - with tempfile.NamedTemporaryFile( - prefix="malt_train.conll.", dir=self.working_dir, mode="w", delete=False - ) as input_file: - with conll_file.open() as conll_input_file: - conll_str = conll_input_file.read() - input_file.write(str(conll_str)) - return self.train_from_file(input_file.name, verbose=verbose) - - # Generate command to run maltparser. - cmd = self.generate_malt_command(conll_file, mode="learn") - ret = self._execute(cmd, verbose) - if ret != 0: - raise Exception( - "MaltParser training (%s) failed with exit " - "code %d" % (" ".join(cmd), ret) - ) - self._trained = True - - -if __name__ == "__main__": - """ - A demonstration function to show how NLTK users can use the malt parser API. - - >>> from nltk import pos_tag - >>> assert 'MALT_PARSER' in os.environ, str( - ... "Please set MALT_PARSER in your global environment, e.g.:\n" - ... "$ export MALT_PARSER='/home/user/maltparser-1.9.2/'") - >>> - >>> assert 'MALT_MODEL' in os.environ, str( - ... "Please set MALT_MODEL in your global environment, e.g.:\n" - ... "$ export MALT_MODEL='/home/user/engmalt.linear-1.7.mco'") - >>> - >>> _dg1_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n" - ... "2 sees _ VB _ _ 0 ROOT _ _\n" - ... "3 a _ DT _ _ 4 SPEC _ _\n" - ... "4 dog _ NN _ _ 2 OBJ _ _\n" - ... "5 . _ . _ _ 2 PUNCT _ _\n") - >>> - >>> - >>> _dg2_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n" - ... "2 walks _ VB _ _ 0 ROOT _ _\n" - ... "3 . _ . 
_ _ 2 PUNCT _ _\n") - >>> dg1 = DependencyGraph(_dg1_str) - >>> dg2 = DependencyGraph(_dg2_str) - >>> # Initialize a MaltParser object - >>> mp = MaltParser() - >>> - >>> # Trains a model. - >>> mp.train([dg1,dg2], verbose=False) - >>> sent1 = ['John','sees','Mary', '.'] - >>> sent2 = ['John', 'walks', 'a', 'dog', '.'] - >>> - >>> # Parse a single sentence. - >>> parsed_sent1 = mp.parse_one(sent1) - >>> parsed_sent2 = mp.parse_one(sent2) - >>> print(parsed_sent1.tree()) - (sees John Mary .) - >>> print(parsed_sent2.tree()) - (walks John (dog a) .) - >>> - >>> # Parsing multiple sentences. - >>> sentences = [sent1,sent2] - >>> parsed_sents = mp.parse_sents(sentences) - >>> print(next(next(parsed_sents)).tree()) - (sees John Mary .) - >>> print(next(next(parsed_sents)).tree()) - (walks John (dog a) .) - >>> - >>> # Initialize a MaltParser object with an English pre-trained model. - >>> parser_dirname = 'maltparser-1.9.2' - >>> model_name = 'engmalt.linear-1.7.mco' - >>> mp = MaltParser(parser_dirname=parser_dirname, model_filename=model_name, tagger=pos_tag) - >>> sent1 = 'I shot an elephant in my pajamas .'.split() - >>> sent2 = 'Time flies like banana .'.split() - >>> # Parse a single sentence. - >>> print(mp.parse_one(sent1).tree()) - (shot I (elephant an) (in (pajamas my)) .) - # Parsing multiple sentences - >>> sentences = [sent1,sent2] - >>> parsed_sents = mp.parse_sents(sentences) - >>> print(next(next(parsed_sents)).tree()) - (shot I (elephant an) (in (pajamas my)) .) - >>> print(next(next(parsed_sents)).tree()) - (flies Time (like banana) .) - """ - - import doctest - - doctest.testmod() diff --git a/pipeline/nltk/parse/nonprojectivedependencyparser.py b/pipeline/nltk/parse/nonprojectivedependencyparser.py deleted file mode 100644 index b96f996cf63b4d3e093994d6319c8fb9fb91569a..0000000000000000000000000000000000000000 --- a/pipeline/nltk/parse/nonprojectivedependencyparser.py +++ /dev/null @@ -1,772 +0,0 @@ -# Natural Language Toolkit: Dependency Grammars -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Jason Narad -# -# URL: -# For license information, see LICENSE.TXT -# - -import logging -import math - -from nltk.parse.dependencygraph import DependencyGraph - -logger = logging.getLogger(__name__) - -################################################################# -# DependencyScorerI - Interface for Graph-Edge Weight Calculation -################################################################# - - -class DependencyScorerI: - """ - A scorer for calculated the weights on the edges of a weighted - dependency graph. This is used by a - ``ProbabilisticNonprojectiveParser`` to initialize the edge - weights of a ``DependencyGraph``. While typically this would be done - by training a binary classifier, any class that can return a - multidimensional list representation of the edge weights can - implement this interface. As such, it has no necessary - fields. - """ - - def __init__(self): - if self.__class__ == DependencyScorerI: - raise TypeError("DependencyScorerI is an abstract interface") - - def train(self, graphs): - """ - :type graphs: list(DependencyGraph) - :param graphs: A list of dependency graphs to train the scorer. - Typically the edges present in the graphs can be used as - positive training examples, and the edges not present as negative - examples. - """ - raise NotImplementedError() - - def score(self, graph): - """ - :type graph: DependencyGraph - :param graph: A dependency graph whose set of edges need to be - scored. - :rtype: A three-dimensional list of numbers. 
- :return: The score is returned in a multidimensional(3) list, such - that the outer-dimension refers to the head, and the - inner-dimension refers to the dependencies. For instance, - scores[0][1] would reference the list of scores corresponding to - arcs from node 0 to node 1. The node's 'address' field can be used - to determine its number identification. - - For further illustration, a score list corresponding to Fig.2 of - Keith Hall's 'K-best Spanning Tree Parsing' paper:: - - scores = [[[], [5], [1], [1]], - [[], [], [11], [4]], - [[], [10], [], [5]], - [[], [8], [8], []]] - - When used in conjunction with a MaxEntClassifier, each score would - correspond to the confidence of a particular edge being classified - with the positive training examples. - """ - raise NotImplementedError() - - -################################################################# -# NaiveBayesDependencyScorer -################################################################# - - -class NaiveBayesDependencyScorer(DependencyScorerI): - """ - A dependency scorer built around a MaxEnt classifier. In this - particular class that classifier is a ``NaiveBayesClassifier``. - It uses head-word, head-tag, child-word, and child-tag features - for classification. - - >>> from nltk.parse.dependencygraph import DependencyGraph, conll_data2 - - >>> graphs = [DependencyGraph(entry) for entry in conll_data2.split('\\n\\n') if entry] - >>> npp = ProbabilisticNonprojectiveParser() - >>> npp.train(graphs, NaiveBayesDependencyScorer()) - >>> parses = npp.parse(['Cathy', 'zag', 'hen', 'zwaaien', '.'], ['N', 'V', 'Pron', 'Adj', 'N', 'Punc']) - >>> len(list(parses)) - 1 - - """ - - def __init__(self): - pass # Do nothing without throwing error - - def train(self, graphs): - """ - Trains a ``NaiveBayesClassifier`` using the edges present in - graphs list as positive examples, the edges not present as - negative examples. Uses a feature vector of head-word, - head-tag, child-word, and child-tag. - - :type graphs: list(DependencyGraph) - :param graphs: A list of dependency graphs to train the scorer. - """ - - from nltk.classify import NaiveBayesClassifier - - # Create training labeled training examples - labeled_examples = [] - for graph in graphs: - for head_node in graph.nodes.values(): - for child_index, child_node in graph.nodes.items(): - if child_index in head_node["deps"]: - label = "T" - else: - label = "F" - labeled_examples.append( - ( - dict( - a=head_node["word"], - b=head_node["tag"], - c=child_node["word"], - d=child_node["tag"], - ), - label, - ) - ) - - self.classifier = NaiveBayesClassifier.train(labeled_examples) - - def score(self, graph): - """ - Converts the graph into a feature-based representation of - each edge, and then assigns a score to each based on the - confidence of the classifier in assigning it to the - positive label. Scores are returned in a multidimensional list. - - :type graph: DependencyGraph - :param graph: A dependency graph to score. - :rtype: 3 dimensional list - :return: Edge scores for the graph parameter. 
- """ - # Convert graph to feature representation - edges = [] - for head_node in graph.nodes.values(): - for child_node in graph.nodes.values(): - edges.append( - dict( - a=head_node["word"], - b=head_node["tag"], - c=child_node["word"], - d=child_node["tag"], - ) - ) - - # Score edges - edge_scores = [] - row = [] - count = 0 - for pdist in self.classifier.prob_classify_many(edges): - logger.debug("%.4f %.4f", pdist.prob("T"), pdist.prob("F")) - # smoothing in case the probability = 0 - row.append([math.log(pdist.prob("T") + 0.00000000001)]) - count += 1 - if count == len(graph.nodes): - edge_scores.append(row) - row = [] - count = 0 - return edge_scores - - -################################################################# -# A Scorer for Demo Purposes -################################################################# -# A short class necessary to show parsing example from paper -class DemoScorer(DependencyScorerI): - def train(self, graphs): - print("Training...") - - def score(self, graph): - # scores for Keith Hall 'K-best Spanning Tree Parsing' paper - return [ - [[], [5], [1], [1]], - [[], [], [11], [4]], - [[], [10], [], [5]], - [[], [8], [8], []], - ] - - -################################################################# -# Non-Projective Probabilistic Parsing -################################################################# - - -class ProbabilisticNonprojectiveParser: - """A probabilistic non-projective dependency parser. - - Nonprojective dependencies allows for "crossing branches" in the parse tree - which is necessary for representing particular linguistic phenomena, or even - typical parses in some languages. This parser follows the MST parsing - algorithm, outlined in McDonald(2005), which likens the search for the best - non-projective parse to finding the maximum spanning tree in a weighted - directed graph. - - >>> class Scorer(DependencyScorerI): - ... def train(self, graphs): - ... pass - ... - ... def score(self, graph): - ... return [ - ... [[], [5], [1], [1]], - ... [[], [], [11], [4]], - ... [[], [10], [], [5]], - ... [[], [8], [8], []], - ... ] - - - >>> npp = ProbabilisticNonprojectiveParser() - >>> npp.train([], Scorer()) - - >>> parses = npp.parse(['v1', 'v2', 'v3'], [None, None, None]) - >>> len(list(parses)) - 1 - - Rule based example - - >>> from nltk.grammar import DependencyGrammar - - >>> grammar = DependencyGrammar.fromstring(''' - ... 'taught' -> 'play' | 'man' - ... 'man' -> 'the' | 'in' - ... 'in' -> 'corner' - ... 'corner' -> 'the' - ... 'play' -> 'golf' | 'dachshund' | 'to' - ... 'dachshund' -> 'his' - ... ''') - - >>> ndp = NonprojectiveDependencyParser(grammar) - >>> parses = ndp.parse(['the', 'man', 'in', 'the', 'corner', 'taught', 'his', 'dachshund', 'to', 'play', 'golf']) - >>> len(list(parses)) - 4 - - """ - - def __init__(self): - """ - Creates a new non-projective parser. - """ - logging.debug("initializing prob. nonprojective...") - - def train(self, graphs, dependency_scorer): - """ - Trains a ``DependencyScorerI`` from a set of ``DependencyGraph`` objects, - and establishes this as the parser's scorer. This is used to - initialize the scores on a ``DependencyGraph`` during the parsing - procedure. - - :type graphs: list(DependencyGraph) - :param graphs: A list of dependency graphs to train the scorer. - :type dependency_scorer: DependencyScorerI - :param dependency_scorer: A scorer which implements the - ``DependencyScorerI`` interface. 
- """ - self._scorer = dependency_scorer - self._scorer.train(graphs) - - def initialize_edge_scores(self, graph): - """ - Assigns a score to every edge in the ``DependencyGraph`` graph. - These scores are generated via the parser's scorer which - was assigned during the training process. - - :type graph: DependencyGraph - :param graph: A dependency graph to assign scores to. - """ - self.scores = self._scorer.score(graph) - - def collapse_nodes(self, new_node, cycle_path, g_graph, b_graph, c_graph): - """ - Takes a list of nodes that have been identified to belong to a cycle, - and collapses them into on larger node. The arcs of all nodes in - the graph must be updated to account for this. - - :type new_node: Node. - :param new_node: A Node (Dictionary) to collapse the cycle nodes into. - :type cycle_path: A list of integers. - :param cycle_path: A list of node addresses, each of which is in the cycle. - :type g_graph, b_graph, c_graph: DependencyGraph - :param g_graph, b_graph, c_graph: Graphs which need to be updated. - """ - logger.debug("Collapsing nodes...") - # Collapse all cycle nodes into v_n+1 in G_Graph - for cycle_node_index in cycle_path: - g_graph.remove_by_address(cycle_node_index) - g_graph.add_node(new_node) - g_graph.redirect_arcs(cycle_path, new_node["address"]) - - def update_edge_scores(self, new_node, cycle_path): - """ - Updates the edge scores to reflect a collapse operation into - new_node. - - :type new_node: A Node. - :param new_node: The node which cycle nodes are collapsed into. - :type cycle_path: A list of integers. - :param cycle_path: A list of node addresses that belong to the cycle. - """ - logger.debug("cycle %s", cycle_path) - - cycle_path = self.compute_original_indexes(cycle_path) - - logger.debug("old cycle %s", cycle_path) - logger.debug("Prior to update: %s", self.scores) - - for i, row in enumerate(self.scores): - for j, column in enumerate(self.scores[i]): - logger.debug(self.scores[i][j]) - if j in cycle_path and i not in cycle_path and self.scores[i][j]: - subtract_val = self.compute_max_subtract_score(j, cycle_path) - - logger.debug("%s - %s", self.scores[i][j], subtract_val) - - new_vals = [] - for cur_val in self.scores[i][j]: - new_vals.append(cur_val - subtract_val) - - self.scores[i][j] = new_vals - - for i, row in enumerate(self.scores): - for j, cell in enumerate(self.scores[i]): - if i in cycle_path and j in cycle_path: - self.scores[i][j] = [] - - logger.debug("After update: %s", self.scores) - - def compute_original_indexes(self, new_indexes): - """ - As nodes are collapsed into others, they are replaced - by the new node in the graph, but it's still necessary - to keep track of what these original nodes were. This - takes a list of node addresses and replaces any collapsed - node addresses with their original addresses. - - :type new_indexes: A list of integers. - :param new_indexes: A list of node addresses to check for - subsumed nodes. - """ - swapped = True - while swapped: - originals = [] - swapped = False - for new_index in new_indexes: - if new_index in self.inner_nodes: - for old_val in self.inner_nodes[new_index]: - if old_val not in originals: - originals.append(old_val) - swapped = True - else: - originals.append(new_index) - new_indexes = originals - return new_indexes - - def compute_max_subtract_score(self, column_index, cycle_indexes): - """ - When updating scores the score of the highest-weighted incoming - arc is subtracted upon collapse. This returns the correct - amount to subtract from that edge. 
- - :type column_index: integer. - :param column_index: A index representing the column of incoming arcs - to a particular node being updated - :type cycle_indexes: A list of integers. - :param cycle_indexes: Only arcs from cycle nodes are considered. This - is a list of such nodes addresses. - """ - max_score = -100000 - for row_index in cycle_indexes: - for subtract_val in self.scores[row_index][column_index]: - if subtract_val > max_score: - max_score = subtract_val - return max_score - - def best_incoming_arc(self, node_index): - """ - Returns the source of the best incoming arc to the - node with address: node_index - - :type node_index: integer. - :param node_index: The address of the 'destination' node, - the node that is arced to. - """ - originals = self.compute_original_indexes([node_index]) - logger.debug("originals: %s", originals) - - max_arc = None - max_score = None - for row_index in range(len(self.scores)): - for col_index in range(len(self.scores[row_index])): - if col_index in originals and ( - max_score is None or self.scores[row_index][col_index] > max_score - ): - max_score = self.scores[row_index][col_index] - max_arc = row_index - logger.debug("%s, %s", row_index, col_index) - - logger.debug(max_score) - - for key in self.inner_nodes: - replaced_nodes = self.inner_nodes[key] - if max_arc in replaced_nodes: - return key - - return max_arc - - def original_best_arc(self, node_index): - originals = self.compute_original_indexes([node_index]) - max_arc = None - max_score = None - max_orig = None - for row_index in range(len(self.scores)): - for col_index in range(len(self.scores[row_index])): - if col_index in originals and ( - max_score is None or self.scores[row_index][col_index] > max_score - ): - max_score = self.scores[row_index][col_index] - max_arc = row_index - max_orig = col_index - return [max_arc, max_orig] - - def parse(self, tokens, tags): - """ - Parses a list of tokens in accordance to the MST parsing algorithm - for non-projective dependency parses. Assumes that the tokens to - be parsed have already been tagged and those tags are provided. Various - scoring methods can be used by implementing the ``DependencyScorerI`` - interface and passing it to the training algorithm. - - :type tokens: list(str) - :param tokens: A list of words or punctuation to be parsed. - :type tags: list(str) - :param tags: A list of tags corresponding by index to the words in the tokens list. - :return: An iterator of non-projective parses. 
- :rtype: iter(DependencyGraph) - """ - self.inner_nodes = {} - - # Initialize g_graph - g_graph = DependencyGraph() - for index, token in enumerate(tokens): - g_graph.nodes[index + 1].update( - {"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1} - ) - - # Fully connect non-root nodes in g_graph - g_graph.connect_graph() - original_graph = DependencyGraph() - for index, token in enumerate(tokens): - original_graph.nodes[index + 1].update( - {"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1} - ) - - b_graph = DependencyGraph() - c_graph = DependencyGraph() - - for index, token in enumerate(tokens): - c_graph.nodes[index + 1].update( - {"word": token, "tag": tags[index], "rel": "NTOP", "address": index + 1} - ) - - # Assign initial scores to g_graph edges - self.initialize_edge_scores(g_graph) - logger.debug(self.scores) - # Initialize a list of unvisited vertices (by node address) - unvisited_vertices = [vertex["address"] for vertex in c_graph.nodes.values()] - # Iterate over unvisited vertices - nr_vertices = len(tokens) - betas = {} - while unvisited_vertices: - # Mark current node as visited - current_vertex = unvisited_vertices.pop(0) - logger.debug("current_vertex: %s", current_vertex) - # Get corresponding node n_i to vertex v_i - current_node = g_graph.get_by_address(current_vertex) - logger.debug("current_node: %s", current_node) - # Get best in-edge node b for current node - best_in_edge = self.best_incoming_arc(current_vertex) - betas[current_vertex] = self.original_best_arc(current_vertex) - logger.debug("best in arc: %s --> %s", best_in_edge, current_vertex) - # b_graph = Union(b_graph, b) - for new_vertex in [current_vertex, best_in_edge]: - b_graph.nodes[new_vertex].update( - {"word": "TEMP", "rel": "NTOP", "address": new_vertex} - ) - b_graph.add_arc(best_in_edge, current_vertex) - # Beta(current node) = b - stored for parse recovery - # If b_graph contains a cycle, collapse it - cycle_path = b_graph.contains_cycle() - if cycle_path: - # Create a new node v_n+1 with address = len(nodes) + 1 - new_node = {"word": "NONE", "rel": "NTOP", "address": nr_vertices + 1} - # c_graph = Union(c_graph, v_n+1) - c_graph.add_node(new_node) - # Collapse all nodes in cycle C into v_n+1 - self.update_edge_scores(new_node, cycle_path) - self.collapse_nodes(new_node, cycle_path, g_graph, b_graph, c_graph) - for cycle_index in cycle_path: - c_graph.add_arc(new_node["address"], cycle_index) - # self.replaced_by[cycle_index] = new_node['address'] - - self.inner_nodes[new_node["address"]] = cycle_path - - # Add v_n+1 to list of unvisited vertices - unvisited_vertices.insert(0, nr_vertices + 1) - - # increment # of nodes counter - nr_vertices += 1 - - # Remove cycle nodes from b_graph; B = B - cycle c - for cycle_node_address in cycle_path: - b_graph.remove_by_address(cycle_node_address) - - logger.debug("g_graph: %s", g_graph) - logger.debug("b_graph: %s", b_graph) - logger.debug("c_graph: %s", c_graph) - logger.debug("Betas: %s", betas) - logger.debug("replaced nodes %s", self.inner_nodes) - - # Recover parse tree - logger.debug("Final scores: %s", self.scores) - - logger.debug("Recovering parse...") - for i in range(len(tokens) + 1, nr_vertices + 1): - betas[betas[i][1]] = betas[i] - - logger.debug("Betas: %s", betas) - for node in original_graph.nodes.values(): - # TODO: It's dangerous to assume that deps it a dictionary - # because it's a default dictionary. 
Ideally, here we should not - # be concerned how dependencies are stored inside of a dependency - # graph. - node["deps"] = {} - for i in range(1, len(tokens) + 1): - original_graph.add_arc(betas[i][0], betas[i][1]) - - logger.debug("Done.") - yield original_graph - - -################################################################# -# Rule-based Non-Projective Parser -################################################################# - - -class NonprojectiveDependencyParser: - """ - A non-projective, rule-based, dependency parser. This parser - will return the set of all possible non-projective parses based on - the word-to-word relations defined in the parser's dependency - grammar, and will allow the branches of the parse tree to cross - in order to capture a variety of linguistic phenomena that a - projective parser will not. - """ - - def __init__(self, dependency_grammar): - """ - Creates a new ``NonprojectiveDependencyParser``. - - :param dependency_grammar: a grammar of word-to-word relations. - :type dependency_grammar: DependencyGrammar - """ - self._grammar = dependency_grammar - - def parse(self, tokens): - """ - Parses the input tokens with respect to the parser's grammar. Parsing - is accomplished by representing the search-space of possible parses as - a fully-connected directed graph. Arcs that would lead to ungrammatical - parses are removed and a lattice is constructed of length n, where n is - the number of input tokens, to represent all possible grammatical - traversals. All possible paths through the lattice are then enumerated - to produce the set of non-projective parses. - - param tokens: A list of tokens to parse. - type tokens: list(str) - return: An iterator of non-projective parses. - rtype: iter(DependencyGraph) - """ - # Create graph representation of tokens - self._graph = DependencyGraph() - - for index, token in enumerate(tokens): - self._graph.nodes[index] = { - "word": token, - "deps": [], - "rel": "NTOP", - "address": index, - } - - for head_node in self._graph.nodes.values(): - deps = [] - for dep_node in self._graph.nodes.values(): - if ( - self._grammar.contains(head_node["word"], dep_node["word"]) - and head_node["word"] != dep_node["word"] - ): - deps.append(dep_node["address"]) - head_node["deps"] = deps - - # Create lattice of possible heads - roots = [] - possible_heads = [] - for i, word in enumerate(tokens): - heads = [] - for j, head in enumerate(tokens): - if (i != j) and self._grammar.contains(head, word): - heads.append(j) - if len(heads) == 0: - roots.append(i) - possible_heads.append(heads) - - # Set roots to attempt - if len(roots) < 2: - if len(roots) == 0: - for i in range(len(tokens)): - roots.append(i) - - # Traverse lattice - analyses = [] - for _ in roots: - stack = [] - analysis = [[] for i in range(len(possible_heads))] - i = 0 - forward = True - while i >= 0: - if forward: - if len(possible_heads[i]) == 1: - analysis[i] = possible_heads[i][0] - elif len(possible_heads[i]) == 0: - analysis[i] = -1 - else: - head = possible_heads[i].pop() - analysis[i] = head - stack.append([i, head]) - if not forward: - index_on_stack = False - for stack_item in stack: - if stack_item[0] == i: - index_on_stack = True - orig_length = len(possible_heads[i]) - - if index_on_stack and orig_length == 0: - for j in range(len(stack) - 1, -1, -1): - stack_item = stack[j] - if stack_item[0] == i: - possible_heads[i].append(stack.pop(j)[1]) - - elif index_on_stack and orig_length > 0: - head = possible_heads[i].pop() - analysis[i] = head - stack.append([i, 
head]) - forward = True - - if i + 1 == len(possible_heads): - analyses.append(analysis[:]) - forward = False - if forward: - i += 1 - else: - i -= 1 - - # Filter parses - # ensure 1 root, every thing has 1 head - for analysis in analyses: - if analysis.count(-1) > 1: - # there are several root elements! - continue - - graph = DependencyGraph() - graph.root = graph.nodes[analysis.index(-1) + 1] - - for address, (token, head_index) in enumerate( - zip(tokens, analysis), start=1 - ): - head_address = head_index + 1 - - node = graph.nodes[address] - node.update({"word": token, "address": address}) - - if head_address == 0: - rel = "ROOT" - else: - rel = "" - graph.nodes[head_index + 1]["deps"][rel].append(address) - - # TODO: check for cycles - yield graph - - -################################################################# -# Demos -################################################################# - - -def demo(): - # hall_demo() - nonprojective_conll_parse_demo() - rule_based_demo() - - -def hall_demo(): - npp = ProbabilisticNonprojectiveParser() - npp.train([], DemoScorer()) - for parse_graph in npp.parse(["v1", "v2", "v3"], [None, None, None]): - print(parse_graph) - - -def nonprojective_conll_parse_demo(): - from nltk.parse.dependencygraph import conll_data2 - - graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry] - npp = ProbabilisticNonprojectiveParser() - npp.train(graphs, NaiveBayesDependencyScorer()) - for parse_graph in npp.parse( - ["Cathy", "zag", "hen", "zwaaien", "."], ["N", "V", "Pron", "Adj", "N", "Punc"] - ): - print(parse_graph) - - -def rule_based_demo(): - from nltk.grammar import DependencyGrammar - - grammar = DependencyGrammar.fromstring( - """ - 'taught' -> 'play' | 'man' - 'man' -> 'the' | 'in' - 'in' -> 'corner' - 'corner' -> 'the' - 'play' -> 'golf' | 'dachshund' | 'to' - 'dachshund' -> 'his' - """ - ) - print(grammar) - ndp = NonprojectiveDependencyParser(grammar) - graphs = ndp.parse( - [ - "the", - "man", - "in", - "the", - "corner", - "taught", - "his", - "dachshund", - "to", - "play", - "golf", - ] - ) - print("Graphs:") - for graph in graphs: - print(graph) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/parse/pchart.py b/pipeline/nltk/parse/pchart.py deleted file mode 100644 index 319655d023a462c0c6c7ac087746dc77d46b7949..0000000000000000000000000000000000000000 --- a/pipeline/nltk/parse/pchart.py +++ /dev/null @@ -1,579 +0,0 @@ -# Natural Language Toolkit: Probabilistic Chart Parsers -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird -# URL: -# For license information, see LICENSE.TXT - -""" -Classes and interfaces for associating probabilities with tree -structures that represent the internal organization of a text. The -probabilistic parser module defines ``BottomUpProbabilisticChartParser``. - -``BottomUpProbabilisticChartParser`` is an abstract class that implements -a bottom-up chart parser for ``PCFG`` grammars. It maintains a queue of edges, -and adds them to the chart one at a time. The ordering of this queue -is based on the probabilities associated with the edges, allowing the -parser to expand more likely edges before less likely ones. Each -subclass implements a different queue ordering, producing different -search strategies. Currently the following subclasses are defined: - - - ``InsideChartParser`` searches edges in decreasing order of - their trees' inside probabilities. - - ``RandomChartParser`` searches edges in random order. 
- - ``LongestChartParser`` searches edges in decreasing order of their - location's length. - -The ``BottomUpProbabilisticChartParser`` constructor has an optional -argument beam_size. If non-zero, this controls the size of the beam -(aka the edge queue). This option is most useful with InsideChartParser. -""" - -##////////////////////////////////////////////////////// -## Bottom-Up PCFG Chart Parser -##////////////////////////////////////////////////////// - -# [XX] This might not be implemented quite right -- it would be better -# to associate probabilities with child pointer lists. - -import random -from functools import reduce - -from nltk.grammar import PCFG, Nonterminal -from nltk.parse.api import ParserI -from nltk.parse.chart import AbstractChartRule, Chart, LeafEdge, TreeEdge -from nltk.tree import ProbabilisticTree, Tree - - -# Probabilistic edges -class ProbabilisticLeafEdge(LeafEdge): - def prob(self): - return 1.0 - - -class ProbabilisticTreeEdge(TreeEdge): - def __init__(self, prob, *args, **kwargs): - TreeEdge.__init__(self, *args, **kwargs) - self._prob = prob - # two edges with different probabilities are not equal. - self._comparison_key = (self._comparison_key, prob) - - def prob(self): - return self._prob - - @staticmethod - def from_production(production, index, p): - return ProbabilisticTreeEdge( - p, (index, index), production.lhs(), production.rhs(), 0 - ) - - -# Rules using probabilistic edges -class ProbabilisticBottomUpInitRule(AbstractChartRule): - NUM_EDGES = 0 - - def apply(self, chart, grammar): - for index in range(chart.num_leaves()): - new_edge = ProbabilisticLeafEdge(chart.leaf(index), index) - if chart.insert(new_edge, ()): - yield new_edge - - -class ProbabilisticBottomUpPredictRule(AbstractChartRule): - NUM_EDGES = 1 - - def apply(self, chart, grammar, edge): - if edge.is_incomplete(): - return - for prod in grammar.productions(): - if edge.lhs() == prod.rhs()[0]: - new_edge = ProbabilisticTreeEdge.from_production( - prod, edge.start(), prod.prob() - ) - if chart.insert(new_edge, ()): - yield new_edge - - -class ProbabilisticFundamentalRule(AbstractChartRule): - NUM_EDGES = 2 - - def apply(self, chart, grammar, left_edge, right_edge): - # Make sure the rule is applicable. - if not ( - left_edge.end() == right_edge.start() - and left_edge.nextsym() == right_edge.lhs() - and left_edge.is_incomplete() - and right_edge.is_complete() - ): - return - - # Construct the new edge. - p = left_edge.prob() * right_edge.prob() - new_edge = ProbabilisticTreeEdge( - p, - span=(left_edge.start(), right_edge.end()), - lhs=left_edge.lhs(), - rhs=left_edge.rhs(), - dot=left_edge.dot() + 1, - ) - - # Add it to the chart, with appropriate child pointers. - changed_chart = False - for cpl1 in chart.child_pointer_lists(left_edge): - if chart.insert(new_edge, cpl1 + (right_edge,)): - changed_chart = True - - # If we changed the chart, then generate the edge. 
- if changed_chart: - yield new_edge - - -class SingleEdgeProbabilisticFundamentalRule(AbstractChartRule): - NUM_EDGES = 1 - - _fundamental_rule = ProbabilisticFundamentalRule() - - def apply(self, chart, grammar, edge1): - fr = self._fundamental_rule - if edge1.is_incomplete(): - # edge1 = left_edge; edge2 = right_edge - for edge2 in chart.select( - start=edge1.end(), is_complete=True, lhs=edge1.nextsym() - ): - yield from fr.apply(chart, grammar, edge1, edge2) - else: - # edge2 = left_edge; edge1 = right_edge - for edge2 in chart.select( - end=edge1.start(), is_complete=False, nextsym=edge1.lhs() - ): - yield from fr.apply(chart, grammar, edge2, edge1) - - def __str__(self): - return "Fundamental Rule" - - -class BottomUpProbabilisticChartParser(ParserI): - """ - An abstract bottom-up parser for ``PCFG`` grammars that uses a ``Chart`` to - record partial results. ``BottomUpProbabilisticChartParser`` maintains - a queue of edges that can be added to the chart. This queue is - initialized with edges for each token in the text that is being - parsed. ``BottomUpProbabilisticChartParser`` inserts these edges into - the chart one at a time, starting with the most likely edges, and - proceeding to less likely edges. For each edge that is added to - the chart, it may become possible to insert additional edges into - the chart; these are added to the queue. This process continues - until enough complete parses have been generated, or until the - queue is empty. - - The sorting order for the queue is not specified by - ``BottomUpProbabilisticChartParser``. Different sorting orders will - result in different search strategies. The sorting order for the - queue is defined by the method ``sort_queue``; subclasses are required - to provide a definition for this method. - - :type _grammar: PCFG - :ivar _grammar: The grammar used to parse sentences. - :type _trace: int - :ivar _trace: The level of tracing output that should be generated - when parsing a text. - """ - - def __init__(self, grammar, beam_size=0, trace=0): - """ - Create a new ``BottomUpProbabilisticChartParser``, that uses - ``grammar`` to parse texts. - - :type grammar: PCFG - :param grammar: The grammar used to parse texts. - :type beam_size: int - :param beam_size: The maximum length for the parser's edge queue. - :type trace: int - :param trace: The level of tracing that should be used when - parsing a text. ``0`` will generate no tracing output; - and higher numbers will produce more verbose tracing - output. - """ - if not isinstance(grammar, PCFG): - raise ValueError("The grammar must be probabilistic PCFG") - self._grammar = grammar - self.beam_size = beam_size - self._trace = trace - - def grammar(self): - return self._grammar - - def trace(self, trace=2): - """ - Set the level of tracing output that should be generated when - parsing a text. - - :type trace: int - :param trace: The trace level. A trace level of ``0`` will - generate no tracing output; and higher trace levels will - produce more verbose tracing output. - :rtype: None - """ - self._trace = trace - - # TODO: change this to conform more with the standard ChartParser - def parse(self, tokens): - self._grammar.check_coverage(tokens) - chart = Chart(list(tokens)) - grammar = self._grammar - - # Chart parser rules. - bu_init = ProbabilisticBottomUpInitRule() - bu = ProbabilisticBottomUpPredictRule() - fr = SingleEdgeProbabilisticFundamentalRule() - - # Our queue - queue = [] - - # Initialize the chart. 
- for edge in bu_init.apply(chart, grammar): - if self._trace > 1: - print( - " %-50s [%s]" - % (chart.pretty_format_edge(edge, width=2), edge.prob()) - ) - queue.append(edge) - - while len(queue) > 0: - # Re-sort the queue. - self.sort_queue(queue, chart) - - # Prune the queue to the correct size if a beam was defined - if self.beam_size: - self._prune(queue, chart) - - # Get the best edge. - edge = queue.pop() - if self._trace > 0: - print( - " %-50s [%s]" - % (chart.pretty_format_edge(edge, width=2), edge.prob()) - ) - - # Apply BU & FR to it. - queue.extend(bu.apply(chart, grammar, edge)) - queue.extend(fr.apply(chart, grammar, edge)) - - # Get a list of complete parses. - parses = list(chart.parses(grammar.start(), ProbabilisticTree)) - - # Assign probabilities to the trees. - prod_probs = {} - for prod in grammar.productions(): - prod_probs[prod.lhs(), prod.rhs()] = prod.prob() - for parse in parses: - self._setprob(parse, prod_probs) - - # Sort by probability - parses.sort(reverse=True, key=lambda tree: tree.prob()) - - return iter(parses) - - def _setprob(self, tree, prod_probs): - if tree.prob() is not None: - return - - # Get the prob of the CFG production. - lhs = Nonterminal(tree.label()) - rhs = [] - for child in tree: - if isinstance(child, Tree): - rhs.append(Nonterminal(child.label())) - else: - rhs.append(child) - prob = prod_probs[lhs, tuple(rhs)] - - # Get the probs of children. - for child in tree: - if isinstance(child, Tree): - self._setprob(child, prod_probs) - prob *= child.prob() - - tree.set_prob(prob) - - def sort_queue(self, queue, chart): - """ - Sort the given queue of ``Edge`` objects, placing the edge that should - be tried first at the beginning of the queue. This method - will be called after each ``Edge`` is added to the queue. - - :param queue: The queue of ``Edge`` objects to sort. Each edge in - this queue is an edge that could be added to the chart by - the fundamental rule; but that has not yet been added. - :type queue: list(Edge) - :param chart: The chart being used to parse the text. This - chart can be used to provide extra information for sorting - the queue. - :type chart: Chart - :rtype: None - """ - raise NotImplementedError() - - def _prune(self, queue, chart): - """Discard items in the queue if the queue is longer than the beam.""" - if len(queue) > self.beam_size: - split = len(queue) - self.beam_size - if self._trace > 2: - for edge in queue[:split]: - print(" %-50s [DISCARDED]" % chart.pretty_format_edge(edge, 2)) - del queue[:split] - - -class InsideChartParser(BottomUpProbabilisticChartParser): - """ - A bottom-up parser for ``PCFG`` grammars that tries edges in descending - order of the inside probabilities of their trees. The "inside - probability" of a tree is simply the - probability of the entire tree, ignoring its context. In - particular, the inside probability of a tree generated by - production *p* with children *c[1], c[2], ..., c[n]* is - *P(p)P(c[1])P(c[2])...P(c[n])*; and the inside - probability of a token is 1 if it is present in the text, and 0 if - it is absent. - - This sorting order results in a type of lowest-cost-first search - strategy. - """ - - # Inherit constructor. - def sort_queue(self, queue, chart): - """ - Sort the given queue of edges, in descending order of the - inside probabilities of the edges' trees. - - :param queue: The queue of ``Edge`` objects to sort. Each edge in - this queue is an edge that could be added to the chart by - the fundamental rule; but that has not yet been added. 
- :type queue: list(Edge) - :param chart: The chart being used to parse the text. This - chart can be used to provide extra information for sorting - the queue. - :type chart: Chart - :rtype: None - """ - queue.sort(key=lambda edge: edge.prob()) - - -# Eventually, this will become some sort of inside-outside parser: -# class InsideOutsideParser(BottomUpProbabilisticChartParser): -# def __init__(self, grammar, trace=0): -# # Inherit docs. -# BottomUpProbabilisticChartParser.__init__(self, grammar, trace) -# -# # Find the best path from S to each nonterminal -# bestp = {} -# for production in grammar.productions(): bestp[production.lhs()]=0 -# bestp[grammar.start()] = 1.0 -# -# for i in range(len(grammar.productions())): -# for production in grammar.productions(): -# lhs = production.lhs() -# for elt in production.rhs(): -# bestp[elt] = max(bestp[lhs]*production.prob(), -# bestp.get(elt,0)) -# -# self._bestp = bestp -# for (k,v) in self._bestp.items(): print(k,v) -# -# def _sortkey(self, edge): -# return edge.structure()[PROB] * self._bestp[edge.lhs()] -# -# def sort_queue(self, queue, chart): -# queue.sort(key=self._sortkey) - - -class RandomChartParser(BottomUpProbabilisticChartParser): - """ - A bottom-up parser for ``PCFG`` grammars that tries edges in random order. - This sorting order results in a random search strategy. - """ - - # Inherit constructor - def sort_queue(self, queue, chart): - i = random.randint(0, len(queue) - 1) - (queue[-1], queue[i]) = (queue[i], queue[-1]) - - -class UnsortedChartParser(BottomUpProbabilisticChartParser): - """ - A bottom-up parser for ``PCFG`` grammars that tries edges in whatever order. - """ - - # Inherit constructor - def sort_queue(self, queue, chart): - return - - -class LongestChartParser(BottomUpProbabilisticChartParser): - """ - A bottom-up parser for ``PCFG`` grammars that tries longer edges before - shorter ones. This sorting order results in a type of best-first - search strategy. - """ - - # Inherit constructor - def sort_queue(self, queue, chart): - queue.sort(key=lambda edge: edge.length()) - - -##////////////////////////////////////////////////////// -## Test Code -##////////////////////////////////////////////////////// - - -def demo(choice=None, draw_parses=None, print_parses=None): - """ - A demonstration of the probabilistic parsers. The user is - prompted to select which demo to run, and how many parses should - be found; and then each parser is run on the same demo, and a - summary of the results are displayed. - """ - import sys - import time - - from nltk import tokenize - from nltk.parse import pchart - - # Define two demos. Each demo has a sentence and a grammar. 
- toy_pcfg1 = PCFG.fromstring( - """ - S -> NP VP [1.0] - NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15] - Det -> 'the' [0.8] | 'my' [0.2] - N -> 'man' [0.5] | 'telescope' [0.5] - VP -> VP PP [0.1] | V NP [0.7] | V [0.2] - V -> 'ate' [0.35] | 'saw' [0.65] - PP -> P NP [1.0] - P -> 'with' [0.61] | 'under' [0.39] - """ - ) - - toy_pcfg2 = PCFG.fromstring( - """ - S -> NP VP [1.0] - VP -> V NP [.59] - VP -> V [.40] - VP -> VP PP [.01] - NP -> Det N [.41] - NP -> Name [.28] - NP -> NP PP [.31] - PP -> P NP [1.0] - V -> 'saw' [.21] - V -> 'ate' [.51] - V -> 'ran' [.28] - N -> 'boy' [.11] - N -> 'cookie' [.12] - N -> 'table' [.13] - N -> 'telescope' [.14] - N -> 'hill' [.5] - Name -> 'Jack' [.52] - Name -> 'Bob' [.48] - P -> 'with' [.61] - P -> 'under' [.39] - Det -> 'the' [.41] - Det -> 'a' [.31] - Det -> 'my' [.28] - """ - ) - - demos = [ - ("I saw John with my telescope", toy_pcfg1), - ("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2), - ] - - if choice is None: - # Ask the user which demo they want to use. - print() - for i in range(len(demos)): - print(f"{i + 1:>3}: {demos[i][0]}") - print(" %r" % demos[i][1]) - print() - print("Which demo (%d-%d)? " % (1, len(demos)), end=" ") - choice = int(sys.stdin.readline().strip()) - 1 - try: - sent, grammar = demos[choice] - except: - print("Bad sentence number") - return - - # Tokenize the sentence. - tokens = sent.split() - - # Define a list of parsers. We'll use all parsers. - parsers = [ - pchart.InsideChartParser(grammar), - pchart.RandomChartParser(grammar), - pchart.UnsortedChartParser(grammar), - pchart.LongestChartParser(grammar), - pchart.InsideChartParser(grammar, beam_size=len(tokens) + 1), # was BeamParser - ] - - # Run the parsers on the tokenized sentence. - times = [] - average_p = [] - num_parses = [] - all_parses = {} - for parser in parsers: - print(f"\ns: {sent}\nparser: {parser}\ngrammar: {grammar}") - parser.trace(3) - t = time.time() - parses = list(parser.parse(tokens)) - times.append(time.time() - t) - p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) if parses else 0 - average_p.append(p) - num_parses.append(len(parses)) - for p in parses: - all_parses[p.freeze()] = 1 - - # Print some summary statistics - print() - print(" Parser Beam | Time (secs) # Parses Average P(parse)") - print("------------------------+------------------------------------------") - for i in range(len(parsers)): - print( - "%18s %4d |%11.4f%11d%19.14f" - % ( - parsers[i].__class__.__name__, - parsers[i].beam_size, - times[i], - num_parses[i], - average_p[i], - ) - ) - parses = all_parses.keys() - if parses: - p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) - else: - p = 0 - print("------------------------+------------------------------------------") - print("%18s |%11s%11d%19.14f" % ("(All Parses)", "n/a", len(parses), p)) - - if draw_parses is None: - # Ask the user if we should draw the parses. - print() - print("Draw parses (y/n)? ", end=" ") - draw_parses = sys.stdin.readline().strip().lower().startswith("y") - if draw_parses: - from nltk.draw.tree import draw_trees - - print(" please wait...") - draw_trees(*parses) - - if print_parses is None: - # Ask the user if we should print the parses. - print() - print("Print parses (y/n)? 
", end=" ") - print_parses = sys.stdin.readline().strip().lower().startswith("y") - if print_parses: - for parse in parses: - print(parse) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/parse/projectivedependencyparser.py b/pipeline/nltk/parse/projectivedependencyparser.py deleted file mode 100644 index 9e4e3ba4d6d8e19820de6d527d5847e365e018d7..0000000000000000000000000000000000000000 --- a/pipeline/nltk/parse/projectivedependencyparser.py +++ /dev/null @@ -1,716 +0,0 @@ -# Natural Language Toolkit: Dependency Grammars -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Jason Narad -# -# URL: -# For license information, see LICENSE.TXT -# - -from collections import defaultdict -from functools import total_ordering -from itertools import chain - -from nltk.grammar import ( - DependencyGrammar, - DependencyProduction, - ProbabilisticDependencyGrammar, -) -from nltk.internals import raise_unorderable_types -from nltk.parse.dependencygraph import DependencyGraph - -################################################################# -# Dependency Span -################################################################# - - -@total_ordering -class DependencySpan: - """ - A contiguous span over some part of the input string representing - dependency (head -> modifier) relationships amongst words. An atomic - span corresponds to only one word so it isn't a 'span' in the conventional - sense, as its _start_index = _end_index = _head_index for concatenation - purposes. All other spans are assumed to have arcs between all nodes - within the start and end indexes of the span, and one head index corresponding - to the head word for the entire span. This is the same as the root node if - the dependency structure were depicted as a graph. - """ - - def __init__(self, start_index, end_index, head_index, arcs, tags): - self._start_index = start_index - self._end_index = end_index - self._head_index = head_index - self._arcs = arcs - self._tags = tags - self._comparison_key = (start_index, end_index, head_index, tuple(arcs)) - self._hash = hash(self._comparison_key) - - def head_index(self): - """ - :return: An value indexing the head of the entire ``DependencySpan``. - :rtype: int - """ - return self._head_index - - def __repr__(self): - """ - :return: A concise string representatino of the ``DependencySpan``. - :rtype: str. - """ - return "Span %d-%d; Head Index: %d" % ( - self._start_index, - self._end_index, - self._head_index, - ) - - def __str__(self): - """ - :return: A verbose string representation of the ``DependencySpan``. - :rtype: str - """ - str = "Span %d-%d; Head Index: %d" % ( - self._start_index, - self._end_index, - self._head_index, - ) - for i in range(len(self._arcs)): - str += "\n%d <- %d, %s" % (i, self._arcs[i], self._tags[i]) - return str - - def __eq__(self, other): - return ( - type(self) == type(other) and self._comparison_key == other._comparison_key - ) - - def __ne__(self, other): - return not self == other - - def __lt__(self, other): - if not isinstance(other, DependencySpan): - raise_unorderable_types("<", self, other) - return self._comparison_key < other._comparison_key - - def __hash__(self): - """ - :return: The hash value of this ``DependencySpan``. - """ - return self._hash - - -################################################################# -# Chart Cell -################################################################# - - -class ChartCell: - """ - A cell from the parse chart formed when performing the CYK algorithm. 
- Each cell keeps track of its x and y coordinates (though this will probably - be discarded), and a list of spans serving as the cell's entries. - """ - - def __init__(self, x, y): - """ - :param x: This cell's x coordinate. - :type x: int. - :param y: This cell's y coordinate. - :type y: int. - """ - self._x = x - self._y = y - self._entries = set() - - def add(self, span): - """ - Appends the given span to the list of spans - representing the chart cell's entries. - - :param span: The span to add. - :type span: DependencySpan - """ - self._entries.add(span) - - def __str__(self): - """ - :return: A verbose string representation of this ``ChartCell``. - :rtype: str. - """ - return "CC[%d,%d]: %s" % (self._x, self._y, self._entries) - - def __repr__(self): - """ - :return: A concise string representation of this ``ChartCell``. - :rtype: str. - """ - return "%s" % self - - -################################################################# -# Parsing with Dependency Grammars -################################################################# - - -class ProjectiveDependencyParser: - """ - A projective, rule-based, dependency parser. A ProjectiveDependencyParser - is created with a DependencyGrammar, a set of productions specifying - word-to-word dependency relations. The parse() method will then - return the set of all parses, in tree representation, for a given input - sequence of tokens. Each parse must meet the requirements of the both - the grammar and the projectivity constraint which specifies that the - branches of the dependency tree are not allowed to cross. Alternatively, - this can be understood as stating that each parent node and its children - in the parse tree form a continuous substring of the input sequence. - """ - - def __init__(self, dependency_grammar): - """ - Create a new ProjectiveDependencyParser, from a word-to-word - dependency grammar ``DependencyGrammar``. - - :param dependency_grammar: A word-to-word relation dependencygrammar. - :type dependency_grammar: DependencyGrammar - """ - self._grammar = dependency_grammar - - def parse(self, tokens): - """ - Performs a projective dependency parse on the list of tokens using - a chart-based, span-concatenation algorithm similar to Eisner (1996). - - :param tokens: The list of input tokens. - :type tokens: list(str) - :return: An iterator over parse trees. 
- :rtype: iter(Tree) - """ - self._tokens = list(tokens) - chart = [] - for i in range(0, len(self._tokens) + 1): - chart.append([]) - for j in range(0, len(self._tokens) + 1): - chart[i].append(ChartCell(i, j)) - if i == j + 1: - chart[i][j].add(DependencySpan(i - 1, i, i - 1, [-1], ["null"])) - - for i in range(1, len(self._tokens) + 1): - for j in range(i - 2, -1, -1): - for k in range(i - 1, j, -1): - for span1 in chart[k][j]._entries: - for span2 in chart[i][k]._entries: - for newspan in self.concatenate(span1, span2): - chart[i][j].add(newspan) - - for parse in chart[len(self._tokens)][0]._entries: - conll_format = "" - # malt_format = "" - for i in range(len(tokens)): - # malt_format += '%s\t%s\t%d\t%s\n' % (tokens[i], 'null', parse._arcs[i] + 1, 'null') - # conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], 'null', 'null', 'null', parse._arcs[i] + 1, 'null', '-', '-') - # Modify to comply with the new Dependency Graph requirement (at least must have an root elements) - conll_format += "\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n" % ( - i + 1, - tokens[i], - tokens[i], - "null", - "null", - "null", - parse._arcs[i] + 1, - "ROOT", - "-", - "-", - ) - dg = DependencyGraph(conll_format) - # if self.meets_arity(dg): - yield dg.tree() - - def concatenate(self, span1, span2): - """ - Concatenates the two spans in whichever way possible. This - includes rightward concatenation (from the leftmost word of the - leftmost span to the rightmost word of the rightmost span) and - leftward concatenation (vice-versa) between adjacent spans. Unlike - Eisner's presentation of span concatenation, these spans do not - share or pivot on a particular word/word-index. - - :return: A list of new spans formed through concatenation. - :rtype: list(DependencySpan) - """ - spans = [] - if span1._start_index == span2._start_index: - print("Error: Mismatched spans - replace this with thrown error") - if span1._start_index > span2._start_index: - temp_span = span1 - span1 = span2 - span2 = temp_span - # adjacent rightward covered concatenation - new_arcs = span1._arcs + span2._arcs - new_tags = span1._tags + span2._tags - if self._grammar.contains( - self._tokens[span1._head_index], self._tokens[span2._head_index] - ): - # print('Performing rightward cover %d to %d' % (span1._head_index, span2._head_index)) - new_arcs[span2._head_index - span1._start_index] = span1._head_index - spans.append( - DependencySpan( - span1._start_index, - span2._end_index, - span1._head_index, - new_arcs, - new_tags, - ) - ) - # adjacent leftward covered concatenation - new_arcs = span1._arcs + span2._arcs - if self._grammar.contains( - self._tokens[span2._head_index], self._tokens[span1._head_index] - ): - # print('performing leftward cover %d to %d' % (span2._head_index, span1._head_index)) - new_arcs[span1._head_index - span1._start_index] = span2._head_index - spans.append( - DependencySpan( - span1._start_index, - span2._end_index, - span2._head_index, - new_arcs, - new_tags, - ) - ) - return spans - - -################################################################# -# Parsing with Probabilistic Dependency Grammars -################################################################# - - -class ProbabilisticProjectiveDependencyParser: - """A probabilistic, projective dependency parser. - - This parser returns the most probable projective parse derived from the - probabilistic dependency grammar derived from the train() method. 
The - probabilistic model is an implementation of Eisner's (1996) Model C, which - conditions on head-word, head-tag, child-word, and child-tag. The decoding - uses a bottom-up chart-based span concatenation algorithm that's identical - to the one utilized by the rule-based projective parser. - - Usage example - - >>> from nltk.parse.dependencygraph import conll_data2 - - >>> graphs = [ - ... DependencyGraph(entry) for entry in conll_data2.split('\\n\\n') if entry - ... ] - - >>> ppdp = ProbabilisticProjectiveDependencyParser() - >>> ppdp.train(graphs) - - >>> sent = ['Cathy', 'zag', 'hen', 'wild', 'zwaaien', '.'] - >>> list(ppdp.parse(sent)) - [Tree('zag', ['Cathy', 'hen', Tree('zwaaien', ['wild', '.'])])] - - """ - - def __init__(self): - """ - Create a new probabilistic dependency parser. No additional - operations are necessary. - """ - - def parse(self, tokens): - """ - Parses the list of tokens subject to the projectivity constraint - and the productions in the parser's grammar. This uses a method - similar to the span-concatenation algorithm defined in Eisner (1996). - It returns the most probable parse derived from the parser's - probabilistic dependency grammar. - """ - self._tokens = list(tokens) - chart = [] - for i in range(0, len(self._tokens) + 1): - chart.append([]) - for j in range(0, len(self._tokens) + 1): - chart[i].append(ChartCell(i, j)) - if i == j + 1: - if tokens[i - 1] in self._grammar._tags: - for tag in self._grammar._tags[tokens[i - 1]]: - chart[i][j].add( - DependencySpan(i - 1, i, i - 1, [-1], [tag]) - ) - else: - print( - "No tag found for input token '%s', parse is impossible." - % tokens[i - 1] - ) - return [] - for i in range(1, len(self._tokens) + 1): - for j in range(i - 2, -1, -1): - for k in range(i - 1, j, -1): - for span1 in chart[k][j]._entries: - for span2 in chart[i][k]._entries: - for newspan in self.concatenate(span1, span2): - chart[i][j].add(newspan) - trees = [] - max_parse = None - max_score = 0 - for parse in chart[len(self._tokens)][0]._entries: - conll_format = "" - malt_format = "" - for i in range(len(tokens)): - malt_format += "%s\t%s\t%d\t%s\n" % ( - tokens[i], - "null", - parse._arcs[i] + 1, - "null", - ) - # conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], parse._tags[i], parse._tags[i], 'null', parse._arcs[i] + 1, 'null', '-', '-') - # Modify to comply with recent change in dependency graph such that there must be a ROOT element. - conll_format += "\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n" % ( - i + 1, - tokens[i], - tokens[i], - parse._tags[i], - parse._tags[i], - "null", - parse._arcs[i] + 1, - "ROOT", - "-", - "-", - ) - dg = DependencyGraph(conll_format) - score = self.compute_prob(dg) - trees.append((score, dg.tree())) - trees.sort() - return (tree for (score, tree) in trees) - - def concatenate(self, span1, span2): - """ - Concatenates the two spans in whichever way possible. This - includes rightward concatenation (from the leftmost word of the - leftmost span to the rightmost word of the rightmost span) and - leftward concatenation (vice-versa) between adjacent spans. Unlike - Eisner's presentation of span concatenation, these spans do not - share or pivot on a particular word/word-index. - - :return: A list of new spans formed through concatenation. 
- :rtype: list(DependencySpan) - """ - spans = [] - if span1._start_index == span2._start_index: - print("Error: Mismatched spans - replace this with thrown error") - if span1._start_index > span2._start_index: - temp_span = span1 - span1 = span2 - span2 = temp_span - # adjacent rightward covered concatenation - new_arcs = span1._arcs + span2._arcs - new_tags = span1._tags + span2._tags - if self._grammar.contains( - self._tokens[span1._head_index], self._tokens[span2._head_index] - ): - new_arcs[span2._head_index - span1._start_index] = span1._head_index - spans.append( - DependencySpan( - span1._start_index, - span2._end_index, - span1._head_index, - new_arcs, - new_tags, - ) - ) - # adjacent leftward covered concatenation - new_arcs = span1._arcs + span2._arcs - new_tags = span1._tags + span2._tags - if self._grammar.contains( - self._tokens[span2._head_index], self._tokens[span1._head_index] - ): - new_arcs[span1._head_index - span1._start_index] = span2._head_index - spans.append( - DependencySpan( - span1._start_index, - span2._end_index, - span2._head_index, - new_arcs, - new_tags, - ) - ) - return spans - - def train(self, graphs): - """ - Trains a ProbabilisticDependencyGrammar based on the list of input - DependencyGraphs. This model is an implementation of Eisner's (1996) - Model C, which derives its statistics from head-word, head-tag, - child-word, and child-tag relationships. - - :param graphs: A list of dependency graphs to train from. - :type: list(DependencyGraph) - """ - productions = [] - events = defaultdict(int) - tags = {} - for dg in graphs: - for node_index in range(1, len(dg.nodes)): - # children = dg.nodes[node_index]['deps'] - children = list( - chain.from_iterable(dg.nodes[node_index]["deps"].values()) - ) - - nr_left_children = dg.left_children(node_index) - nr_right_children = dg.right_children(node_index) - nr_children = nr_left_children + nr_right_children - for child_index in range( - 0 - (nr_left_children + 1), nr_right_children + 2 - ): - head_word = dg.nodes[node_index]["word"] - head_tag = dg.nodes[node_index]["tag"] - if head_word in tags: - tags[head_word].add(head_tag) - else: - tags[head_word] = {head_tag} - child = "STOP" - child_tag = "STOP" - prev_word = "START" - prev_tag = "START" - if child_index < 0: - array_index = child_index + nr_left_children - if array_index >= 0: - child = dg.nodes[children[array_index]]["word"] - child_tag = dg.nodes[children[array_index]]["tag"] - if child_index != -1: - prev_word = dg.nodes[children[array_index + 1]]["word"] - prev_tag = dg.nodes[children[array_index + 1]]["tag"] - if child != "STOP": - productions.append(DependencyProduction(head_word, [child])) - head_event = "(head ({} {}) (mods ({}, {}, {}) left))".format( - child, - child_tag, - prev_tag, - head_word, - head_tag, - ) - mod_event = "(mods ({}, {}, {}) left))".format( - prev_tag, - head_word, - head_tag, - ) - events[head_event] += 1 - events[mod_event] += 1 - elif child_index > 0: - array_index = child_index + nr_left_children - 1 - if array_index < nr_children: - child = dg.nodes[children[array_index]]["word"] - child_tag = dg.nodes[children[array_index]]["tag"] - if child_index != 1: - prev_word = dg.nodes[children[array_index - 1]]["word"] - prev_tag = dg.nodes[children[array_index - 1]]["tag"] - if child != "STOP": - productions.append(DependencyProduction(head_word, [child])) - head_event = "(head ({} {}) (mods ({}, {}, {}) right))".format( - child, - child_tag, - prev_tag, - head_word, - head_tag, - ) - mod_event = "(mods ({}, {}, {}) 
right))".format( - prev_tag, - head_word, - head_tag, - ) - events[head_event] += 1 - events[mod_event] += 1 - self._grammar = ProbabilisticDependencyGrammar(productions, events, tags) - - def compute_prob(self, dg): - """ - Computes the probability of a dependency graph based - on the parser's probability model (defined by the parser's - statistical dependency grammar). - - :param dg: A dependency graph to score. - :type dg: DependencyGraph - :return: The probability of the dependency graph. - :rtype: int - """ - prob = 1.0 - for node_index in range(1, len(dg.nodes)): - # children = dg.nodes[node_index]['deps'] - children = list(chain.from_iterable(dg.nodes[node_index]["deps"].values())) - - nr_left_children = dg.left_children(node_index) - nr_right_children = dg.right_children(node_index) - nr_children = nr_left_children + nr_right_children - for child_index in range(0 - (nr_left_children + 1), nr_right_children + 2): - head_word = dg.nodes[node_index]["word"] - head_tag = dg.nodes[node_index]["tag"] - child = "STOP" - child_tag = "STOP" - prev_word = "START" - prev_tag = "START" - if child_index < 0: - array_index = child_index + nr_left_children - if array_index >= 0: - child = dg.nodes[children[array_index]]["word"] - child_tag = dg.nodes[children[array_index]]["tag"] - if child_index != -1: - prev_word = dg.nodes[children[array_index + 1]]["word"] - prev_tag = dg.nodes[children[array_index + 1]]["tag"] - head_event = "(head ({} {}) (mods ({}, {}, {}) left))".format( - child, - child_tag, - prev_tag, - head_word, - head_tag, - ) - mod_event = "(mods ({}, {}, {}) left))".format( - prev_tag, - head_word, - head_tag, - ) - h_count = self._grammar._events[head_event] - m_count = self._grammar._events[mod_event] - - # If the grammar is not covered - if m_count != 0: - prob *= h_count / m_count - else: - prob = 0.00000001 # Very small number - - elif child_index > 0: - array_index = child_index + nr_left_children - 1 - if array_index < nr_children: - child = dg.nodes[children[array_index]]["word"] - child_tag = dg.nodes[children[array_index]]["tag"] - if child_index != 1: - prev_word = dg.nodes[children[array_index - 1]]["word"] - prev_tag = dg.nodes[children[array_index - 1]]["tag"] - head_event = "(head ({} {}) (mods ({}, {}, {}) right))".format( - child, - child_tag, - prev_tag, - head_word, - head_tag, - ) - mod_event = "(mods ({}, {}, {}) right))".format( - prev_tag, - head_word, - head_tag, - ) - h_count = self._grammar._events[head_event] - m_count = self._grammar._events[mod_event] - - if m_count != 0: - prob *= h_count / m_count - else: - prob = 0.00000001 # Very small number - - return prob - - -################################################################# -# Demos -################################################################# - - -def demo(): - projective_rule_parse_demo() - # arity_parse_demo() - projective_prob_parse_demo() - - -def projective_rule_parse_demo(): - """ - A demonstration showing the creation and use of a - ``DependencyGrammar`` to perform a projective dependency - parse. - """ - grammar = DependencyGrammar.fromstring( - """ - 'scratch' -> 'cats' | 'walls' - 'walls' -> 'the' - 'cats' -> 'the' - """ - ) - print(grammar) - pdp = ProjectiveDependencyParser(grammar) - trees = pdp.parse(["the", "cats", "scratch", "the", "walls"]) - for tree in trees: - print(tree) - - -def arity_parse_demo(): - """ - A demonstration showing the creation of a ``DependencyGrammar`` - in which a specific number of modifiers is listed for a given - head. 
This can further constrain the number of possible parses - created by a ``ProjectiveDependencyParser``. - """ - print() - print("A grammar with no arity constraints. Each DependencyProduction") - print("specifies a relationship between one head word and only one") - print("modifier word.") - grammar = DependencyGrammar.fromstring( - """ - 'fell' -> 'price' | 'stock' - 'price' -> 'of' | 'the' - 'of' -> 'stock' - 'stock' -> 'the' - """ - ) - print(grammar) - - print() - print("For the sentence 'The price of the stock fell', this grammar") - print("will produce the following three parses:") - pdp = ProjectiveDependencyParser(grammar) - trees = pdp.parse(["the", "price", "of", "the", "stock", "fell"]) - for tree in trees: - print(tree) - - print() - print("By contrast, the following grammar contains a ") - print("DependencyProduction that specifies a relationship") - print("between a single head word, 'price', and two modifier") - print("words, 'of' and 'the'.") - grammar = DependencyGrammar.fromstring( - """ - 'fell' -> 'price' | 'stock' - 'price' -> 'of' 'the' - 'of' -> 'stock' - 'stock' -> 'the' - """ - ) - print(grammar) - - print() - print( - "This constrains the number of possible parses to just one:" - ) # unimplemented, soon to replace - pdp = ProjectiveDependencyParser(grammar) - trees = pdp.parse(["the", "price", "of", "the", "stock", "fell"]) - for tree in trees: - print(tree) - - -def projective_prob_parse_demo(): - """ - A demo showing the training and use of a projective - dependency parser. - """ - from nltk.parse.dependencygraph import conll_data2 - - graphs = [DependencyGraph(entry) for entry in conll_data2.split("\n\n") if entry] - ppdp = ProbabilisticProjectiveDependencyParser() - print("Training Probabilistic Projective Dependency Parser...") - ppdp.train(graphs) - - sent = ["Cathy", "zag", "hen", "wild", "zwaaien", "."] - print("Parsing '", " ".join(sent), "'...") - print("Parse:") - for tree in ppdp.parse(sent): - print(tree) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/parse/recursivedescent.py b/pipeline/nltk/parse/recursivedescent.py deleted file mode 100644 index dc5d88c0884d8da7fdc52b044331ff0536bc19c4..0000000000000000000000000000000000000000 --- a/pipeline/nltk/parse/recursivedescent.py +++ /dev/null @@ -1,684 +0,0 @@ -# Natural Language Toolkit: Recursive Descent Parser -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird -# URL: -# For license information, see LICENSE.TXT - -from nltk.grammar import Nonterminal -from nltk.parse.api import ParserI -from nltk.tree import ImmutableTree, Tree - - -##////////////////////////////////////////////////////// -## Recursive Descent Parser -##////////////////////////////////////////////////////// -class RecursiveDescentParser(ParserI): - """ - A simple top-down CFG parser that parses texts by recursively - expanding the fringe of a Tree, and matching it against a - text. - - ``RecursiveDescentParser`` uses a list of tree locations called a - "frontier" to remember which subtrees have not yet been expanded - and which leaves have not yet been matched against the text. Each - tree location consists of a list of child indices specifying the - path from the root of the tree to a subtree or a leaf; see the - reference documentation for Tree for more information - about tree locations. - - When the parser begins parsing a text, it constructs a tree - containing only the start symbol, and a frontier containing the - location of the tree's root node. 
It then extends the tree to - cover the text, using the following recursive procedure: - - - If the frontier is empty, and the text is covered by the tree, - then return the tree as a possible parse. - - If the frontier is empty, and the text is not covered by the - tree, then return no parses. - - If the first element of the frontier is a subtree, then - use CFG productions to "expand" it. For each applicable - production, add the expanded subtree's children to the - frontier, and recursively find all parses that can be - generated by the new tree and frontier. - - If the first element of the frontier is a token, then "match" - it against the next token from the text. Remove the token - from the frontier, and recursively find all parses that can be - generated by the new tree and frontier. - - :see: ``nltk.grammar`` - """ - - def __init__(self, grammar, trace=0): - """ - Create a new ``RecursiveDescentParser``, that uses ``grammar`` - to parse texts. - - :type grammar: CFG - :param grammar: The grammar used to parse texts. - :type trace: int - :param trace: The level of tracing that should be used when - parsing a text. ``0`` will generate no tracing output; - and higher numbers will produce more verbose tracing - output. - """ - self._grammar = grammar - self._trace = trace - - def grammar(self): - return self._grammar - - def parse(self, tokens): - # Inherit docs from ParserI - - tokens = list(tokens) - self._grammar.check_coverage(tokens) - - # Start a recursive descent parse, with an initial tree - # containing just the start symbol. - start = self._grammar.start().symbol() - initial_tree = Tree(start, []) - frontier = [()] - if self._trace: - self._trace_start(initial_tree, frontier, tokens) - return self._parse(tokens, initial_tree, frontier) - - def _parse(self, remaining_text, tree, frontier): - """ - Recursively expand and match each elements of ``tree`` - specified by ``frontier``, to cover ``remaining_text``. Return - a list of all parses found. - - :return: An iterator of all parses that can be generated by - matching and expanding the elements of ``tree`` - specified by ``frontier``. - :rtype: iter(Tree) - :type tree: Tree - :param tree: A partial structure for the text that is - currently being parsed. The elements of ``tree`` - that are specified by ``frontier`` have not yet been - expanded or matched. - :type remaining_text: list(str) - :param remaining_text: The portion of the text that is not yet - covered by ``tree``. - :type frontier: list(tuple(int)) - :param frontier: A list of the locations within ``tree`` of - all subtrees that have not yet been expanded, and all - leaves that have not yet been matched. This list sorted - in left-to-right order of location within the tree. - """ - - # If the tree covers the text, and there's nothing left to - # expand, then we've found a complete parse; return it. - if len(remaining_text) == 0 and len(frontier) == 0: - if self._trace: - self._trace_succeed(tree, frontier) - yield tree - - # If there's still text, but nothing left to expand, we failed. - elif len(frontier) == 0: - if self._trace: - self._trace_backtrack(tree, frontier) - - # If the next element on the frontier is a tree, expand it. - elif isinstance(tree[frontier[0]], Tree): - yield from self._expand(remaining_text, tree, frontier) - - # If the next element on the frontier is a token, match it. 
- else: - yield from self._match(remaining_text, tree, frontier) - - def _match(self, rtext, tree, frontier): - """ - :rtype: iter(Tree) - :return: an iterator of all parses that can be generated by - matching the first element of ``frontier`` against the - first token in ``rtext``. In particular, if the first - element of ``frontier`` has the same type as the first - token in ``rtext``, then substitute the token into - ``tree``; and return all parses that can be generated by - matching and expanding the remaining elements of - ``frontier``. If the first element of ``frontier`` does not - have the same type as the first token in ``rtext``, then - return empty list. - - :type tree: Tree - :param tree: A partial structure for the text that is - currently being parsed. The elements of ``tree`` - that are specified by ``frontier`` have not yet been - expanded or matched. - :type rtext: list(str) - :param rtext: The portion of the text that is not yet - covered by ``tree``. - :type frontier: list of tuple of int - :param frontier: A list of the locations within ``tree`` of - all subtrees that have not yet been expanded, and all - leaves that have not yet been matched. - """ - - tree_leaf = tree[frontier[0]] - if len(rtext) > 0 and tree_leaf == rtext[0]: - # If it's a terminal that matches rtext[0], then substitute - # in the token, and continue parsing. - newtree = tree.copy(deep=True) - newtree[frontier[0]] = rtext[0] - if self._trace: - self._trace_match(newtree, frontier[1:], rtext[0]) - yield from self._parse(rtext[1:], newtree, frontier[1:]) - else: - # If it's a non-matching terminal, fail. - if self._trace: - self._trace_backtrack(tree, frontier, rtext[:1]) - - def _expand(self, remaining_text, tree, frontier, production=None): - """ - :rtype: iter(Tree) - :return: An iterator of all parses that can be generated by - expanding the first element of ``frontier`` with - ``production``. In particular, if the first element of - ``frontier`` is a subtree whose node type is equal to - ``production``'s left hand side, then add a child to that - subtree for each element of ``production``'s right hand - side; and return all parses that can be generated by - matching and expanding the remaining elements of - ``frontier``. If the first element of ``frontier`` is not a - subtree whose node type is equal to ``production``'s left - hand side, then return an empty list. If ``production`` is - not specified, then return a list of all parses that can - be generated by expanding the first element of ``frontier`` - with *any* CFG production. - - :type tree: Tree - :param tree: A partial structure for the text that is - currently being parsed. The elements of ``tree`` - that are specified by ``frontier`` have not yet been - expanded or matched. - :type remaining_text: list(str) - :param remaining_text: The portion of the text that is not yet - covered by ``tree``. - :type frontier: list(tuple(int)) - :param frontier: A list of the locations within ``tree`` of - all subtrees that have not yet been expanded, and all - leaves that have not yet been matched. 
- """ - - if production is None: - productions = self._grammar.productions() - else: - productions = [production] - - for production in productions: - lhs = production.lhs().symbol() - if lhs == tree[frontier[0]].label(): - subtree = self._production_to_tree(production) - if frontier[0] == (): - newtree = subtree - else: - newtree = tree.copy(deep=True) - newtree[frontier[0]] = subtree - new_frontier = [ - frontier[0] + (i,) for i in range(len(production.rhs())) - ] - if self._trace: - self._trace_expand(newtree, new_frontier, production) - yield from self._parse( - remaining_text, newtree, new_frontier + frontier[1:] - ) - - def _production_to_tree(self, production): - """ - :rtype: Tree - :return: The Tree that is licensed by ``production``. - In particular, given the production ``[lhs -> elt[1] ... elt[n]]`` - return a tree that has a node ``lhs.symbol``, and - ``n`` children. For each nonterminal element - ``elt[i]`` in the production, the tree token has a - childless subtree with node value ``elt[i].symbol``; and - for each terminal element ``elt[j]``, the tree token has - a leaf token with type ``elt[j]``. - - :param production: The CFG production that licenses the tree - token that should be returned. - :type production: Production - """ - children = [] - for elt in production.rhs(): - if isinstance(elt, Nonterminal): - children.append(Tree(elt.symbol(), [])) - else: - # This will be matched. - children.append(elt) - return Tree(production.lhs().symbol(), children) - - def trace(self, trace=2): - """ - Set the level of tracing output that should be generated when - parsing a text. - - :type trace: int - :param trace: The trace level. A trace level of ``0`` will - generate no tracing output; and higher trace levels will - produce more verbose tracing output. - :rtype: None - """ - self._trace = trace - - def _trace_fringe(self, tree, treeloc=None): - """ - Print trace output displaying the fringe of ``tree``. The - fringe of ``tree`` consists of all of its leaves and all of - its childless subtrees. - - :rtype: None - """ - - if treeloc == (): - print("*", end=" ") - if isinstance(tree, Tree): - if len(tree) == 0: - print(repr(Nonterminal(tree.label())), end=" ") - for i in range(len(tree)): - if treeloc is not None and i == treeloc[0]: - self._trace_fringe(tree[i], treeloc[1:]) - else: - self._trace_fringe(tree[i]) - else: - print(repr(tree), end=" ") - - def _trace_tree(self, tree, frontier, operation): - """ - Print trace output displaying the parser's current state. - - :param operation: A character identifying the operation that - generated the current state. 
- :rtype: None - """ - if self._trace == 2: - print(" %c [" % operation, end=" ") - else: - print(" [", end=" ") - if len(frontier) > 0: - self._trace_fringe(tree, frontier[0]) - else: - self._trace_fringe(tree) - print("]") - - def _trace_start(self, tree, frontier, text): - print("Parsing %r" % " ".join(text)) - if self._trace > 2: - print("Start:") - if self._trace > 1: - self._trace_tree(tree, frontier, " ") - - def _trace_expand(self, tree, frontier, production): - if self._trace > 2: - print("Expand: %s" % production) - if self._trace > 1: - self._trace_tree(tree, frontier, "E") - - def _trace_match(self, tree, frontier, tok): - if self._trace > 2: - print("Match: %r" % tok) - if self._trace > 1: - self._trace_tree(tree, frontier, "M") - - def _trace_succeed(self, tree, frontier): - if self._trace > 2: - print("GOOD PARSE:") - if self._trace == 1: - print("Found a parse:\n%s" % tree) - if self._trace > 1: - self._trace_tree(tree, frontier, "+") - - def _trace_backtrack(self, tree, frontier, toks=None): - if self._trace > 2: - if toks: - print("Backtrack: %r match failed" % toks[0]) - else: - print("Backtrack") - - -##////////////////////////////////////////////////////// -## Stepping Recursive Descent Parser -##////////////////////////////////////////////////////// -class SteppingRecursiveDescentParser(RecursiveDescentParser): - """ - A ``RecursiveDescentParser`` that allows you to step through the - parsing process, performing a single operation at a time. - - The ``initialize`` method is used to start parsing a text. - ``expand`` expands the first element on the frontier using a single - CFG production, and ``match`` matches the first element on the - frontier against the next text token. ``backtrack`` undoes the most - recent expand or match operation. ``step`` performs a single - expand, match, or backtrack operation. ``parses`` returns the set - of parses that have been found by the parser. - - :ivar _history: A list of ``(rtext, tree, frontier)`` tripples, - containing the previous states of the parser. This history is - used to implement the ``backtrack`` operation. - :ivar _tried_e: A record of all productions that have been tried - for a given tree. This record is used by ``expand`` to perform - the next untried production. - :ivar _tried_m: A record of what tokens have been matched for a - given tree. This record is used by ``step`` to decide whether - or not to match a token. - :see: ``nltk.grammar`` - """ - - def __init__(self, grammar, trace=0): - super().__init__(grammar, trace) - self._rtext = None - self._tree = None - self._frontier = [()] - self._tried_e = {} - self._tried_m = {} - self._history = [] - self._parses = [] - - # [XX] TEMPORARY HACK WARNING! This should be replaced with - # something nicer when we get the chance. - def _freeze(self, tree): - c = tree.copy() - # for pos in c.treepositions('leaves'): - # c[pos] = c[pos].freeze() - return ImmutableTree.convert(c) - - def parse(self, tokens): - tokens = list(tokens) - self.initialize(tokens) - while self.step() is not None: - pass - return self.parses() - - def initialize(self, tokens): - """ - Start parsing a given text. This sets the parser's tree to - the start symbol, its frontier to the root node, and its - remaining text to ``token['SUBTOKENS']``. 
- """ - - self._rtext = tokens - start = self._grammar.start().symbol() - self._tree = Tree(start, []) - self._frontier = [()] - self._tried_e = {} - self._tried_m = {} - self._history = [] - self._parses = [] - if self._trace: - self._trace_start(self._tree, self._frontier, self._rtext) - - def remaining_text(self): - """ - :return: The portion of the text that is not yet covered by the - tree. - :rtype: list(str) - """ - return self._rtext - - def frontier(self): - """ - :return: A list of the tree locations of all subtrees that - have not yet been expanded, and all leaves that have not - yet been matched. - :rtype: list(tuple(int)) - """ - return self._frontier - - def tree(self): - """ - :return: A partial structure for the text that is - currently being parsed. The elements specified by the - frontier have not yet been expanded or matched. - :rtype: Tree - """ - return self._tree - - def step(self): - """ - Perform a single parsing operation. If an untried match is - possible, then perform the match, and return the matched - token. If an untried expansion is possible, then perform the - expansion, and return the production that it is based on. If - backtracking is possible, then backtrack, and return True. - Otherwise, return None. - - :return: None if no operation was performed; a token if a match - was performed; a production if an expansion was performed; - and True if a backtrack operation was performed. - :rtype: Production or String or bool - """ - # Try matching (if we haven't already) - if self.untried_match(): - token = self.match() - if token is not None: - return token - - # Try expanding. - production = self.expand() - if production is not None: - return production - - # Try backtracking - if self.backtrack(): - self._trace_backtrack(self._tree, self._frontier) - return True - - # Nothing left to do. - return None - - def expand(self, production=None): - """ - Expand the first element of the frontier. In particular, if - the first element of the frontier is a subtree whose node type - is equal to ``production``'s left hand side, then add a child - to that subtree for each element of ``production``'s right hand - side. If ``production`` is not specified, then use the first - untried expandable production. If all expandable productions - have been tried, do nothing. - - :return: The production used to expand the frontier, if an - expansion was performed. If no expansion was performed, - return None. - :rtype: Production or None - """ - - # Make sure we *can* expand. - if len(self._frontier) == 0: - return None - if not isinstance(self._tree[self._frontier[0]], Tree): - return None - - # If they didn't specify a production, check all untried ones. - if production is None: - productions = self.untried_expandable_productions() - else: - productions = [production] - - parses = [] - for prod in productions: - # Record that we've tried this production now. - self._tried_e.setdefault(self._freeze(self._tree), []).append(prod) - - # Try expanding. - for _result in self._expand(self._rtext, self._tree, self._frontier, prod): - return prod - - # We didn't expand anything. - return None - - def match(self): - """ - Match the first element of the frontier. In particular, if - the first element of the frontier has the same type as the - next text token, then substitute the text token into the tree. - - :return: The token matched, if a match operation was - performed. If no match was performed, return None - :rtype: str or None - """ - - # Record that we've tried matching this token. 
- tok = self._rtext[0] - self._tried_m.setdefault(self._freeze(self._tree), []).append(tok) - - # Make sure we *can* match. - if len(self._frontier) == 0: - return None - if isinstance(self._tree[self._frontier[0]], Tree): - return None - - for _result in self._match(self._rtext, self._tree, self._frontier): - # Return the token we just matched. - return self._history[-1][0][0] - return None - - def backtrack(self): - """ - Return the parser to its state before the most recent - match or expand operation. Calling ``undo`` repeatedly return - the parser to successively earlier states. If no match or - expand operations have been performed, ``undo`` will make no - changes. - - :return: true if an operation was successfully undone. - :rtype: bool - """ - if len(self._history) == 0: - return False - (self._rtext, self._tree, self._frontier) = self._history.pop() - return True - - def expandable_productions(self): - """ - :return: A list of all the productions for which expansions - are available for the current parser state. - :rtype: list(Production) - """ - # Make sure we *can* expand. - if len(self._frontier) == 0: - return [] - frontier_child = self._tree[self._frontier[0]] - if len(self._frontier) == 0 or not isinstance(frontier_child, Tree): - return [] - - return [ - p - for p in self._grammar.productions() - if p.lhs().symbol() == frontier_child.label() - ] - - def untried_expandable_productions(self): - """ - :return: A list of all the untried productions for which - expansions are available for the current parser state. - :rtype: list(Production) - """ - - tried_expansions = self._tried_e.get(self._freeze(self._tree), []) - return [p for p in self.expandable_productions() if p not in tried_expansions] - - def untried_match(self): - """ - :return: Whether the first element of the frontier is a token - that has not yet been matched. - :rtype: bool - """ - - if len(self._rtext) == 0: - return False - tried_matches = self._tried_m.get(self._freeze(self._tree), []) - return self._rtext[0] not in tried_matches - - def currently_complete(self): - """ - :return: Whether the parser's current state represents a - complete parse. - :rtype: bool - """ - return len(self._frontier) == 0 and len(self._rtext) == 0 - - def _parse(self, remaining_text, tree, frontier): - """ - A stub version of ``_parse`` that sets the parsers current - state to the given arguments. In ``RecursiveDescentParser``, - the ``_parse`` method is used to recursively continue parsing a - text. ``SteppingRecursiveDescentParser`` overrides it to - capture these recursive calls. It records the parser's old - state in the history (to allow for backtracking), and updates - the parser's new state using the given arguments. Finally, it - returns ``[1]``, which is used by ``match`` and ``expand`` to - detect whether their operations were successful. - - :return: ``[1]`` - :rtype: list of int - """ - self._history.append((self._rtext, self._tree, self._frontier)) - self._rtext = remaining_text - self._tree = tree - self._frontier = frontier - - # Is it a good parse? If so, record it. - if len(frontier) == 0 and len(remaining_text) == 0: - self._parses.append(tree) - self._trace_succeed(self._tree, self._frontier) - - return [1] - - def parses(self): - """ - :return: An iterator of the parses that have been found by this - parser so far. - :rtype: list of Tree - """ - return iter(self._parses) - - def set_grammar(self, grammar): - """ - Change the grammar used to parse texts. - - :param grammar: The new grammar. 
- :type grammar: CFG - """ - self._grammar = grammar - - -##////////////////////////////////////////////////////// -## Demonstration Code -##////////////////////////////////////////////////////// - - -def demo(): - """ - A demonstration of the recursive descent parser. - """ - - from nltk import CFG, parse - - grammar = CFG.fromstring( - """ - S -> NP VP - NP -> Det N | Det N PP - VP -> V NP | V NP PP - PP -> P NP - NP -> 'I' - N -> 'man' | 'park' | 'telescope' | 'dog' - Det -> 'the' | 'a' - P -> 'in' | 'with' - V -> 'saw' - """ - ) - - for prod in grammar.productions(): - print(prod) - - sent = "I saw a man in the park".split() - parser = parse.RecursiveDescentParser(grammar, trace=2) - for p in parser.parse(sent): - print(p) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/parse/shiftreduce.py b/pipeline/nltk/parse/shiftreduce.py deleted file mode 100644 index bf18342573a14f18ca3918580e22d81f82c896cd..0000000000000000000000000000000000000000 --- a/pipeline/nltk/parse/shiftreduce.py +++ /dev/null @@ -1,479 +0,0 @@ -# Natural Language Toolkit: Shift-Reduce Parser -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird -# URL: -# For license information, see LICENSE.TXT - -from nltk.grammar import Nonterminal -from nltk.parse.api import ParserI -from nltk.tree import Tree - - -##////////////////////////////////////////////////////// -## Shift/Reduce Parser -##////////////////////////////////////////////////////// -class ShiftReduceParser(ParserI): - """ - A simple bottom-up CFG parser that uses two operations, "shift" - and "reduce", to find a single parse for a text. - - ``ShiftReduceParser`` maintains a stack, which records the - structure of a portion of the text. This stack is a list of - strings and Trees that collectively cover a portion of - the text. For example, while parsing the sentence "the dog saw - the man" with a typical grammar, ``ShiftReduceParser`` will produce - the following stack, which covers "the dog saw":: - - [(NP: (Det: 'the') (N: 'dog')), (V: 'saw')] - - ``ShiftReduceParser`` attempts to extend the stack to cover the - entire text, and to combine the stack elements into a single tree, - producing a complete parse for the sentence. - - Initially, the stack is empty. It is extended to cover the text, - from left to right, by repeatedly applying two operations: - - - "shift" moves a token from the beginning of the text to the - end of the stack. - - "reduce" uses a CFG production to combine the rightmost stack - elements into a single Tree. - - Often, more than one operation can be performed on a given stack. - In this case, ``ShiftReduceParser`` uses the following heuristics - to decide which operation to perform: - - - Only shift if no reductions are available. - - If multiple reductions are available, then apply the reduction - whose CFG production is listed earliest in the grammar. - - Note that these heuristics are not guaranteed to choose an - operation that leads to a parse of the text. Also, if multiple - parses exists, ``ShiftReduceParser`` will return at most one of - them. - - :see: ``nltk.grammar`` - """ - - def __init__(self, grammar, trace=0): - """ - Create a new ``ShiftReduceParser``, that uses ``grammar`` to - parse texts. - - :type grammar: Grammar - :param grammar: The grammar used to parse texts. - :type trace: int - :param trace: The level of tracing that should be used when - parsing a text. ``0`` will generate no tracing output; - and higher numbers will produce more verbose tracing - output. 
- """ - self._grammar = grammar - self._trace = trace - self._check_grammar() - - def grammar(self): - return self._grammar - - def parse(self, tokens): - tokens = list(tokens) - self._grammar.check_coverage(tokens) - - # initialize the stack. - stack = [] - remaining_text = tokens - - # Trace output. - if self._trace: - print("Parsing %r" % " ".join(tokens)) - self._trace_stack(stack, remaining_text) - - # iterate through the text, pushing the token onto - # the stack, then reducing the stack. - while len(remaining_text) > 0: - self._shift(stack, remaining_text) - while self._reduce(stack, remaining_text): - pass - - # Did we reduce everything? - if len(stack) == 1: - # Did we end up with the right category? - if stack[0].label() == self._grammar.start().symbol(): - yield stack[0] - - def _shift(self, stack, remaining_text): - """ - Move a token from the beginning of ``remaining_text`` to the - end of ``stack``. - - :type stack: list(str and Tree) - :param stack: A list of strings and Trees, encoding - the structure of the text that has been parsed so far. - :type remaining_text: list(str) - :param remaining_text: The portion of the text that is not yet - covered by ``stack``. - :rtype: None - """ - stack.append(remaining_text[0]) - remaining_text.remove(remaining_text[0]) - if self._trace: - self._trace_shift(stack, remaining_text) - - def _match_rhs(self, rhs, rightmost_stack): - """ - :rtype: bool - :return: true if the right hand side of a CFG production - matches the rightmost elements of the stack. ``rhs`` - matches ``rightmost_stack`` if they are the same length, - and each element of ``rhs`` matches the corresponding - element of ``rightmost_stack``. A nonterminal element of - ``rhs`` matches any Tree whose node value is equal - to the nonterminal's symbol. A terminal element of ``rhs`` - matches any string whose type is equal to the terminal. - :type rhs: list(terminal and Nonterminal) - :param rhs: The right hand side of a CFG production. - :type rightmost_stack: list(string and Tree) - :param rightmost_stack: The rightmost elements of the parser's - stack. - """ - - if len(rightmost_stack) != len(rhs): - return False - for i in range(len(rightmost_stack)): - if isinstance(rightmost_stack[i], Tree): - if not isinstance(rhs[i], Nonterminal): - return False - if rightmost_stack[i].label() != rhs[i].symbol(): - return False - else: - if isinstance(rhs[i], Nonterminal): - return False - if rightmost_stack[i] != rhs[i]: - return False - return True - - def _reduce(self, stack, remaining_text, production=None): - """ - Find a CFG production whose right hand side matches the - rightmost stack elements; and combine those stack elements - into a single Tree, with the node specified by the - production's left-hand side. If more than one CFG production - matches the stack, then use the production that is listed - earliest in the grammar. The new Tree replaces the - elements in the stack. - - :rtype: Production or None - :return: If a reduction is performed, then return the CFG - production that the reduction is based on; otherwise, - return false. - :type stack: list(string and Tree) - :param stack: A list of strings and Trees, encoding - the structure of the text that has been parsed so far. - :type remaining_text: list(str) - :param remaining_text: The portion of the text that is not yet - covered by ``stack``. - """ - if production is None: - productions = self._grammar.productions() - else: - productions = [production] - - # Try each production, in order. 
- for production in productions: - rhslen = len(production.rhs()) - - # check if the RHS of a production matches the top of the stack - if self._match_rhs(production.rhs(), stack[-rhslen:]): - - # combine the tree to reflect the reduction - tree = Tree(production.lhs().symbol(), stack[-rhslen:]) - stack[-rhslen:] = [tree] - - # We reduced something - if self._trace: - self._trace_reduce(stack, production, remaining_text) - return production - - # We didn't reduce anything - return None - - def trace(self, trace=2): - """ - Set the level of tracing output that should be generated when - parsing a text. - - :type trace: int - :param trace: The trace level. A trace level of ``0`` will - generate no tracing output; and higher trace levels will - produce more verbose tracing output. - :rtype: None - """ - # 1: just show shifts. - # 2: show shifts & reduces - # 3: display which tokens & productions are shifed/reduced - self._trace = trace - - def _trace_stack(self, stack, remaining_text, marker=" "): - """ - Print trace output displaying the given stack and text. - - :rtype: None - :param marker: A character that is printed to the left of the - stack. This is used with trace level 2 to print 'S' - before shifted stacks and 'R' before reduced stacks. - """ - s = " " + marker + " [ " - for elt in stack: - if isinstance(elt, Tree): - s += repr(Nonterminal(elt.label())) + " " - else: - s += repr(elt) + " " - s += "* " + " ".join(remaining_text) + "]" - print(s) - - def _trace_shift(self, stack, remaining_text): - """ - Print trace output displaying that a token has been shifted. - - :rtype: None - """ - if self._trace > 2: - print("Shift %r:" % stack[-1]) - if self._trace == 2: - self._trace_stack(stack, remaining_text, "S") - elif self._trace > 0: - self._trace_stack(stack, remaining_text) - - def _trace_reduce(self, stack, production, remaining_text): - """ - Print trace output displaying that ``production`` was used to - reduce ``stack``. - - :rtype: None - """ - if self._trace > 2: - rhs = " ".join(production.rhs()) - print(f"Reduce {production.lhs()!r} <- {rhs}") - if self._trace == 2: - self._trace_stack(stack, remaining_text, "R") - elif self._trace > 1: - self._trace_stack(stack, remaining_text) - - def _check_grammar(self): - """ - Check to make sure that all of the CFG productions are - potentially useful. If any productions can never be used, - then print a warning. - - :rtype: None - """ - productions = self._grammar.productions() - - # Any production whose RHS is an extension of another production's RHS - # will never be used. - for i in range(len(productions)): - for j in range(i + 1, len(productions)): - rhs1 = productions[i].rhs() - rhs2 = productions[j].rhs() - if rhs1[: len(rhs2)] == rhs2: - print("Warning: %r will never be used" % productions[i]) - - -##////////////////////////////////////////////////////// -## Stepping Shift/Reduce Parser -##////////////////////////////////////////////////////// -class SteppingShiftReduceParser(ShiftReduceParser): - """ - A ``ShiftReduceParser`` that allows you to setp through the parsing - process, performing a single operation at a time. It also allows - you to change the parser's grammar midway through parsing a text. - - The ``initialize`` method is used to start parsing a text. - ``shift`` performs a single shift operation, and ``reduce`` performs - a single reduce operation. ``step`` will perform a single reduce - operation if possible; otherwise, it will perform a single shift - operation. 
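# Illustrative sketch: driving the stepping parser described above one operation at a
# time. Assumes nltk.parse.shiftreduce provides the same SteppingShiftReduceParser API
# as this vendored copy; the toy grammar is made up for the example.
from nltk import CFG
from nltk.parse.shiftreduce import SteppingShiftReduceParser

grammar = CFG.fromstring(
    """
    S -> NP VP
    NP -> Det N
    VP -> V NP
    Det -> 'the'
    N -> 'dog' | 'man'
    V -> 'saw'
    """
)
stepper = SteppingShiftReduceParser(grammar)
stepper.initialize("the dog saw the man".split())
while stepper.step():  # reduce when possible, otherwise shift; False once neither applies
    print(stepper.stack(), stepper.remaining_text())
for tree in stepper.parses():
    print(tree)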
``parses`` returns the set of parses that have been - found by the parser. - - :ivar _history: A list of ``(stack, remaining_text)`` pairs, - containing all of the previous states of the parser. This - history is used to implement the ``undo`` operation. - :see: ``nltk.grammar`` - """ - - def __init__(self, grammar, trace=0): - super().__init__(grammar, trace) - self._stack = None - self._remaining_text = None - self._history = [] - - def parse(self, tokens): - tokens = list(tokens) - self.initialize(tokens) - while self.step(): - pass - return self.parses() - - def stack(self): - """ - :return: The parser's stack. - :rtype: list(str and Tree) - """ - return self._stack - - def remaining_text(self): - """ - :return: The portion of the text that is not yet covered by the - stack. - :rtype: list(str) - """ - return self._remaining_text - - def initialize(self, tokens): - """ - Start parsing a given text. This sets the parser's stack to - ``[]`` and sets its remaining text to ``tokens``. - """ - self._stack = [] - self._remaining_text = tokens - self._history = [] - - def step(self): - """ - Perform a single parsing operation. If a reduction is - possible, then perform that reduction, and return the - production that it is based on. Otherwise, if a shift is - possible, then perform it, and return True. Otherwise, - return False. - - :return: False if no operation was performed; True if a shift was - performed; and the CFG production used to reduce if a - reduction was performed. - :rtype: Production or bool - """ - return self.reduce() or self.shift() - - def shift(self): - """ - Move a token from the beginning of the remaining text to the - end of the stack. If there are no more tokens in the - remaining text, then do nothing. - - :return: True if the shift operation was successful. - :rtype: bool - """ - if len(self._remaining_text) == 0: - return False - self._history.append((self._stack[:], self._remaining_text[:])) - self._shift(self._stack, self._remaining_text) - return True - - def reduce(self, production=None): - """ - Use ``production`` to combine the rightmost stack elements into - a single Tree. If ``production`` does not match the - rightmost stack elements, then do nothing. - - :return: The production used to reduce the stack, if a - reduction was performed. If no reduction was performed, - return None. - - :rtype: Production or None - """ - self._history.append((self._stack[:], self._remaining_text[:])) - return_val = self._reduce(self._stack, self._remaining_text, production) - - if not return_val: - self._history.pop() - return return_val - - def undo(self): - """ - Return the parser to its state before the most recent - shift or reduce operation. Calling ``undo`` repeatedly return - the parser to successively earlier states. If no shift or - reduce operations have been performed, ``undo`` will make no - changes. - - :return: true if an operation was successfully undone. - :rtype: bool - """ - if len(self._history) == 0: - return False - (self._stack, self._remaining_text) = self._history.pop() - return True - - def reducible_productions(self): - """ - :return: A list of the productions for which reductions are - available for the current parser state. 
- :rtype: list(Production) - """ - productions = [] - for production in self._grammar.productions(): - rhslen = len(production.rhs()) - if self._match_rhs(production.rhs(), self._stack[-rhslen:]): - productions.append(production) - return productions - - def parses(self): - """ - :return: An iterator of the parses that have been found by this - parser so far. - :rtype: iter(Tree) - """ - if ( - len(self._remaining_text) == 0 - and len(self._stack) == 1 - and self._stack[0].label() == self._grammar.start().symbol() - ): - yield self._stack[0] - - # copied from nltk.parser - - def set_grammar(self, grammar): - """ - Change the grammar used to parse texts. - - :param grammar: The new grammar. - :type grammar: CFG - """ - self._grammar = grammar - - -##////////////////////////////////////////////////////// -## Demonstration Code -##////////////////////////////////////////////////////// - - -def demo(): - """ - A demonstration of the shift-reduce parser. - """ - - from nltk import CFG, parse - - grammar = CFG.fromstring( - """ - S -> NP VP - NP -> Det N | Det N PP - VP -> V NP | V NP PP - PP -> P NP - NP -> 'I' - N -> 'man' | 'park' | 'telescope' | 'dog' - Det -> 'the' | 'a' - P -> 'in' | 'with' - V -> 'saw' - """ - ) - - sent = "I saw a man in the park".split() - - parser = parse.ShiftReduceParser(grammar, trace=2) - for p in parser.parse(sent): - print(p) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/parse/stanford.py b/pipeline/nltk/parse/stanford.py deleted file mode 100644 index c5ed0f9eab042dfdb7d91679ac5502f495a328ac..0000000000000000000000000000000000000000 --- a/pipeline/nltk/parse/stanford.py +++ /dev/null @@ -1,470 +0,0 @@ -# Natural Language Toolkit: Interface to the Stanford Parser -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Xu -# -# URL: -# For license information, see LICENSE.TXT - -import os -import tempfile -import warnings -from subprocess import PIPE - -from nltk.internals import ( - _java_options, - config_java, - find_jar_iter, - find_jars_within_path, - java, -) -from nltk.parse.api import ParserI -from nltk.parse.dependencygraph import DependencyGraph -from nltk.tree import Tree - -_stanford_url = "https://nlp.stanford.edu/software/lex-parser.shtml" - - -class GenericStanfordParser(ParserI): - """Interface to the Stanford Parser""" - - _MODEL_JAR_PATTERN = r"stanford-parser-(\d+)(\.(\d+))+-models\.jar" - _JAR = r"stanford-parser\.jar" - _MAIN_CLASS = "edu.stanford.nlp.parser.lexparser.LexicalizedParser" - - _USE_STDIN = False - _DOUBLE_SPACED_OUTPUT = False - - def __init__( - self, - path_to_jar=None, - path_to_models_jar=None, - model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz", - encoding="utf8", - verbose=False, - java_options="-mx4g", - corenlp_options="", - ): - - # find the most recent code and model jar - stanford_jar = max( - find_jar_iter( - self._JAR, - path_to_jar, - env_vars=("STANFORD_PARSER", "STANFORD_CORENLP"), - searchpath=(), - url=_stanford_url, - verbose=verbose, - is_regex=True, - ), - key=lambda model_path: os.path.dirname(model_path), - ) - - model_jar = max( - find_jar_iter( - self._MODEL_JAR_PATTERN, - path_to_models_jar, - env_vars=("STANFORD_MODELS", "STANFORD_CORENLP"), - searchpath=(), - url=_stanford_url, - verbose=verbose, - is_regex=True, - ), - key=lambda model_path: os.path.dirname(model_path), - ) - - # self._classpath = (stanford_jar, model_jar) - - # Adding logging jar files to classpath - stanford_dir = os.path.split(stanford_jar)[0] - self._classpath = tuple([model_jar] + 
find_jars_within_path(stanford_dir)) - - self.model_path = model_path - self._encoding = encoding - self.corenlp_options = corenlp_options - self.java_options = java_options - - def _parse_trees_output(self, output_): - res = [] - cur_lines = [] - cur_trees = [] - blank = False - for line in output_.splitlines(False): - if line == "": - if blank: - res.append(iter(cur_trees)) - cur_trees = [] - blank = False - elif self._DOUBLE_SPACED_OUTPUT: - cur_trees.append(self._make_tree("\n".join(cur_lines))) - cur_lines = [] - blank = True - else: - res.append(iter([self._make_tree("\n".join(cur_lines))])) - cur_lines = [] - else: - cur_lines.append(line) - blank = False - return iter(res) - - def parse_sents(self, sentences, verbose=False): - """ - Use StanfordParser to parse multiple sentences. Takes multiple sentences as a - list where each sentence is a list of words. - Each sentence will be automatically tagged with this StanfordParser instance's - tagger. - If whitespaces exists inside a token, then the token will be treated as - separate tokens. - - :param sentences: Input sentences to parse - :type sentences: list(list(str)) - :rtype: iter(iter(Tree)) - """ - cmd = [ - self._MAIN_CLASS, - "-model", - self.model_path, - "-sentences", - "newline", - "-outputFormat", - self._OUTPUT_FORMAT, - "-tokenized", - "-escaper", - "edu.stanford.nlp.process.PTBEscapingProcessor", - ] - return self._parse_trees_output( - self._execute( - cmd, "\n".join(" ".join(sentence) for sentence in sentences), verbose - ) - ) - - def raw_parse(self, sentence, verbose=False): - """ - Use StanfordParser to parse a sentence. Takes a sentence as a string; - before parsing, it will be automatically tokenized and tagged by - the Stanford Parser. - - :param sentence: Input sentence to parse - :type sentence: str - :rtype: iter(Tree) - """ - return next(self.raw_parse_sents([sentence], verbose)) - - def raw_parse_sents(self, sentences, verbose=False): - """ - Use StanfordParser to parse multiple sentences. Takes multiple sentences as a - list of strings. - Each sentence will be automatically tokenized and tagged by the Stanford Parser. - - :param sentences: Input sentences to parse - :type sentences: list(str) - :rtype: iter(iter(Tree)) - """ - cmd = [ - self._MAIN_CLASS, - "-model", - self.model_path, - "-sentences", - "newline", - "-outputFormat", - self._OUTPUT_FORMAT, - ] - return self._parse_trees_output( - self._execute(cmd, "\n".join(sentences), verbose) - ) - - def tagged_parse(self, sentence, verbose=False): - """ - Use StanfordParser to parse a sentence. Takes a sentence as a list of - (word, tag) tuples; the sentence must have already been tokenized and - tagged. - - :param sentence: Input sentence to parse - :type sentence: list(tuple(str, str)) - :rtype: iter(Tree) - """ - return next(self.tagged_parse_sents([sentence], verbose)) - - def tagged_parse_sents(self, sentences, verbose=False): - """ - Use StanfordParser to parse multiple sentences. Takes multiple sentences - where each sentence is a list of (word, tag) tuples. - The sentences must have already been tokenized and tagged. 
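# Illustrative sketch of the replacement these wrappers point to (see the
# DeprecationWarnings below): assumes a Stanford CoreNLP server is already running;
# the localhost URL and port are placeholders, not part of this module.
from nltk.parse.corenlp import CoreNLPParser

corenlp = CoreNLPParser(url="http://localhost:9000")  # requires a running CoreNLP server
for tree in corenlp.parse("the quick brown fox jumps over the lazy dog".split()):
    tree.pretty_print()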
- - :param sentences: Input sentences to parse - :type sentences: list(list(tuple(str, str))) - :rtype: iter(iter(Tree)) - """ - tag_separator = "/" - cmd = [ - self._MAIN_CLASS, - "-model", - self.model_path, - "-sentences", - "newline", - "-outputFormat", - self._OUTPUT_FORMAT, - "-tokenized", - "-tagSeparator", - tag_separator, - "-tokenizerFactory", - "edu.stanford.nlp.process.WhitespaceTokenizer", - "-tokenizerMethod", - "newCoreLabelTokenizerFactory", - ] - # We don't need to escape slashes as "splitting is done on the last instance of the character in the token" - return self._parse_trees_output( - self._execute( - cmd, - "\n".join( - " ".join(tag_separator.join(tagged) for tagged in sentence) - for sentence in sentences - ), - verbose, - ) - ) - - def _execute(self, cmd, input_, verbose=False): - encoding = self._encoding - cmd.extend(["-encoding", encoding]) - if self.corenlp_options: - cmd.extend(self.corenlp_options.split()) - - default_options = " ".join(_java_options) - - # Configure java. - config_java(options=self.java_options, verbose=verbose) - - # Windows is incompatible with NamedTemporaryFile() without passing in delete=False. - with tempfile.NamedTemporaryFile(mode="wb", delete=False) as input_file: - # Write the actual sentences to the temporary input file - if isinstance(input_, str) and encoding: - input_ = input_.encode(encoding) - input_file.write(input_) - input_file.flush() - - # Run the tagger and get the output. - if self._USE_STDIN: - input_file.seek(0) - stdout, stderr = java( - cmd, - classpath=self._classpath, - stdin=input_file, - stdout=PIPE, - stderr=PIPE, - ) - else: - cmd.append(input_file.name) - stdout, stderr = java( - cmd, classpath=self._classpath, stdout=PIPE, stderr=PIPE - ) - - stdout = stdout.replace(b"\xc2\xa0", b" ") - stdout = stdout.replace(b"\x00\xa0", b" ") - stdout = stdout.decode(encoding) - - os.unlink(input_file.name) - - # Return java configurations to their default values. - config_java(options=default_options, verbose=False) - - return stdout - - -class StanfordParser(GenericStanfordParser): - """ - >>> parser=StanfordParser( - ... model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz" - ... ) # doctest: +SKIP - - >>> list(parser.raw_parse("the quick brown fox jumps over the lazy dog")) # doctest: +NORMALIZE_WHITESPACE +SKIP - [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']), - Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), - Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])])] - - >>> sum([list(dep_graphs) for dep_graphs in parser.raw_parse_sents(( - ... "the quick brown fox jumps over the lazy dog", - ... "the quick grey wolf jumps over the lazy fox" - ... 
))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP - [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']), - Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), - Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])]), Tree('ROOT', [Tree('NP', - [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['grey']), Tree('NN', ['wolf'])]), Tree('NP', - [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']), - Tree('JJ', ['lazy']), Tree('NN', ['fox'])])])])])])] - - >>> sum([list(dep_graphs) for dep_graphs in parser.parse_sents(( - ... "I 'm a dog".split(), - ... "This is my friends ' cat ( the tabby )".split(), - ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP - [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('PRP', ['I'])]), Tree('VP', [Tree('VBP', ["'m"]), - Tree('NP', [Tree('DT', ['a']), Tree('NN', ['dog'])])])])]), Tree('ROOT', [Tree('S', [Tree('NP', - [Tree('DT', ['This'])]), Tree('VP', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('NP', [Tree('PRP$', ['my']), - Tree('NNS', ['friends']), Tree('POS', ["'"])]), Tree('NN', ['cat'])]), Tree('PRN', [Tree('-LRB-', [Tree('', []), - Tree('NP', [Tree('DT', ['the']), Tree('NN', ['tabby'])]), Tree('-RRB-', [])])])])])])])] - - >>> sum([list(dep_graphs) for dep_graphs in parser.tagged_parse_sents(( - ... ( - ... ("The", "DT"), - ... ("quick", "JJ"), - ... ("brown", "JJ"), - ... ("fox", "NN"), - ... ("jumped", "VBD"), - ... ("over", "IN"), - ... ("the", "DT"), - ... ("lazy", "JJ"), - ... ("dog", "NN"), - ... (".", "."), - ... ), - ... ))],[]) # doctest: +NORMALIZE_WHITESPACE +SKIP - [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('DT', ['The']), Tree('JJ', ['quick']), Tree('JJ', ['brown']), - Tree('NN', ['fox'])]), Tree('VP', [Tree('VBD', ['jumped']), Tree('PP', [Tree('IN', ['over']), Tree('NP', - [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])]), Tree('.', ['.'])])])] - """ - - _OUTPUT_FORMAT = "penn" - - def __init__(self, *args, **kwargs): - warnings.warn( - "The StanfordParser will be deprecated\n" - "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead.", - DeprecationWarning, - stacklevel=2, - ) - - super().__init__(*args, **kwargs) - - def _make_tree(self, result): - return Tree.fromstring(result) - - -class StanfordDependencyParser(GenericStanfordParser): - - """ - >>> dep_parser=StanfordDependencyParser( - ... model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz" - ... ) # doctest: +SKIP - - >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP - [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])])] - - >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP - [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')), - ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')), - ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')), - ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]] - - >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents(( - ... "The quick brown fox jumps over the lazy dog.", - ... 
"The quick grey wolf jumps over the lazy fox." - ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP - [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])]), - Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']), Tree('fox', ['over', 'the', 'lazy'])])] - - >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents(( - ... "I 'm a dog".split(), - ... "This is my friends ' cat ( the tabby )".split(), - ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP - [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends', ['my', "'"]), Tree('tabby', ['the'])])] - - >>> sum([[list(parse.triples()) for parse in dep_graphs] for dep_graphs in dep_parser.tagged_parse_sents(( - ... ( - ... ("The", "DT"), - ... ("quick", "JJ"), - ... ("brown", "JJ"), - ... ("fox", "NN"), - ... ("jumped", "VBD"), - ... ("over", "IN"), - ... ("the", "DT"), - ... ("lazy", "JJ"), - ... ("dog", "NN"), - ... (".", "."), - ... ), - ... ))],[]) # doctest: +NORMALIZE_WHITESPACE +SKIP - [[((u'jumped', u'VBD'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')), - ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')), - ((u'jumped', u'VBD'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')), - ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]] - - """ - - _OUTPUT_FORMAT = "conll2007" - - def __init__(self, *args, **kwargs): - warnings.warn( - "The StanfordDependencyParser will be deprecated\n" - "Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead.", - DeprecationWarning, - stacklevel=2, - ) - - super().__init__(*args, **kwargs) - - def _make_tree(self, result): - return DependencyGraph(result, top_relation_label="root") - - -class StanfordNeuralDependencyParser(GenericStanfordParser): - """ - >>> from nltk.parse.stanford import StanfordNeuralDependencyParser # doctest: +SKIP - >>> dep_parser=StanfordNeuralDependencyParser(java_options='-mx4g')# doctest: +SKIP - - >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP - [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy']), '.'])] - - >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE +SKIP - [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', - (u'The', u'DT')), ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), - u'amod', (u'brown', u'JJ')), ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), - ((u'dog', u'NN'), u'case', (u'over', u'IN')), ((u'dog', u'NN'), u'det', - (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ')), ((u'jumps', u'VBZ'), - u'punct', (u'.', u'.'))]] - - >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents(( - ... "The quick brown fox jumps over the lazy dog.", - ... "The quick grey wolf jumps over the lazy fox." - ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP - [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', - 'the', 'lazy']), '.']), Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']), - Tree('fox', ['over', 'the', 'lazy']), '.'])] - - >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents(( - ... "I 'm a dog".split(), - ... 
"This is my friends ' cat ( the tabby )".split(), - ... ))], []) # doctest: +NORMALIZE_WHITESPACE +SKIP - [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends', - ['my', "'"]), Tree('tabby', ['-LRB-', 'the', '-RRB-'])])] - """ - - _OUTPUT_FORMAT = "conll" - _MAIN_CLASS = "edu.stanford.nlp.pipeline.StanfordCoreNLP" - _JAR = r"stanford-corenlp-(\d+)(\.(\d+))+\.jar" - _MODEL_JAR_PATTERN = r"stanford-corenlp-(\d+)(\.(\d+))+-models\.jar" - _USE_STDIN = True - _DOUBLE_SPACED_OUTPUT = True - - def __init__(self, *args, **kwargs): - warnings.warn( - "The StanfordNeuralDependencyParser will be deprecated\n" - "Please use \033[91mnltk.parse.corenlp.CoreNLPDependencyParser\033[0m instead.", - DeprecationWarning, - stacklevel=2, - ) - - super().__init__(*args, **kwargs) - self.corenlp_options += "-annotators tokenize,ssplit,pos,depparse" - - def tagged_parse_sents(self, sentences, verbose=False): - """ - Currently unimplemented because the neural dependency parser (and - the StanfordCoreNLP pipeline class) doesn't support passing in pre- - tagged tokens. - """ - raise NotImplementedError( - "tagged_parse[_sents] is not supported by " - "StanfordNeuralDependencyParser; use " - "parse[_sents] or raw_parse[_sents] instead." - ) - - def _make_tree(self, result): - return DependencyGraph(result, top_relation_label="ROOT") diff --git a/pipeline/nltk/parse/transitionparser.py b/pipeline/nltk/parse/transitionparser.py deleted file mode 100644 index 476d70260a09c92196ea1cce749fc6774e75d822..0000000000000000000000000000000000000000 --- a/pipeline/nltk/parse/transitionparser.py +++ /dev/null @@ -1,794 +0,0 @@ -# Natural Language Toolkit: Arc-Standard and Arc-eager Transition Based Parsers -# -# Author: Long Duong -# -# Copyright (C) 2001-2023 NLTK Project -# URL: -# For license information, see LICENSE.TXT - -import pickle -import tempfile -from copy import deepcopy -from operator import itemgetter -from os import remove - -try: - from numpy import array - from scipy import sparse - from sklearn import svm - from sklearn.datasets import load_svmlight_file -except ImportError: - pass - -from nltk.parse import DependencyEvaluator, DependencyGraph, ParserI - - -class Configuration: - """ - Class for holding configuration which is the partial analysis of the input sentence. - The transition based parser aims at finding set of operators that transfer the initial - configuration to the terminal configuration. - - The configuration includes: - - Stack: for storing partially proceeded words - - Buffer: for storing remaining input words - - Set of arcs: for storing partially built dependency tree - - This class also provides a method to represent a configuration as list of features. - """ - - def __init__(self, dep_graph): - """ - :param dep_graph: the representation of an input in the form of dependency graph. - :type dep_graph: DependencyGraph where the dependencies are not specified. 
- """ - # dep_graph.nodes contain list of token for a sentence - self.stack = [0] # The root element - self.buffer = list(range(1, len(dep_graph.nodes))) # The rest is in the buffer - self.arcs = [] # empty set of arc - self._tokens = dep_graph.nodes - self._max_address = len(self.buffer) - - def __str__(self): - return ( - "Stack : " - + str(self.stack) - + " Buffer : " - + str(self.buffer) - + " Arcs : " - + str(self.arcs) - ) - - def _check_informative(self, feat, flag=False): - """ - Check whether a feature is informative - The flag control whether "_" is informative or not - """ - if feat is None: - return False - if feat == "": - return False - if flag is False: - if feat == "_": - return False - return True - - def extract_features(self): - """ - Extract the set of features for the current configuration. Implement standard features as describe in - Table 3.2 (page 31) in Dependency Parsing book by Sandra Kubler, Ryan McDonal, Joakim Nivre. - Please note that these features are very basic. - :return: list(str) - """ - result = [] - # Todo : can come up with more complicated features set for better - # performance. - if len(self.stack) > 0: - # Stack 0 - stack_idx0 = self.stack[len(self.stack) - 1] - token = self._tokens[stack_idx0] - if self._check_informative(token["word"], True): - result.append("STK_0_FORM_" + token["word"]) - if "lemma" in token and self._check_informative(token["lemma"]): - result.append("STK_0_LEMMA_" + token["lemma"]) - if self._check_informative(token["tag"]): - result.append("STK_0_POS_" + token["tag"]) - if "feats" in token and self._check_informative(token["feats"]): - feats = token["feats"].split("|") - for feat in feats: - result.append("STK_0_FEATS_" + feat) - # Stack 1 - if len(self.stack) > 1: - stack_idx1 = self.stack[len(self.stack) - 2] - token = self._tokens[stack_idx1] - if self._check_informative(token["tag"]): - result.append("STK_1_POS_" + token["tag"]) - - # Left most, right most dependency of stack[0] - left_most = 1000000 - right_most = -1 - dep_left_most = "" - dep_right_most = "" - for (wi, r, wj) in self.arcs: - if wi == stack_idx0: - if (wj > wi) and (wj > right_most): - right_most = wj - dep_right_most = r - if (wj < wi) and (wj < left_most): - left_most = wj - dep_left_most = r - if self._check_informative(dep_left_most): - result.append("STK_0_LDEP_" + dep_left_most) - if self._check_informative(dep_right_most): - result.append("STK_0_RDEP_" + dep_right_most) - - # Check Buffered 0 - if len(self.buffer) > 0: - # Buffer 0 - buffer_idx0 = self.buffer[0] - token = self._tokens[buffer_idx0] - if self._check_informative(token["word"], True): - result.append("BUF_0_FORM_" + token["word"]) - if "lemma" in token and self._check_informative(token["lemma"]): - result.append("BUF_0_LEMMA_" + token["lemma"]) - if self._check_informative(token["tag"]): - result.append("BUF_0_POS_" + token["tag"]) - if "feats" in token and self._check_informative(token["feats"]): - feats = token["feats"].split("|") - for feat in feats: - result.append("BUF_0_FEATS_" + feat) - # Buffer 1 - if len(self.buffer) > 1: - buffer_idx1 = self.buffer[1] - token = self._tokens[buffer_idx1] - if self._check_informative(token["word"], True): - result.append("BUF_1_FORM_" + token["word"]) - if self._check_informative(token["tag"]): - result.append("BUF_1_POS_" + token["tag"]) - if len(self.buffer) > 2: - buffer_idx2 = self.buffer[2] - token = self._tokens[buffer_idx2] - if self._check_informative(token["tag"]): - result.append("BUF_2_POS_" + token["tag"]) - if len(self.buffer) 
> 3: - buffer_idx3 = self.buffer[3] - token = self._tokens[buffer_idx3] - if self._check_informative(token["tag"]): - result.append("BUF_3_POS_" + token["tag"]) - # Left most, right most dependency of stack[0] - left_most = 1000000 - right_most = -1 - dep_left_most = "" - dep_right_most = "" - for (wi, r, wj) in self.arcs: - if wi == buffer_idx0: - if (wj > wi) and (wj > right_most): - right_most = wj - dep_right_most = r - if (wj < wi) and (wj < left_most): - left_most = wj - dep_left_most = r - if self._check_informative(dep_left_most): - result.append("BUF_0_LDEP_" + dep_left_most) - if self._check_informative(dep_right_most): - result.append("BUF_0_RDEP_" + dep_right_most) - - return result - - -class Transition: - """ - This class defines a set of transition which is applied to a configuration to get another configuration - Note that for different parsing algorithm, the transition is different. - """ - - # Define set of transitions - LEFT_ARC = "LEFTARC" - RIGHT_ARC = "RIGHTARC" - SHIFT = "SHIFT" - REDUCE = "REDUCE" - - def __init__(self, alg_option): - """ - :param alg_option: the algorithm option of this parser. Currently support `arc-standard` and `arc-eager` algorithm - :type alg_option: str - """ - self._algo = alg_option - if alg_option not in [ - TransitionParser.ARC_STANDARD, - TransitionParser.ARC_EAGER, - ]: - raise ValueError( - " Currently we only support %s and %s " - % (TransitionParser.ARC_STANDARD, TransitionParser.ARC_EAGER) - ) - - def left_arc(self, conf, relation): - """ - Note that the algorithm for left-arc is quite similar except for precondition for both arc-standard and arc-eager - - :param configuration: is the current configuration - :return: A new configuration or -1 if the pre-condition is not satisfied - """ - if (len(conf.buffer) <= 0) or (len(conf.stack) <= 0): - return -1 - if conf.buffer[0] == 0: - # here is the Root element - return -1 - - idx_wi = conf.stack[len(conf.stack) - 1] - - flag = True - if self._algo == TransitionParser.ARC_EAGER: - for (idx_parent, r, idx_child) in conf.arcs: - if idx_child == idx_wi: - flag = False - - if flag: - conf.stack.pop() - idx_wj = conf.buffer[0] - conf.arcs.append((idx_wj, relation, idx_wi)) - else: - return -1 - - def right_arc(self, conf, relation): - """ - Note that the algorithm for right-arc is DIFFERENT for arc-standard and arc-eager - - :param configuration: is the current configuration - :return: A new configuration or -1 if the pre-condition is not satisfied - """ - if (len(conf.buffer) <= 0) or (len(conf.stack) <= 0): - return -1 - if self._algo == TransitionParser.ARC_STANDARD: - idx_wi = conf.stack.pop() - idx_wj = conf.buffer[0] - conf.buffer[0] = idx_wi - conf.arcs.append((idx_wi, relation, idx_wj)) - else: # arc-eager - idx_wi = conf.stack[len(conf.stack) - 1] - idx_wj = conf.buffer.pop(0) - conf.stack.append(idx_wj) - conf.arcs.append((idx_wi, relation, idx_wj)) - - def reduce(self, conf): - """ - Note that the algorithm for reduce is only available for arc-eager - - :param configuration: is the current configuration - :return: A new configuration or -1 if the pre-condition is not satisfied - """ - - if self._algo != TransitionParser.ARC_EAGER: - return -1 - if len(conf.stack) <= 0: - return -1 - - idx_wi = conf.stack[len(conf.stack) - 1] - flag = False - for (idx_parent, r, idx_child) in conf.arcs: - if idx_child == idx_wi: - flag = True - if flag: - conf.stack.pop() # reduce it - else: - return -1 - - def shift(self, conf): - """ - Note that the algorithm for shift is the SAME for arc-standard 
and arc-eager - - :param configuration: is the current configuration - :return: A new configuration or -1 if the pre-condition is not satisfied - """ - if len(conf.buffer) <= 0: - return -1 - idx_wi = conf.buffer.pop(0) - conf.stack.append(idx_wi) - - -class TransitionParser(ParserI): - - """ - Class for transition based parser. Implement 2 algorithms which are "arc-standard" and "arc-eager" - """ - - ARC_STANDARD = "arc-standard" - ARC_EAGER = "arc-eager" - - def __init__(self, algorithm): - """ - :param algorithm: the algorithm option of this parser. Currently support `arc-standard` and `arc-eager` algorithm - :type algorithm: str - """ - if not (algorithm in [self.ARC_STANDARD, self.ARC_EAGER]): - raise ValueError( - " Currently we only support %s and %s " - % (self.ARC_STANDARD, self.ARC_EAGER) - ) - self._algorithm = algorithm - - self._dictionary = {} - self._transition = {} - self._match_transition = {} - - def _get_dep_relation(self, idx_parent, idx_child, depgraph): - p_node = depgraph.nodes[idx_parent] - c_node = depgraph.nodes[idx_child] - - if c_node["word"] is None: - return None # Root word - - if c_node["head"] == p_node["address"]: - return c_node["rel"] - else: - return None - - def _convert_to_binary_features(self, features): - """ - :param features: list of feature string which is needed to convert to binary features - :type features: list(str) - :return : string of binary features in libsvm format which is 'featureID:value' pairs - """ - unsorted_result = [] - for feature in features: - self._dictionary.setdefault(feature, len(self._dictionary)) - unsorted_result.append(self._dictionary[feature]) - - # Default value of each feature is 1.0 - return " ".join( - str(featureID) + ":1.0" for featureID in sorted(unsorted_result) - ) - - def _is_projective(self, depgraph): - arc_list = [] - for key in depgraph.nodes: - node = depgraph.nodes[key] - - if "head" in node: - childIdx = node["address"] - parentIdx = node["head"] - if parentIdx is not None: - arc_list.append((parentIdx, childIdx)) - - for (parentIdx, childIdx) in arc_list: - # Ensure that childIdx < parentIdx - if childIdx > parentIdx: - temp = childIdx - childIdx = parentIdx - parentIdx = temp - for k in range(childIdx + 1, parentIdx): - for m in range(len(depgraph.nodes)): - if (m < childIdx) or (m > parentIdx): - if (k, m) in arc_list: - return False - if (m, k) in arc_list: - return False - return True - - def _write_to_file(self, key, binary_features, input_file): - """ - write the binary features to input file and update the transition dictionary - """ - self._transition.setdefault(key, len(self._transition) + 1) - self._match_transition[self._transition[key]] = key - - input_str = str(self._transition[key]) + " " + binary_features + "\n" - input_file.write(input_str.encode("utf-8")) - - def _create_training_examples_arc_std(self, depgraphs, input_file): - """ - Create the training example in the libsvm format and write it to the input_file. - Reference : Page 32, Chapter 3. 
Dependency Parsing by Sandra Kubler, Ryan McDonal and Joakim Nivre (2009) - """ - operation = Transition(self.ARC_STANDARD) - count_proj = 0 - training_seq = [] - - for depgraph in depgraphs: - if not self._is_projective(depgraph): - continue - - count_proj += 1 - conf = Configuration(depgraph) - while len(conf.buffer) > 0: - b0 = conf.buffer[0] - features = conf.extract_features() - binary_features = self._convert_to_binary_features(features) - - if len(conf.stack) > 0: - s0 = conf.stack[len(conf.stack) - 1] - # Left-arc operation - rel = self._get_dep_relation(b0, s0, depgraph) - if rel is not None: - key = Transition.LEFT_ARC + ":" + rel - self._write_to_file(key, binary_features, input_file) - operation.left_arc(conf, rel) - training_seq.append(key) - continue - - # Right-arc operation - rel = self._get_dep_relation(s0, b0, depgraph) - if rel is not None: - precondition = True - # Get the max-index of buffer - maxID = conf._max_address - - for w in range(maxID + 1): - if w != b0: - relw = self._get_dep_relation(b0, w, depgraph) - if relw is not None: - if (b0, relw, w) not in conf.arcs: - precondition = False - - if precondition: - key = Transition.RIGHT_ARC + ":" + rel - self._write_to_file(key, binary_features, input_file) - operation.right_arc(conf, rel) - training_seq.append(key) - continue - - # Shift operation as the default - key = Transition.SHIFT - self._write_to_file(key, binary_features, input_file) - operation.shift(conf) - training_seq.append(key) - - print(" Number of training examples : " + str(len(depgraphs))) - print(" Number of valid (projective) examples : " + str(count_proj)) - return training_seq - - def _create_training_examples_arc_eager(self, depgraphs, input_file): - """ - Create the training example in the libsvm format and write it to the input_file. 
- Reference : 'A Dynamic Oracle for Arc-Eager Dependency Parsing' by Joav Goldberg and Joakim Nivre - """ - operation = Transition(self.ARC_EAGER) - countProj = 0 - training_seq = [] - - for depgraph in depgraphs: - if not self._is_projective(depgraph): - continue - - countProj += 1 - conf = Configuration(depgraph) - while len(conf.buffer) > 0: - b0 = conf.buffer[0] - features = conf.extract_features() - binary_features = self._convert_to_binary_features(features) - - if len(conf.stack) > 0: - s0 = conf.stack[len(conf.stack) - 1] - # Left-arc operation - rel = self._get_dep_relation(b0, s0, depgraph) - if rel is not None: - key = Transition.LEFT_ARC + ":" + rel - self._write_to_file(key, binary_features, input_file) - operation.left_arc(conf, rel) - training_seq.append(key) - continue - - # Right-arc operation - rel = self._get_dep_relation(s0, b0, depgraph) - if rel is not None: - key = Transition.RIGHT_ARC + ":" + rel - self._write_to_file(key, binary_features, input_file) - operation.right_arc(conf, rel) - training_seq.append(key) - continue - - # reduce operation - flag = False - for k in range(s0): - if self._get_dep_relation(k, b0, depgraph) is not None: - flag = True - if self._get_dep_relation(b0, k, depgraph) is not None: - flag = True - if flag: - key = Transition.REDUCE - self._write_to_file(key, binary_features, input_file) - operation.reduce(conf) - training_seq.append(key) - continue - - # Shift operation as the default - key = Transition.SHIFT - self._write_to_file(key, binary_features, input_file) - operation.shift(conf) - training_seq.append(key) - - print(" Number of training examples : " + str(len(depgraphs))) - print(" Number of valid (projective) examples : " + str(countProj)) - return training_seq - - def train(self, depgraphs, modelfile, verbose=True): - """ - :param depgraphs : list of DependencyGraph as the training data - :type depgraphs : DependencyGraph - :param modelfile : file name to save the trained model - :type modelfile : str - """ - - try: - input_file = tempfile.NamedTemporaryFile( - prefix="transition_parse.train", dir=tempfile.gettempdir(), delete=False - ) - - if self._algorithm == self.ARC_STANDARD: - self._create_training_examples_arc_std(depgraphs, input_file) - else: - self._create_training_examples_arc_eager(depgraphs, input_file) - - input_file.close() - # Using the temporary file to train the libsvm classifier - x_train, y_train = load_svmlight_file(input_file.name) - # The parameter is set according to the paper: - # Algorithms for Deterministic Incremental Dependency Parsing by Joakim Nivre - # Todo : because of probability = True => very slow due to - # cross-validation. 
Need to improve the speed here - model = svm.SVC( - kernel="poly", - degree=2, - coef0=0, - gamma=0.2, - C=0.5, - verbose=verbose, - probability=True, - ) - - model.fit(x_train, y_train) - # Save the model to file name (as pickle) - pickle.dump(model, open(modelfile, "wb")) - finally: - remove(input_file.name) - - def parse(self, depgraphs, modelFile): - """ - :param depgraphs: the list of test sentence, each sentence is represented as a dependency graph where the 'head' information is dummy - :type depgraphs: list(DependencyGraph) - :param modelfile: the model file - :type modelfile: str - :return: list (DependencyGraph) with the 'head' and 'rel' information - """ - result = [] - # First load the model - model = pickle.load(open(modelFile, "rb")) - operation = Transition(self._algorithm) - - for depgraph in depgraphs: - conf = Configuration(depgraph) - while len(conf.buffer) > 0: - features = conf.extract_features() - col = [] - row = [] - data = [] - for feature in features: - if feature in self._dictionary: - col.append(self._dictionary[feature]) - row.append(0) - data.append(1.0) - np_col = array(sorted(col)) # NB : index must be sorted - np_row = array(row) - np_data = array(data) - - x_test = sparse.csr_matrix( - (np_data, (np_row, np_col)), shape=(1, len(self._dictionary)) - ) - - # It's best to use decision function as follow BUT it's not supported yet for sparse SVM - # Using decision function to build the votes array - # dec_func = model.decision_function(x_test)[0] - # votes = {} - # k = 0 - # for i in range(len(model.classes_)): - # for j in range(i+1, len(model.classes_)): - # #if dec_func[k] > 0: - # votes.setdefault(i,0) - # votes[i] +=1 - # else: - # votes.setdefault(j,0) - # votes[j] +=1 - # k +=1 - # Sort votes according to the values - # sorted_votes = sorted(votes.items(), key=itemgetter(1), reverse=True) - - # We will use predict_proba instead of decision_function - prob_dict = {} - pred_prob = model.predict_proba(x_test)[0] - for i in range(len(pred_prob)): - prob_dict[i] = pred_prob[i] - sorted_Prob = sorted(prob_dict.items(), key=itemgetter(1), reverse=True) - - # Note that SHIFT is always a valid operation - for (y_pred_idx, confidence) in sorted_Prob: - # y_pred = model.predict(x_test)[0] - # From the prediction match to the operation - y_pred = model.classes_[y_pred_idx] - - if y_pred in self._match_transition: - strTransition = self._match_transition[y_pred] - baseTransition = strTransition.split(":")[0] - - if baseTransition == Transition.LEFT_ARC: - if ( - operation.left_arc(conf, strTransition.split(":")[1]) - != -1 - ): - break - elif baseTransition == Transition.RIGHT_ARC: - if ( - operation.right_arc(conf, strTransition.split(":")[1]) - != -1 - ): - break - elif baseTransition == Transition.REDUCE: - if operation.reduce(conf) != -1: - break - elif baseTransition == Transition.SHIFT: - if operation.shift(conf) != -1: - break - else: - raise ValueError( - "The predicted transition is not recognized, expected errors" - ) - - # Finish with operations build the dependency graph from Conf.arcs - - new_depgraph = deepcopy(depgraph) - for key in new_depgraph.nodes: - node = new_depgraph.nodes[key] - node["rel"] = "" - # With the default, all the token depend on the Root - node["head"] = 0 - for (head, rel, child) in conf.arcs: - c_node = new_depgraph.nodes[child] - c_node["head"] = head - c_node["rel"] = rel - result.append(new_depgraph) - - return result - - -def demo(): - """ - >>> from nltk.parse import DependencyGraph, DependencyEvaluator - >>> from 
nltk.parse.transitionparser import TransitionParser, Configuration, Transition - >>> gold_sent = DependencyGraph(\""" - ... Economic JJ 2 ATT - ... news NN 3 SBJ - ... has VBD 0 ROOT - ... little JJ 5 ATT - ... effect NN 3 OBJ - ... on IN 5 ATT - ... financial JJ 8 ATT - ... markets NNS 6 PC - ... . . 3 PU - ... \""") - - >>> conf = Configuration(gold_sent) - - ###################### Check the Initial Feature ######################## - - >>> print(', '.join(conf.extract_features())) - STK_0_POS_TOP, BUF_0_FORM_Economic, BUF_0_LEMMA_Economic, BUF_0_POS_JJ, BUF_1_FORM_news, BUF_1_POS_NN, BUF_2_POS_VBD, BUF_3_POS_JJ - - ###################### Check The Transition ####################### - Check the Initialized Configuration - >>> print(conf) - Stack : [0] Buffer : [1, 2, 3, 4, 5, 6, 7, 8, 9] Arcs : [] - - A. Do some transition checks for ARC-STANDARD - - >>> operation = Transition('arc-standard') - >>> operation.shift(conf) - >>> operation.left_arc(conf, "ATT") - >>> operation.shift(conf) - >>> operation.left_arc(conf,"SBJ") - >>> operation.shift(conf) - >>> operation.shift(conf) - >>> operation.left_arc(conf, "ATT") - >>> operation.shift(conf) - >>> operation.shift(conf) - >>> operation.shift(conf) - >>> operation.left_arc(conf, "ATT") - - Middle Configuration and Features Check - >>> print(conf) - Stack : [0, 3, 5, 6] Buffer : [8, 9] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (5, 'ATT', 4), (8, 'ATT', 7)] - - >>> print(', '.join(conf.extract_features())) - STK_0_FORM_on, STK_0_LEMMA_on, STK_0_POS_IN, STK_1_POS_NN, BUF_0_FORM_markets, BUF_0_LEMMA_markets, BUF_0_POS_NNS, BUF_1_FORM_., BUF_1_POS_., BUF_0_LDEP_ATT - - >>> operation.right_arc(conf, "PC") - >>> operation.right_arc(conf, "ATT") - >>> operation.right_arc(conf, "OBJ") - >>> operation.shift(conf) - >>> operation.right_arc(conf, "PU") - >>> operation.right_arc(conf, "ROOT") - >>> operation.shift(conf) - - Terminated Configuration Check - >>> print(conf) - Stack : [0] Buffer : [] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (5, 'ATT', 4), (8, 'ATT', 7), (6, 'PC', 8), (5, 'ATT', 6), (3, 'OBJ', 5), (3, 'PU', 9), (0, 'ROOT', 3)] - - - B. Do some transition checks for ARC-EAGER - - >>> conf = Configuration(gold_sent) - >>> operation = Transition('arc-eager') - >>> operation.shift(conf) - >>> operation.left_arc(conf,'ATT') - >>> operation.shift(conf) - >>> operation.left_arc(conf,'SBJ') - >>> operation.right_arc(conf,'ROOT') - >>> operation.shift(conf) - >>> operation.left_arc(conf,'ATT') - >>> operation.right_arc(conf,'OBJ') - >>> operation.right_arc(conf,'ATT') - >>> operation.shift(conf) - >>> operation.left_arc(conf,'ATT') - >>> operation.right_arc(conf,'PC') - >>> operation.reduce(conf) - >>> operation.reduce(conf) - >>> operation.reduce(conf) - >>> operation.right_arc(conf,'PU') - >>> print(conf) - Stack : [0, 3, 9] Buffer : [] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (0, 'ROOT', 3), (5, 'ATT', 4), (3, 'OBJ', 5), (5, 'ATT', 6), (8, 'ATT', 7), (6, 'PC', 8), (3, 'PU', 9)] - - ###################### Check The Training Function ####################### - - A. 
Check the ARC-STANDARD training - >>> import tempfile - >>> import os - >>> input_file = tempfile.NamedTemporaryFile(prefix='transition_parse.train', dir=tempfile.gettempdir(), delete=False) - - >>> parser_std = TransitionParser('arc-standard') - >>> print(', '.join(parser_std._create_training_examples_arc_std([gold_sent], input_file))) - Number of training examples : 1 - Number of valid (projective) examples : 1 - SHIFT, LEFTARC:ATT, SHIFT, LEFTARC:SBJ, SHIFT, SHIFT, LEFTARC:ATT, SHIFT, SHIFT, SHIFT, LEFTARC:ATT, RIGHTARC:PC, RIGHTARC:ATT, RIGHTARC:OBJ, SHIFT, RIGHTARC:PU, RIGHTARC:ROOT, SHIFT - - >>> parser_std.train([gold_sent],'temp.arcstd.model', verbose=False) - Number of training examples : 1 - Number of valid (projective) examples : 1 - >>> input_file.close() - >>> remove(input_file.name) - - B. Check the ARC-EAGER training - - >>> input_file = tempfile.NamedTemporaryFile(prefix='transition_parse.train', dir=tempfile.gettempdir(),delete=False) - >>> parser_eager = TransitionParser('arc-eager') - >>> print(', '.join(parser_eager._create_training_examples_arc_eager([gold_sent], input_file))) - Number of training examples : 1 - Number of valid (projective) examples : 1 - SHIFT, LEFTARC:ATT, SHIFT, LEFTARC:SBJ, RIGHTARC:ROOT, SHIFT, LEFTARC:ATT, RIGHTARC:OBJ, RIGHTARC:ATT, SHIFT, LEFTARC:ATT, RIGHTARC:PC, REDUCE, REDUCE, REDUCE, RIGHTARC:PU - - >>> parser_eager.train([gold_sent],'temp.arceager.model', verbose=False) - Number of training examples : 1 - Number of valid (projective) examples : 1 - - >>> input_file.close() - >>> remove(input_file.name) - - ###################### Check The Parsing Function ######################## - - A. Check the ARC-STANDARD parser - - >>> result = parser_std.parse([gold_sent], 'temp.arcstd.model') - >>> de = DependencyEvaluator(result, [gold_sent]) - >>> de.eval() >= (0, 0) - True - - B. Check the ARC-EAGER parser - >>> result = parser_eager.parse([gold_sent], 'temp.arceager.model') - >>> de = DependencyEvaluator(result, [gold_sent]) - >>> de.eval() >= (0, 0) - True - - Remove test temporary files - >>> remove('temp.arceager.model') - >>> remove('temp.arcstd.model') - - Note that result is very poor because of only one training example. - """ diff --git a/pipeline/nltk/parse/util.py b/pipeline/nltk/parse/util.py deleted file mode 100644 index 3cc5bee08fdb9aa237513992a36fa2eaa0aa8219..0000000000000000000000000000000000000000 --- a/pipeline/nltk/parse/util.py +++ /dev/null @@ -1,234 +0,0 @@ -# Natural Language Toolkit: Parser Utility Functions -# -# Author: Ewan Klein -# Tom Aarsen <> -# -# Copyright (C) 2001-2023 NLTK Project -# URL: -# For license information, see LICENSE.TXT - - -""" -Utility functions for parsers. -""" - -from nltk.data import load -from nltk.grammar import CFG, PCFG, FeatureGrammar -from nltk.parse.chart import Chart, ChartParser -from nltk.parse.featurechart import FeatureChart, FeatureChartParser -from nltk.parse.pchart import InsideChartParser - - -def load_parser( - grammar_url, trace=0, parser=None, chart_class=None, beam_size=0, **load_args -): - """ - Load a grammar from a file, and build a parser based on that grammar. - The parser depends on the grammar format, and might also depend - on properties of the grammar itself. - - The following grammar formats are currently supported: - - ``'cfg'`` (CFGs: ``CFG``) - - ``'pcfg'`` (probabilistic CFGs: ``PCFG``) - - ``'fcfg'`` (feature-based CFGs: ``FeatureGrammar``) - - :type grammar_url: str - :param grammar_url: A URL specifying where the grammar is located. 
- The default protocol is ``"nltk:"``, which searches for the file - in the the NLTK data package. - :type trace: int - :param trace: The level of tracing that should be used when - parsing a text. ``0`` will generate no tracing output; - and higher numbers will produce more verbose tracing output. - :param parser: The class used for parsing; should be ``ChartParser`` - or a subclass. - If None, the class depends on the grammar format. - :param chart_class: The class used for storing the chart; - should be ``Chart`` or a subclass. - Only used for CFGs and feature CFGs. - If None, the chart class depends on the grammar format. - :type beam_size: int - :param beam_size: The maximum length for the parser's edge queue. - Only used for probabilistic CFGs. - :param load_args: Keyword parameters used when loading the grammar. - See ``data.load`` for more information. - """ - grammar = load(grammar_url, **load_args) - if not isinstance(grammar, CFG): - raise ValueError("The grammar must be a CFG, " "or a subclass thereof.") - if isinstance(grammar, PCFG): - if parser is None: - parser = InsideChartParser - return parser(grammar, trace=trace, beam_size=beam_size) - - elif isinstance(grammar, FeatureGrammar): - if parser is None: - parser = FeatureChartParser - if chart_class is None: - chart_class = FeatureChart - return parser(grammar, trace=trace, chart_class=chart_class) - - else: # Plain CFG. - if parser is None: - parser = ChartParser - if chart_class is None: - chart_class = Chart - return parser(grammar, trace=trace, chart_class=chart_class) - - -def taggedsent_to_conll(sentence): - """ - A module to convert a single POS tagged sentence into CONLL format. - - >>> from nltk import word_tokenize, pos_tag - >>> text = "This is a foobar sentence." - >>> for line in taggedsent_to_conll(pos_tag(word_tokenize(text))): # doctest: +NORMALIZE_WHITESPACE - ... print(line, end="") - 1 This _ DT DT _ 0 a _ _ - 2 is _ VBZ VBZ _ 0 a _ _ - 3 a _ DT DT _ 0 a _ _ - 4 foobar _ JJ JJ _ 0 a _ _ - 5 sentence _ NN NN _ 0 a _ _ - 6 . _ . . _ 0 a _ _ - - :param sentence: A single input sentence to parse - :type sentence: list(tuple(str, str)) - :rtype: iter(str) - :return: a generator yielding a single sentence in CONLL format. - """ - for (i, (word, tag)) in enumerate(sentence, start=1): - input_str = [str(i), word, "_", tag, tag, "_", "0", "a", "_", "_"] - input_str = "\t".join(input_str) + "\n" - yield input_str - - -def taggedsents_to_conll(sentences): - """ - A module to convert the a POS tagged document stream - (i.e. list of list of tuples, a list of sentences) and yield lines - in CONLL format. This module yields one line per word and two newlines - for end of sentence. - - >>> from nltk import word_tokenize, sent_tokenize, pos_tag - >>> text = "This is a foobar sentence. Is that right?" - >>> sentences = [pos_tag(word_tokenize(sent)) for sent in sent_tokenize(text)] - >>> for line in taggedsents_to_conll(sentences): # doctest: +NORMALIZE_WHITESPACE - ... if line: - ... print(line, end="") - 1 This _ DT DT _ 0 a _ _ - 2 is _ VBZ VBZ _ 0 a _ _ - 3 a _ DT DT _ 0 a _ _ - 4 foobar _ JJ JJ _ 0 a _ _ - 5 sentence _ NN NN _ 0 a _ _ - 6 . _ . . _ 0 a _ _ - - - 1 Is _ VBZ VBZ _ 0 a _ _ - 2 that _ IN IN _ 0 a _ _ - 3 right _ NN NN _ 0 a _ _ - 4 ? _ . . _ 0 a _ _ - - - - :param sentences: Input sentences to parse - :type sentence: list(list(tuple(str, str))) - :rtype: iter(str) - :return: a generator yielding sentences in CONLL format. 
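# Illustrative sketch of load_parser() defined above: assumes the nltk_data
# book_grammars package is installed; the feature grammar path and the sentence
# come from the NLTK book examples.
from nltk.parse.util import load_parser

cp = load_parser("grammars/book_grammars/feat0.fcfg", trace=0)
for tree in cp.parse("Kim likes children".split()):
    print(tree)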
- """ - for sentence in sentences: - yield from taggedsent_to_conll(sentence) - yield "\n\n" - - -###################################################################### -# { Test Suites -###################################################################### - - -class TestGrammar: - """ - Unit tests for CFG. - """ - - def __init__(self, grammar, suite, accept=None, reject=None): - self.test_grammar = grammar - - self.cp = load_parser(grammar, trace=0) - self.suite = suite - self._accept = accept - self._reject = reject - - def run(self, show_trees=False): - """ - Sentences in the test suite are divided into two classes: - - - grammatical (``accept``) and - - ungrammatical (``reject``). - - If a sentence should parse according to the grammar, the value of - ``trees`` will be a non-empty list. If a sentence should be rejected - according to the grammar, then the value of ``trees`` will be None. - """ - for test in self.suite: - print(test["doc"] + ":", end=" ") - for key in ["accept", "reject"]: - for sent in test[key]: - tokens = sent.split() - trees = list(self.cp.parse(tokens)) - if show_trees and trees: - print() - print(sent) - for tree in trees: - print(tree) - if key == "accept": - if trees == []: - raise ValueError("Sentence '%s' failed to parse'" % sent) - else: - accepted = True - else: - if trees: - raise ValueError("Sentence '%s' received a parse'" % sent) - else: - rejected = True - if accepted and rejected: - print("All tests passed!") - - -def extract_test_sentences(string, comment_chars="#%;", encoding=None): - """ - Parses a string with one test sentence per line. - Lines can optionally begin with: - - - a bool, saying if the sentence is grammatical or not, or - - an int, giving the number of parse trees is should have, - - The result information is followed by a colon, and then the sentence. - Empty lines and lines beginning with a comment char are ignored. - - :return: a list of tuple of sentences and expected results, - where a sentence is a list of str, - and a result is None, or bool, or int - - :param comment_chars: ``str`` of possible comment characters. 
- :param encoding: the encoding of the string, if it is binary - """ - if encoding is not None: - string = string.decode(encoding) - sentences = [] - for sentence in string.split("\n"): - if sentence == "" or sentence[0] in comment_chars: - continue - split_info = sentence.split(":", 1) - result = None - if len(split_info) == 2: - if split_info[0] in ["True", "true", "False", "false"]: - result = split_info[0] in ["True", "true"] - sentence = split_info[1] - else: - result = int(split_info[0]) - sentence = split_info[1] - tokens = sentence.split() - if tokens == []: - continue - sentences += [(tokens, result)] - return sentences diff --git a/pipeline/nltk/parse/viterbi.py b/pipeline/nltk/parse/viterbi.py deleted file mode 100644 index 8a3e9de30432a65828463e32e6ea7bff27b7c5ee..0000000000000000000000000000000000000000 --- a/pipeline/nltk/parse/viterbi.py +++ /dev/null @@ -1,453 +0,0 @@ -# Natural Language Toolkit: Viterbi Probabilistic Parser -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird -# URL: -# For license information, see LICENSE.TXT - -from functools import reduce - -from nltk.parse.api import ParserI -from nltk.tree import ProbabilisticTree, Tree - -##////////////////////////////////////////////////////// -## Viterbi PCFG Parser -##////////////////////////////////////////////////////// - - -class ViterbiParser(ParserI): - """ - A bottom-up ``PCFG`` parser that uses dynamic programming to find - the single most likely parse for a text. The ``ViterbiParser`` parser - parses texts by filling in a "most likely constituent table". - This table records the most probable tree representation for any - given span and node value. In particular, it has an entry for - every start index, end index, and node value, recording the most - likely subtree that spans from the start index to the end index, - and has the given node value. - - The ``ViterbiParser`` parser fills in this table incrementally. It starts - by filling in all entries for constituents that span one element - of text (i.e., entries where the end index is one greater than the - start index). After it has filled in all table entries for - constituents that span one element of text, it fills in the - entries for constitutants that span two elements of text. It - continues filling in the entries for constituents spanning larger - and larger portions of the text, until the entire table has been - filled. Finally, it returns the table entry for a constituent - spanning the entire text, whose node value is the grammar's start - symbol. - - In order to find the most likely constituent with a given span and - node value, the ``ViterbiParser`` parser considers all productions that - could produce that node value. For each production, it finds all - children that collectively cover the span and have the node values - specified by the production's right hand side. If the probability - of the tree formed by applying the production to the children is - greater than the probability of the current entry in the table, - then the table is updated with this new tree. - - A pseudo-code description of the algorithm used by - ``ViterbiParser`` is: - - | Create an empty most likely constituent table, *MLC*. 
- | For width in 1...len(text): - | For start in 1...len(text)-width: - | For prod in grammar.productions: - | For each sequence of subtrees [t[1], t[2], ..., t[n]] in MLC, - | where t[i].label()==prod.rhs[i], - | and the sequence covers [start:start+width]: - | old_p = MLC[start, start+width, prod.lhs] - | new_p = P(t[1])P(t[1])...P(t[n])P(prod) - | if new_p > old_p: - | new_tree = Tree(prod.lhs, t[1], t[2], ..., t[n]) - | MLC[start, start+width, prod.lhs] = new_tree - | Return MLC[0, len(text), start_symbol] - - :type _grammar: PCFG - :ivar _grammar: The grammar used to parse sentences. - :type _trace: int - :ivar _trace: The level of tracing output that should be generated - when parsing a text. - """ - - def __init__(self, grammar, trace=0): - """ - Create a new ``ViterbiParser`` parser, that uses ``grammar`` to - parse texts. - - :type grammar: PCFG - :param grammar: The grammar used to parse texts. - :type trace: int - :param trace: The level of tracing that should be used when - parsing a text. ``0`` will generate no tracing output; - and higher numbers will produce more verbose tracing - output. - """ - self._grammar = grammar - self._trace = trace - - def grammar(self): - return self._grammar - - def trace(self, trace=2): - """ - Set the level of tracing output that should be generated when - parsing a text. - - :type trace: int - :param trace: The trace level. A trace level of ``0`` will - generate no tracing output; and higher trace levels will - produce more verbose tracing output. - :rtype: None - """ - self._trace = trace - - def parse(self, tokens): - # Inherit docs from ParserI - - tokens = list(tokens) - self._grammar.check_coverage(tokens) - - # The most likely constituent table. This table specifies the - # most likely constituent for a given span and type. - # Constituents can be either Trees or tokens. For Trees, - # the "type" is the Nonterminal for the tree's root node - # value. For Tokens, the "type" is the token's type. - # The table is stored as a dictionary, since it is sparse. - constituents = {} - - # Initialize the constituents dictionary with the words from - # the text. - if self._trace: - print("Inserting tokens into the most likely" + " constituents table...") - for index in range(len(tokens)): - token = tokens[index] - constituents[index, index + 1, token] = token - if self._trace > 1: - self._trace_lexical_insertion(token, index, len(tokens)) - - # Consider each span of length 1, 2, ..., n; and add any trees - # that might cover that span to the constituents dictionary. - for length in range(1, len(tokens) + 1): - if self._trace: - print( - "Finding the most likely constituents" - + " spanning %d text elements..." % length - ) - for start in range(len(tokens) - length + 1): - span = (start, start + length) - self._add_constituents_spanning(span, constituents, tokens) - - # Return the tree that spans the entire text & have the right cat - tree = constituents.get((0, len(tokens), self._grammar.start())) - if tree is not None: - yield tree - - def _add_constituents_spanning(self, span, constituents, tokens): - """ - Find any constituents that might cover ``span``, and add them - to the most likely constituents table. - - :rtype: None - :type span: tuple(int, int) - :param span: The section of the text for which we are - trying to find possible constituents. 
The span is - specified as a pair of integers, where the first integer - is the index of the first token that should be included in - the constituent; and the second integer is the index of - the first token that should not be included in the - constituent. I.e., the constituent should cover - ``text[span[0]:span[1]]``, where ``text`` is the text - that we are parsing. - - :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree) - :param constituents: The most likely constituents table. This - table records the most probable tree representation for - any given span and node value. In particular, - ``constituents(s,e,nv)`` is the most likely - ``ProbabilisticTree`` that covers ``text[s:e]`` - and has a node value ``nv.symbol()``, where ``text`` - is the text that we are parsing. When - ``_add_constituents_spanning`` is called, ``constituents`` - should contain all possible constituents that are shorter - than ``span``. - - :type tokens: list of tokens - :param tokens: The text we are parsing. This is only used for - trace output. - """ - # Since some of the grammar productions may be unary, we need to - # repeatedly try all of the productions until none of them add any - # new constituents. - changed = True - while changed: - changed = False - - # Find all ways instantiations of the grammar productions that - # cover the span. - instantiations = self._find_instantiations(span, constituents) - - # For each production instantiation, add a new - # ProbabilisticTree whose probability is the product - # of the childrens' probabilities and the production's - # probability. - for (production, children) in instantiations: - subtrees = [c for c in children if isinstance(c, Tree)] - p = reduce(lambda pr, t: pr * t.prob(), subtrees, production.prob()) - node = production.lhs().symbol() - tree = ProbabilisticTree(node, children, prob=p) - - # If it's new a constituent, then add it to the - # constituents dictionary. - c = constituents.get((span[0], span[1], production.lhs())) - if self._trace > 1: - if c is None or c != tree: - if c is None or c.prob() < tree.prob(): - print(" Insert:", end=" ") - else: - print(" Discard:", end=" ") - self._trace_production(production, p, span, len(tokens)) - if c is None or c.prob() < tree.prob(): - constituents[span[0], span[1], production.lhs()] = tree - changed = True - - def _find_instantiations(self, span, constituents): - """ - :return: a list of the production instantiations that cover a - given span of the text. A "production instantiation" is - a tuple containing a production and a list of children, - where the production's right hand side matches the list of - children; and the children cover ``span``. :rtype: list - of ``pair`` of ``Production``, (list of - (``ProbabilisticTree`` or token. - - :type span: tuple(int, int) - :param span: The section of the text for which we are - trying to find production instantiations. The span is - specified as a pair of integers, where the first integer - is the index of the first token that should be covered by - the production instantiation; and the second integer is - the index of the first token that should not be covered by - the production instantiation. - :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree) - :param constituents: The most likely constituents table. This - table records the most probable tree representation for - any given span and node value. See the module - documentation for more information. 
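
A compact, standalone sketch of the "most likely constituent table" dynamic programme described above, restricted for brevity to binary plus lexical rules (the parser itself matches arbitrary right-hand sides via ``_match_rhs``); the toy grammar and probabilities are illustrative only:

from collections import defaultdict

# Illustrative toy grammar (not part of this module).
lexical = {('NP', 'I'): 0.15, ('V', 'saw'): 0.65,
           ('Det', 'the'): 0.8, ('N', 'man'): 0.5}
binary = {('S', ('NP', 'VP')): 1.0, ('VP', ('V', 'NP')): 0.7,
          ('NP', ('Det', 'N')): 0.5}

def viterbi_chart(tokens):
    n = len(tokens)
    best = defaultdict(float)   # (start, end, label) -> best probability
    # Width-1 spans: insert the tokens, as parse() does above.
    for i, tok in enumerate(tokens):
        for (label, word), p in lexical.items():
            if word == tok:
                best[i, i + 1, label] = p
    # Wider spans: combine smaller constituents, keeping only the best entry.
    for width in range(2, n + 1):
        for start in range(n - width + 1):
            end = start + width
            for split in range(start + 1, end):
                for (label, (left, right)), p in binary.items():
                    q = p * best[start, split, left] * best[split, end, right]
                    if q > best[start, end, label]:
                        best[start, end, label] = q
    return best

chart = viterbi_chart("I saw the man".split())
print(chart[0, 4, 'S'])   # probability of the most likely S over the whole input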
- """ - rv = [] - for production in self._grammar.productions(): - childlists = self._match_rhs(production.rhs(), span, constituents) - - for childlist in childlists: - rv.append((production, childlist)) - return rv - - def _match_rhs(self, rhs, span, constituents): - """ - :return: a set of all the lists of children that cover ``span`` - and that match ``rhs``. - :rtype: list(list(ProbabilisticTree or token) - - :type rhs: list(Nonterminal or any) - :param rhs: The list specifying what kinds of children need to - cover ``span``. Each nonterminal in ``rhs`` specifies - that the corresponding child should be a tree whose node - value is that nonterminal's symbol. Each terminal in ``rhs`` - specifies that the corresponding child should be a token - whose type is that terminal. - :type span: tuple(int, int) - :param span: The section of the text for which we are - trying to find child lists. The span is specified as a - pair of integers, where the first integer is the index of - the first token that should be covered by the child list; - and the second integer is the index of the first token - that should not be covered by the child list. - :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree) - :param constituents: The most likely constituents table. This - table records the most probable tree representation for - any given span and node value. See the module - documentation for more information. - """ - (start, end) = span - - # Base case - if start >= end and rhs == (): - return [[]] - if start >= end or rhs == (): - return [] - - # Find everything that matches the 1st symbol of the RHS - childlists = [] - for split in range(start, end + 1): - l = constituents.get((start, split, rhs[0])) - if l is not None: - rights = self._match_rhs(rhs[1:], (split, end), constituents) - childlists += [[l] + r for r in rights] - - return childlists - - def _trace_production(self, production, p, span, width): - """ - Print trace output indicating that a given production has been - applied at a given location. - - :param production: The production that has been applied - :type production: Production - :param p: The probability of the tree produced by the production. - :type p: float - :param span: The span of the production - :type span: tuple - :rtype: None - """ - - str = "|" + "." * span[0] - str += "=" * (span[1] - span[0]) - str += "." * (width - span[1]) + "| " - str += "%s" % production - if self._trace > 2: - str = f"{str:<40} {p:12.10f} " - - print(str) - - def _trace_lexical_insertion(self, token, index, width): - str = " Insert: |" + "." * index + "=" + "." * (width - index - 1) + "| " - str += f"{token}" - print(str) - - def __repr__(self): - return "" % self._grammar - - -##////////////////////////////////////////////////////// -## Test Code -##////////////////////////////////////////////////////// - - -def demo(): - """ - A demonstration of the probabilistic parsers. The user is - prompted to select which demo to run, and how many parses should - be found; and then each parser is run on the same demo, and a - summary of the results are displayed. 
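
A minimal, non-interactive version of this demo, assuming only that ``nltk`` is installed; the grammar fragment is illustrative:

from nltk.grammar import PCFG
from nltk.parse import ViterbiParser

toy_grammar = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> 'I' [0.4] | Det N [0.6]
    Det -> 'the' [1.0]
    N -> 'man' [0.5] | 'telescope' [0.5]
    VP -> V NP [1.0]
    V -> 'saw' [1.0]
""")

parser = ViterbiParser(toy_grammar)
for tree in parser.parse("I saw the man".split()):
    # parse() yields at most one tree: the single most likely analysis.
    print(tree.prob())
    tree.pretty_print()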
- """ - import sys - import time - - from nltk import tokenize - from nltk.grammar import PCFG - from nltk.parse import ViterbiParser - - toy_pcfg1 = PCFG.fromstring( - """ - S -> NP VP [1.0] - NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15] - Det -> 'the' [0.8] | 'my' [0.2] - N -> 'man' [0.5] | 'telescope' [0.5] - VP -> VP PP [0.1] | V NP [0.7] | V [0.2] - V -> 'ate' [0.35] | 'saw' [0.65] - PP -> P NP [1.0] - P -> 'with' [0.61] | 'under' [0.39] - """ - ) - - toy_pcfg2 = PCFG.fromstring( - """ - S -> NP VP [1.0] - VP -> V NP [.59] - VP -> V [.40] - VP -> VP PP [.01] - NP -> Det N [.41] - NP -> Name [.28] - NP -> NP PP [.31] - PP -> P NP [1.0] - V -> 'saw' [.21] - V -> 'ate' [.51] - V -> 'ran' [.28] - N -> 'boy' [.11] - N -> 'cookie' [.12] - N -> 'table' [.13] - N -> 'telescope' [.14] - N -> 'hill' [.5] - Name -> 'Jack' [.52] - Name -> 'Bob' [.48] - P -> 'with' [.61] - P -> 'under' [.39] - Det -> 'the' [.41] - Det -> 'a' [.31] - Det -> 'my' [.28] - """ - ) - - # Define two demos. Each demo has a sentence and a grammar. - demos = [ - ("I saw the man with my telescope", toy_pcfg1), - ("the boy saw Jack with Bob under the table with a telescope", toy_pcfg2), - ] - - # Ask the user which demo they want to use. - print() - for i in range(len(demos)): - print(f"{i + 1:>3}: {demos[i][0]}") - print(" %r" % demos[i][1]) - print() - print("Which demo (%d-%d)? " % (1, len(demos)), end=" ") - try: - snum = int(sys.stdin.readline().strip()) - 1 - sent, grammar = demos[snum] - except: - print("Bad sentence number") - return - - # Tokenize the sentence. - tokens = sent.split() - - parser = ViterbiParser(grammar) - all_parses = {} - - print(f"\nsent: {sent}\nparser: {parser}\ngrammar: {grammar}") - parser.trace(3) - t = time.time() - parses = parser.parse_all(tokens) - time = time.time() - t - average = ( - reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) if parses else 0 - ) - num_parses = len(parses) - for p in parses: - all_parses[p.freeze()] = 1 - - # Print some summary statistics - print() - print("Time (secs) # Parses Average P(parse)") - print("-----------------------------------------") - print("%11.4f%11d%19.14f" % (time, num_parses, average)) - parses = all_parses.keys() - if parses: - p = reduce(lambda a, b: a + b.prob(), parses, 0) / len(parses) - else: - p = 0 - print("------------------------------------------") - print("%11s%11d%19.14f" % ("n/a", len(parses), p)) - - # Ask the user if we should draw the parses. - print() - print("Draw parses (y/n)? ", end=" ") - if sys.stdin.readline().strip().lower().startswith("y"): - from nltk.draw.tree import draw_trees - - print(" please wait...") - draw_trees(*parses) - - # Ask the user if we should print the parses. - print() - print("Print parses (y/n)? 
", end=" ") - if sys.stdin.readline().strip().lower().startswith("y"): - for parse in parses: - print(parse) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/probability.py b/pipeline/nltk/probability.py deleted file mode 100644 index a6de70732ac33e375c42d5e675aac124ffeafdf6..0000000000000000000000000000000000000000 --- a/pipeline/nltk/probability.py +++ /dev/null @@ -1,2578 +0,0 @@ -# Natural Language Toolkit: Probability and Statistics -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird (additions) -# Trevor Cohn (additions) -# Peter Ljunglöf (additions) -# Liang Dong (additions) -# Geoffrey Sampson (additions) -# Ilia Kurenkov (additions) -# -# URL: -# For license information, see LICENSE.TXT - -""" -Classes for representing and processing probabilistic information. - -The ``FreqDist`` class is used to encode "frequency distributions", -which count the number of times that each outcome of an experiment -occurs. - -The ``ProbDistI`` class defines a standard interface for "probability -distributions", which encode the probability of each outcome for an -experiment. There are two types of probability distribution: - - - "derived probability distributions" are created from frequency - distributions. They attempt to model the probability distribution - that generated the frequency distribution. - - "analytic probability distributions" are created directly from - parameters (such as variance). - -The ``ConditionalFreqDist`` class and ``ConditionalProbDistI`` interface -are used to encode conditional distributions. Conditional probability -distributions can be derived or analytic; but currently the only -implementation of the ``ConditionalProbDistI`` interface is -``ConditionalProbDist``, a derived distribution. - -""" - -import array -import math -import random -import warnings -from abc import ABCMeta, abstractmethod -from collections import Counter, defaultdict -from functools import reduce - -from nltk.internals import raise_unorderable_types - -_NINF = float("-1e300") - -##////////////////////////////////////////////////////// -## Frequency Distributions -##////////////////////////////////////////////////////// - - -class FreqDist(Counter): - """ - A frequency distribution for the outcomes of an experiment. A - frequency distribution records the number of times each outcome of - an experiment has occurred. For example, a frequency distribution - could be used to record the frequency of each word type in a - document. Formally, a frequency distribution can be defined as a - function mapping from each sample to the number of times that - sample occurred as an outcome. - - Frequency distributions are generally constructed by running a - number of experiments, and incrementing the count for a sample - every time it is an outcome of an experiment. For example, the - following code will produce a frequency distribution that encodes - how often each word occurs in a text: - - >>> from nltk.tokenize import word_tokenize - >>> from nltk.probability import FreqDist - >>> sent = 'This is an example sentence' - >>> fdist = FreqDist() - >>> for word in word_tokenize(sent): - ... fdist[word.lower()] += 1 - - An equivalent way to do this is with the initializer: - - >>> fdist = FreqDist(word.lower() for word in word_tokenize(sent)) - - """ - - def __init__(self, samples=None): - """ - Construct a new frequency distribution. 
If ``samples`` is - given, then the frequency distribution will be initialized - with the count of each object in ``samples``; otherwise, it - will be initialized to be empty. - - In particular, ``FreqDist()`` returns an empty frequency - distribution; and ``FreqDist(samples)`` first creates an empty - frequency distribution, and then calls ``update`` with the - list ``samples``. - - :param samples: The samples to initialize the frequency - distribution with. - :type samples: Sequence - """ - Counter.__init__(self, samples) - - # Cached number of samples in this FreqDist - self._N = None - - def N(self): - """ - Return the total number of sample outcomes that have been - recorded by this FreqDist. For the number of unique - sample values (or bins) with counts greater than zero, use - ``FreqDist.B()``. - - :rtype: int - """ - if self._N is None: - # Not already cached, or cache has been invalidated - self._N = sum(self.values()) - return self._N - - def __setitem__(self, key, val): - """ - Override ``Counter.__setitem__()`` to invalidate the cached N - """ - self._N = None - super().__setitem__(key, val) - - def __delitem__(self, key): - """ - Override ``Counter.__delitem__()`` to invalidate the cached N - """ - self._N = None - super().__delitem__(key) - - def update(self, *args, **kwargs): - """ - Override ``Counter.update()`` to invalidate the cached N - """ - self._N = None - super().update(*args, **kwargs) - - def setdefault(self, key, val): - """ - Override ``Counter.setdefault()`` to invalidate the cached N - """ - self._N = None - super().setdefault(key, val) - - def B(self): - """ - Return the total number of sample values (or "bins") that - have counts greater than zero. For the total - number of sample outcomes recorded, use ``FreqDist.N()``. - (FreqDist.B() is the same as len(FreqDist).) - - :rtype: int - """ - return len(self) - - def hapaxes(self): - """ - Return a list of all samples that occur once (hapax legomena) - - :rtype: list - """ - return [item for item in self if self[item] == 1] - - def Nr(self, r, bins=None): - return self.r_Nr(bins)[r] - - def r_Nr(self, bins=None): - """ - Return the dictionary mapping r to Nr, the number of samples with frequency r, where Nr > 0. - - :type bins: int - :param bins: The number of possible sample outcomes. ``bins`` - is used to calculate Nr(0). In particular, Nr(0) is - ``bins-self.B()``. If ``bins`` is not specified, it - defaults to ``self.B()`` (so Nr(0) will be 0). - :rtype: int - """ - - _r_Nr = defaultdict(int) - for count in self.values(): - _r_Nr[count] += 1 - - # Special case for Nr[0]: - _r_Nr[0] = bins - self.B() if bins is not None else 0 - - return _r_Nr - - def _cumulative_frequencies(self, samples): - """ - Return the cumulative frequencies of the specified samples. - If no samples are specified, all counts are returned, starting - with the largest. - - :param samples: the samples whose frequencies should be returned. - :type samples: any - :rtype: list(float) - """ - cf = 0.0 - for sample in samples: - cf += self[sample] - yield cf - - # slightly odd nomenclature freq() if FreqDist does counts and ProbDist does probs, - # here, freq() does probs - def freq(self, sample): - """ - Return the frequency of a given sample. The frequency of a - sample is defined as the count of that sample divided by the - total number of sample outcomes that have been recorded by - this FreqDist. The count of a sample is defined as the - number of times that sample outcome was recorded by this - FreqDist. 
Frequencies are always real numbers in the range - [0, 1]. - - :param sample: the sample whose frequency - should be returned. - :type sample: any - :rtype: float - """ - n = self.N() - if n == 0: - return 0 - return self[sample] / n - - def max(self): - """ - Return the sample with the greatest number of outcomes in this - frequency distribution. If two or more samples have the same - number of outcomes, return one of them; which sample is - returned is undefined. If no outcomes have occurred in this - frequency distribution, return None. - - :return: The sample with the maximum number of outcomes in this - frequency distribution. - :rtype: any or None - """ - if len(self) == 0: - raise ValueError( - "A FreqDist must have at least one sample before max is defined." - ) - return self.most_common(1)[0][0] - - def plot( - self, *args, title="", cumulative=False, percents=False, show=True, **kwargs - ): - """ - Plot samples from the frequency distribution - displaying the most frequent sample first. If an integer - parameter is supplied, stop after this many samples have been - plotted. For a cumulative plot, specify cumulative=True. Additional - ``**kwargs`` are passed to matplotlib's plot function. - (Requires Matplotlib to be installed.) - - :param title: The title for the graph. - :type title: str - :param cumulative: Whether the plot is cumulative. (default = False) - :type cumulative: bool - :param percents: Whether the plot uses percents instead of counts. (default = False) - :type percents: bool - :param show: Whether to show the plot, or only return the ax. - :type show: bool - """ - try: - import matplotlib.pyplot as plt - except ImportError as e: - raise ValueError( - "The plot function requires matplotlib to be installed." - "See https://matplotlib.org/" - ) from e - - if len(args) == 0: - args = [len(self)] - samples = [item for item, _ in self.most_common(*args)] - - if cumulative: - freqs = list(self._cumulative_frequencies(samples)) - ylabel = "Cumulative " - else: - freqs = [self[sample] for sample in samples] - ylabel = "" - - if percents: - freqs = [f / self.N() * 100 for f in freqs] - ylabel += "Percents" - else: - ylabel += "Counts" - - ax = plt.gca() - ax.grid(True, color="silver") - - if "linewidth" not in kwargs: - kwargs["linewidth"] = 2 - if title: - ax.set_title(title) - - ax.plot(freqs, **kwargs) - ax.set_xticks(range(len(samples))) - ax.set_xticklabels([str(s) for s in samples], rotation=90) - ax.set_xlabel("Samples") - ax.set_ylabel(ylabel) - - if show: - plt.show() - - return ax - - def tabulate(self, *args, **kwargs): - """ - Tabulate the given samples from the frequency distribution (cumulative), - displaying the most frequent sample first. If an integer - parameter is supplied, stop after this many samples have been - plotted. - - :param samples: The samples to plot (default is all samples) - :type samples: list - :param cumulative: A flag to specify whether the freqs are cumulative (default = False) - :type title: bool - """ - if len(args) == 0: - args = [len(self)] - samples = _get_kwarg( - kwargs, "samples", [item for item, _ in self.most_common(*args)] - ) - - cumulative = _get_kwarg(kwargs, "cumulative", False) - if cumulative: - freqs = list(self._cumulative_frequencies(samples)) - else: - freqs = [self[sample] for sample in samples] - # percents = [f * 100 for f in freqs] only in ProbDist? 
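
A short sketch of the counting interface described above (``N``, ``B``, ``freq``, ``hapaxes``); the sample sentence is arbitrary:

from nltk.probability import FreqDist

fd = FreqDist("the cat sat on the mat near the cat flap".split())
print(fd.N())             # 10 -- total outcomes recorded
print(fd.B())             # 7  -- distinct samples (bins)
print(fd['the'])          # 3  -- raw count
print(fd.freq('the'))     # 0.3 -- count / N
print(fd.hapaxes())       # samples seen exactly once
print(fd.most_common(2))  # [('the', 3), ('cat', 2)]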
- - width = max(len(f"{s}") for s in samples) - width = max(width, max(len("%d" % f) for f in freqs)) - - for i in range(len(samples)): - print("%*s" % (width, samples[i]), end=" ") - print() - for i in range(len(samples)): - print("%*d" % (width, freqs[i]), end=" ") - print() - - def copy(self): - """ - Create a copy of this frequency distribution. - - :rtype: FreqDist - """ - return self.__class__(self) - - # Mathematical operatiors - - def __add__(self, other): - """ - Add counts from two counters. - - >>> FreqDist('abbb') + FreqDist('bcc') - FreqDist({'b': 4, 'c': 2, 'a': 1}) - - """ - return self.__class__(super().__add__(other)) - - def __sub__(self, other): - """ - Subtract count, but keep only results with positive counts. - - >>> FreqDist('abbbc') - FreqDist('bccd') - FreqDist({'b': 2, 'a': 1}) - - """ - return self.__class__(super().__sub__(other)) - - def __or__(self, other): - """ - Union is the maximum of value in either of the input counters. - - >>> FreqDist('abbb') | FreqDist('bcc') - FreqDist({'b': 3, 'c': 2, 'a': 1}) - - """ - return self.__class__(super().__or__(other)) - - def __and__(self, other): - """ - Intersection is the minimum of corresponding counts. - - >>> FreqDist('abbb') & FreqDist('bcc') - FreqDist({'b': 1}) - - """ - return self.__class__(super().__and__(other)) - - def __le__(self, other): - """ - Returns True if this frequency distribution is a subset of the other - and for no key the value exceeds the value of the same key from - the other frequency distribution. - - The <= operator forms partial order and satisfying the axioms - reflexivity, antisymmetry and transitivity. - - >>> FreqDist('a') <= FreqDist('a') - True - >>> a = FreqDist('abc') - >>> b = FreqDist('aabc') - >>> (a <= b, b <= a) - (True, False) - >>> FreqDist('a') <= FreqDist('abcd') - True - >>> FreqDist('abc') <= FreqDist('xyz') - False - >>> FreqDist('xyz') <= FreqDist('abc') - False - >>> c = FreqDist('a') - >>> d = FreqDist('aa') - >>> e = FreqDist('aaa') - >>> c <= d and d <= e and c <= e - True - """ - if not isinstance(other, FreqDist): - raise_unorderable_types("<=", self, other) - return set(self).issubset(other) and all( - self[key] <= other[key] for key in self - ) - - def __ge__(self, other): - if not isinstance(other, FreqDist): - raise_unorderable_types(">=", self, other) - return set(self).issuperset(other) and all( - self[key] >= other[key] for key in other - ) - - __lt__ = lambda self, other: self <= other and not self == other - __gt__ = lambda self, other: self >= other and not self == other - - def __repr__(self): - """ - Return a string representation of this FreqDist. - - :rtype: string - """ - return self.pformat() - - def pprint(self, maxlen=10, stream=None): - """ - Print a string representation of this FreqDist to 'stream' - - :param maxlen: The maximum number of items to print - :type maxlen: int - :param stream: The stream to print to. stdout by default - """ - print(self.pformat(maxlen=maxlen), file=stream) - - def pformat(self, maxlen=10): - """ - Return a string representation of this FreqDist. - - :param maxlen: The maximum number of items to display - :type maxlen: int - :rtype: string - """ - items = ["{!r}: {!r}".format(*item) for item in self.most_common(maxlen)] - if len(self) > maxlen: - items.append("...") - return "FreqDist({{{0}}})".format(", ".join(items)) - - def __str__(self): - """ - Return a string representation of this FreqDist. 
- - :rtype: string - """ - return "" % (len(self), self.N()) - - def __iter__(self): - """ - Return an iterator which yields tokens ordered by frequency. - - :rtype: iterator - """ - for token, _ in self.most_common(self.B()): - yield token - - -##////////////////////////////////////////////////////// -## Probability Distributions -##////////////////////////////////////////////////////// - - -class ProbDistI(metaclass=ABCMeta): - """ - A probability distribution for the outcomes of an experiment. A - probability distribution specifies how likely it is that an - experiment will have any given outcome. For example, a - probability distribution could be used to predict the probability - that a token in a document will have a given type. Formally, a - probability distribution can be defined as a function mapping from - samples to nonnegative real numbers, such that the sum of every - number in the function's range is 1.0. A ``ProbDist`` is often - used to model the probability distribution of the experiment used - to generate a frequency distribution. - """ - - SUM_TO_ONE = True - """True if the probabilities of the samples in this probability - distribution will always sum to one.""" - - @abstractmethod - def __init__(self): - """ - Classes inheriting from ProbDistI should implement __init__. - """ - - @abstractmethod - def prob(self, sample): - """ - Return the probability for a given sample. Probabilities - are always real numbers in the range [0, 1]. - - :param sample: The sample whose probability - should be returned. - :type sample: any - :rtype: float - """ - - def logprob(self, sample): - """ - Return the base 2 logarithm of the probability for a given sample. - - :param sample: The sample whose probability - should be returned. - :type sample: any - :rtype: float - """ - # Default definition, in terms of prob() - p = self.prob(sample) - return math.log(p, 2) if p != 0 else _NINF - - @abstractmethod - def max(self): - """ - Return the sample with the greatest probability. If two or - more samples have the same probability, return one of them; - which sample is returned is undefined. - - :rtype: any - """ - - @abstractmethod - def samples(self): - """ - Return a list of all samples that have nonzero probabilities. - Use ``prob`` to find the probability of each sample. - - :rtype: list - """ - - # cf self.SUM_TO_ONE - def discount(self): - """ - Return the ratio by which counts are discounted on average: c*/c - - :rtype: float - """ - return 0.0 - - # Subclasses should define more efficient implementations of this, - # where possible. - def generate(self): - """ - Return a randomly selected sample from this probability distribution. - The probability of returning each sample ``samp`` is equal to - ``self.prob(samp)``. - """ - p = random.random() - p_init = p - for sample in self.samples(): - p -= self.prob(sample) - if p <= 0: - return sample - # allow for some rounding error: - if p < 0.0001: - return sample - # we *should* never get here - if self.SUM_TO_ONE: - warnings.warn( - "Probability distribution %r sums to %r; generate()" - " is returning an arbitrary sample." % (self, p_init - p) - ) - return random.choice(list(self.samples())) - - -class UniformProbDist(ProbDistI): - """ - A probability distribution that assigns equal probability to each - sample in a given set; and a zero probability to all other - samples. - """ - - def __init__(self, samples): - """ - Construct a new uniform probability distribution, that assigns - equal probability to each sample in ``samples``. 
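
A small sketch of the ``ProbDistI`` interface using two of the concrete distributions in this module (``UniformProbDist`` here and ``DictionaryProbDist`` just below); the sample values are arbitrary:

from nltk.probability import DictionaryProbDist, UniformProbDist

uniform = UniformProbDist(['a', 'b', 'c', 'd'])
print(uniform.prob('a'))      # 0.25
print(uniform.prob('z'))      # 0 for samples outside the set

weighted = DictionaryProbDist({'a': 2, 'b': 1, 'c': 1}, normalize=True)
print(weighted.prob('a'))     # 0.5 after normalisation
print(weighted.logprob('a'))  # -1.0 (base-2 logarithm)
print(weighted.max())         # 'a'
print(weighted.generate())    # random sample drawn according to prob()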
- - :param samples: The samples that should be given uniform - probability. - :type samples: list - :raise ValueError: If ``samples`` is empty. - """ - if len(samples) == 0: - raise ValueError( - "A Uniform probability distribution must " + "have at least one sample." - ) - self._sampleset = set(samples) - self._prob = 1.0 / len(self._sampleset) - self._samples = list(self._sampleset) - - def prob(self, sample): - return self._prob if sample in self._sampleset else 0 - - def max(self): - return self._samples[0] - - def samples(self): - return self._samples - - def __repr__(self): - return "" % len(self._sampleset) - - -class RandomProbDist(ProbDistI): - """ - Generates a random probability distribution whereby each sample - will be between 0 and 1 with equal probability (uniform random distribution. - Also called a continuous uniform distribution). - """ - - def __init__(self, samples): - if len(samples) == 0: - raise ValueError( - "A probability distribution must " + "have at least one sample." - ) - self._probs = self.unirand(samples) - self._samples = list(self._probs.keys()) - - @classmethod - def unirand(cls, samples): - """ - The key function that creates a randomized initial distribution - that still sums to 1. Set as a dictionary of prob values so that - it can still be passed to MutableProbDist and called with identical - syntax to UniformProbDist - """ - samples = set(samples) - randrow = [random.random() for i in range(len(samples))] - total = sum(randrow) - for i, x in enumerate(randrow): - randrow[i] = x / total - - total = sum(randrow) - if total != 1: - # this difference, if present, is so small (near NINF) that it - # can be subtracted from any element without risking probs not (0 1) - randrow[-1] -= total - 1 - - return {s: randrow[i] for i, s in enumerate(samples)} - - def max(self): - if not hasattr(self, "_max"): - self._max = max((p, v) for (v, p) in self._probs.items())[1] - return self._max - - def prob(self, sample): - return self._probs.get(sample, 0) - - def samples(self): - return self._samples - - def __repr__(self): - return "" % len(self._probs) - - -class DictionaryProbDist(ProbDistI): - """ - A probability distribution whose probabilities are directly - specified by a given dictionary. The given dictionary maps - samples to probabilities. - """ - - def __init__(self, prob_dict=None, log=False, normalize=False): - """ - Construct a new probability distribution from the given - dictionary, which maps values to probabilities (or to log - probabilities, if ``log`` is true). If ``normalize`` is - true, then the probability values are scaled by a constant - factor such that they sum to 1. - - If called without arguments, the resulting probability - distribution assigns zero probability to all values. - """ - - self._prob_dict = prob_dict.copy() if prob_dict is not None else {} - self._log = log - - # Normalize the distribution, if requested. - if normalize: - if len(prob_dict) == 0: - raise ValueError( - "A DictionaryProbDist must have at least one sample " - + "before it can be normalized." 
- ) - if log: - value_sum = sum_logs(list(self._prob_dict.values())) - if value_sum <= _NINF: - logp = math.log(1.0 / len(prob_dict), 2) - for x in prob_dict: - self._prob_dict[x] = logp - else: - for (x, p) in self._prob_dict.items(): - self._prob_dict[x] -= value_sum - else: - value_sum = sum(self._prob_dict.values()) - if value_sum == 0: - p = 1.0 / len(prob_dict) - for x in prob_dict: - self._prob_dict[x] = p - else: - norm_factor = 1.0 / value_sum - for (x, p) in self._prob_dict.items(): - self._prob_dict[x] *= norm_factor - - def prob(self, sample): - if self._log: - return 2 ** (self._prob_dict[sample]) if sample in self._prob_dict else 0 - else: - return self._prob_dict.get(sample, 0) - - def logprob(self, sample): - if self._log: - return self._prob_dict.get(sample, _NINF) - else: - if sample not in self._prob_dict: - return _NINF - elif self._prob_dict[sample] == 0: - return _NINF - else: - return math.log(self._prob_dict[sample], 2) - - def max(self): - if not hasattr(self, "_max"): - self._max = max((p, v) for (v, p) in self._prob_dict.items())[1] - return self._max - - def samples(self): - return self._prob_dict.keys() - - def __repr__(self): - return "" % len(self._prob_dict) - - -class MLEProbDist(ProbDistI): - """ - The maximum likelihood estimate for the probability distribution - of the experiment used to generate a frequency distribution. The - "maximum likelihood estimate" approximates the probability of - each sample as the frequency of that sample in the frequency - distribution. - """ - - def __init__(self, freqdist, bins=None): - """ - Use the maximum likelihood estimate to create a probability - distribution for the experiment used to generate ``freqdist``. - - :type freqdist: FreqDist - :param freqdist: The frequency distribution that the - probability estimates should be based on. - """ - self._freqdist = freqdist - - def freqdist(self): - """ - Return the frequency distribution that this probability - distribution is based on. - - :rtype: FreqDist - """ - return self._freqdist - - def prob(self, sample): - return self._freqdist.freq(sample) - - def max(self): - return self._freqdist.max() - - def samples(self): - return self._freqdist.keys() - - def __repr__(self): - """ - :rtype: str - :return: A string representation of this ``ProbDist``. - """ - return "" % self._freqdist.N() - - -class LidstoneProbDist(ProbDistI): - """ - The Lidstone estimate for the probability distribution of the - experiment used to generate a frequency distribution. The - "Lidstone estimate" is parameterized by a real number *gamma*, - which typically ranges from 0 to 1. The Lidstone estimate - approximates the probability of a sample with count *c* from an - experiment with *N* outcomes and *B* bins as - ``c+gamma)/(N+B*gamma)``. This is equivalent to adding - *gamma* to the count for each bin, and taking the maximum - likelihood estimate of the resulting frequency distribution. - """ - - SUM_TO_ONE = False - - def __init__(self, freqdist, gamma, bins=None): - """ - Use the Lidstone estimate to create a probability distribution - for the experiment used to generate ``freqdist``. - - :type freqdist: FreqDist - :param freqdist: The frequency distribution that the - probability estimates should be based on. - :type gamma: float - :param gamma: A real number used to parameterize the - estimate. The Lidstone estimate is equivalent to adding - *gamma* to the count for each bin, and taking the - maximum likelihood estimate of the resulting frequency - distribution. 
- :type bins: int - :param bins: The number of sample values that can be generated - by the experiment that is described by the probability - distribution. This value must be correctly set for the - probabilities of the sample values to sum to one. If - ``bins`` is not specified, it defaults to ``freqdist.B()``. - """ - if (bins == 0) or (bins is None and freqdist.N() == 0): - name = self.__class__.__name__[:-8] - raise ValueError( - "A %s probability distribution " % name + "must have at least one bin." - ) - if (bins is not None) and (bins < freqdist.B()): - name = self.__class__.__name__[:-8] - raise ValueError( - "\nThe number of bins in a %s distribution " % name - + "(%d) must be greater than or equal to\n" % bins - + "the number of bins in the FreqDist used " - + "to create it (%d)." % freqdist.B() - ) - - self._freqdist = freqdist - self._gamma = float(gamma) - self._N = self._freqdist.N() - - if bins is None: - bins = freqdist.B() - self._bins = bins - - self._divisor = self._N + bins * gamma - if self._divisor == 0.0: - # In extreme cases we force the probability to be 0, - # which it will be, since the count will be 0: - self._gamma = 0 - self._divisor = 1 - - def freqdist(self): - """ - Return the frequency distribution that this probability - distribution is based on. - - :rtype: FreqDist - """ - return self._freqdist - - def prob(self, sample): - c = self._freqdist[sample] - return (c + self._gamma) / self._divisor - - def max(self): - # For Lidstone distributions, probability is monotonic with - # frequency, so the most probable sample is the one that - # occurs most frequently. - return self._freqdist.max() - - def samples(self): - return self._freqdist.keys() - - def discount(self): - gb = self._gamma * self._bins - return gb / (self._N + gb) - - def __repr__(self): - """ - Return a string representation of this ``ProbDist``. - - :rtype: str - """ - return "" % self._freqdist.N() - - -class LaplaceProbDist(LidstoneProbDist): - """ - The Laplace estimate for the probability distribution of the - experiment used to generate a frequency distribution. The - "Laplace estimate" approximates the probability of a sample with - count *c* from an experiment with *N* outcomes and *B* bins as - *(c+1)/(N+B)*. This is equivalent to adding one to the count for - each bin, and taking the maximum likelihood estimate of the - resulting frequency distribution. - """ - - def __init__(self, freqdist, bins=None): - """ - Use the Laplace estimate to create a probability distribution - for the experiment used to generate ``freqdist``. - - :type freqdist: FreqDist - :param freqdist: The frequency distribution that the - probability estimates should be based on. - :type bins: int - :param bins: The number of sample values that can be generated - by the experiment that is described by the probability - distribution. This value must be correctly set for the - probabilities of the sample values to sum to one. If - ``bins`` is not specified, it defaults to ``freqdist.B()``. - """ - LidstoneProbDist.__init__(self, freqdist, 1, bins) - - def __repr__(self): - """ - :rtype: str - :return: A string representation of this ``ProbDist``. - """ - return "" % self._freqdist.N() - - -class ELEProbDist(LidstoneProbDist): - """ - The expected likelihood estimate for the probability distribution - of the experiment used to generate a frequency distribution. 
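
A worked sketch of the Lidstone family, of which the Laplace (gamma = 1) and expected likelihood (gamma = 0.5) estimates are special cases; the counts and bin numbers are arbitrary:

from nltk.probability import ELEProbDist, FreqDist, LaplaceProbDist, LidstoneProbDist

fd = FreqDist('aaabbc')                        # a:3, b:2, c:1 -> N=6, B=3
lid = LidstoneProbDist(fd, gamma=0.1, bins=4)
lap = LaplaceProbDist(fd, bins=4)
ele = ELEProbDist(fd, bins=4)

print(lid.prob('a'))   # (3 + 0.1) / (6 + 4*0.1) = 0.484375
print(lap.prob('d'))   # unseen: (0 + 1) / (6 + 4*1) = 0.1
print(ele.prob('c'))   # (1 + 0.5) / (6 + 4*0.5) = 0.1875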
The - "expected likelihood estimate" approximates the probability of a - sample with count *c* from an experiment with *N* outcomes and - *B* bins as *(c+0.5)/(N+B/2)*. This is equivalent to adding 0.5 - to the count for each bin, and taking the maximum likelihood - estimate of the resulting frequency distribution. - """ - - def __init__(self, freqdist, bins=None): - """ - Use the expected likelihood estimate to create a probability - distribution for the experiment used to generate ``freqdist``. - - :type freqdist: FreqDist - :param freqdist: The frequency distribution that the - probability estimates should be based on. - :type bins: int - :param bins: The number of sample values that can be generated - by the experiment that is described by the probability - distribution. This value must be correctly set for the - probabilities of the sample values to sum to one. If - ``bins`` is not specified, it defaults to ``freqdist.B()``. - """ - LidstoneProbDist.__init__(self, freqdist, 0.5, bins) - - def __repr__(self): - """ - Return a string representation of this ``ProbDist``. - - :rtype: str - """ - return "" % self._freqdist.N() - - -class HeldoutProbDist(ProbDistI): - """ - The heldout estimate for the probability distribution of the - experiment used to generate two frequency distributions. These - two frequency distributions are called the "heldout frequency - distribution" and the "base frequency distribution." The - "heldout estimate" uses uses the "heldout frequency - distribution" to predict the probability of each sample, given its - frequency in the "base frequency distribution". - - In particular, the heldout estimate approximates the probability - for a sample that occurs *r* times in the base distribution as - the average frequency in the heldout distribution of all samples - that occur *r* times in the base distribution. - - This average frequency is *Tr[r]/(Nr[r].N)*, where: - - - *Tr[r]* is the total count in the heldout distribution for - all samples that occur *r* times in the base distribution. - - *Nr[r]* is the number of samples that occur *r* times in - the base distribution. - - *N* is the number of outcomes recorded by the heldout - frequency distribution. - - In order to increase the efficiency of the ``prob`` member - function, *Tr[r]/(Nr[r].N)* is precomputed for each value of *r* - when the ``HeldoutProbDist`` is created. - - :type _estimate: list(float) - :ivar _estimate: A list mapping from *r*, the number of - times that a sample occurs in the base distribution, to the - probability estimate for that sample. ``_estimate[r]`` is - calculated by finding the average frequency in the heldout - distribution of all samples that occur *r* times in the base - distribution. In particular, ``_estimate[r]`` = - *Tr[r]/(Nr[r].N)*. - :type _max_r: int - :ivar _max_r: The maximum number of times that any sample occurs - in the base distribution. ``_max_r`` is used to decide how - large ``_estimate`` must be. - """ - - SUM_TO_ONE = False - - def __init__(self, base_fdist, heldout_fdist, bins=None): - """ - Use the heldout estimate to create a probability distribution - for the experiment used to generate ``base_fdist`` and - ``heldout_fdist``. - - :type base_fdist: FreqDist - :param base_fdist: The base frequency distribution. - :type heldout_fdist: FreqDist - :param heldout_fdist: The heldout frequency distribution. - :type bins: int - :param bins: The number of sample values that can be generated - by the experiment that is described by the probability - distribution. 
This value must be correctly set for the - probabilities of the sample values to sum to one. If - ``bins`` is not specified, it defaults to ``freqdist.B()``. - """ - - self._base_fdist = base_fdist - self._heldout_fdist = heldout_fdist - - # The max number of times any sample occurs in base_fdist. - self._max_r = base_fdist[base_fdist.max()] - - # Calculate Tr, Nr, and N. - Tr = self._calculate_Tr() - r_Nr = base_fdist.r_Nr(bins) - Nr = [r_Nr[r] for r in range(self._max_r + 1)] - N = heldout_fdist.N() - - # Use Tr, Nr, and N to compute the probability estimate for - # each value of r. - self._estimate = self._calculate_estimate(Tr, Nr, N) - - def _calculate_Tr(self): - """ - Return the list *Tr*, where *Tr[r]* is the total count in - ``heldout_fdist`` for all samples that occur *r* - times in ``base_fdist``. - - :rtype: list(float) - """ - Tr = [0.0] * (self._max_r + 1) - for sample in self._heldout_fdist: - r = self._base_fdist[sample] - Tr[r] += self._heldout_fdist[sample] - return Tr - - def _calculate_estimate(self, Tr, Nr, N): - """ - Return the list *estimate*, where *estimate[r]* is the probability - estimate for any sample that occurs *r* times in the base frequency - distribution. In particular, *estimate[r]* is *Tr[r]/(N[r].N)*. - In the special case that *N[r]=0*, *estimate[r]* will never be used; - so we define *estimate[r]=None* for those cases. - - :rtype: list(float) - :type Tr: list(float) - :param Tr: the list *Tr*, where *Tr[r]* is the total count in - the heldout distribution for all samples that occur *r* - times in base distribution. - :type Nr: list(float) - :param Nr: The list *Nr*, where *Nr[r]* is the number of - samples that occur *r* times in the base distribution. - :type N: int - :param N: The total number of outcomes recorded by the heldout - frequency distribution. - """ - estimate = [] - for r in range(self._max_r + 1): - if Nr[r] == 0: - estimate.append(None) - else: - estimate.append(Tr[r] / (Nr[r] * N)) - return estimate - - def base_fdist(self): - """ - Return the base frequency distribution that this probability - distribution is based on. - - :rtype: FreqDist - """ - return self._base_fdist - - def heldout_fdist(self): - """ - Return the heldout frequency distribution that this - probability distribution is based on. - - :rtype: FreqDist - """ - return self._heldout_fdist - - def samples(self): - return self._base_fdist.keys() - - def prob(self, sample): - # Use our precomputed probability estimate. - r = self._base_fdist[sample] - return self._estimate[r] - - def max(self): - # Note: the Heldout estimation is *not* necessarily monotonic; - # so this implementation is currently broken. However, it - # should give the right answer *most* of the time. :) - return self._base_fdist.max() - - def discount(self): - raise NotImplementedError() - - def __repr__(self): - """ - :rtype: str - :return: A string representation of this ``ProbDist``. - """ - s = "" - return s % (self._base_fdist.N(), self._heldout_fdist.N()) - - -class CrossValidationProbDist(ProbDistI): - """ - The cross-validation estimate for the probability distribution of - the experiment used to generate a set of frequency distribution. - The "cross-validation estimate" for the probability of a sample - is found by averaging the held-out estimates for the sample in - each pair of frequency distributions. 
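
A small sketch of the heldout estimate described above; the two toy frequency distributions are arbitrary:

from nltk.probability import FreqDist, HeldoutProbDist

base = FreqDist('aaabbc')        # a:3, b:2, c:1
heldout = FreqDist('aabbbc')     # a:2, b:3, c:1
hd = HeldoutProbDist(base, heldout, bins=4)

# 'c' occurs r=1 time in the base distribution. Tr[1] is heldout['c'] = 1,
# Nr[1] = 1 and N = heldout.N() = 6, so the estimate is 1 / (1 * 6).
print(hd.prob('c'))
print(hd.prob('a'))              # estimate shared by all samples with r = 3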
- """ - - SUM_TO_ONE = False - - def __init__(self, freqdists, bins): - """ - Use the cross-validation estimate to create a probability - distribution for the experiment used to generate - ``freqdists``. - - :type freqdists: list(FreqDist) - :param freqdists: A list of the frequency distributions - generated by the experiment. - :type bins: int - :param bins: The number of sample values that can be generated - by the experiment that is described by the probability - distribution. This value must be correctly set for the - probabilities of the sample values to sum to one. If - ``bins`` is not specified, it defaults to ``freqdist.B()``. - """ - self._freqdists = freqdists - - # Create a heldout probability distribution for each pair of - # frequency distributions in freqdists. - self._heldout_probdists = [] - for fdist1 in freqdists: - for fdist2 in freqdists: - if fdist1 is not fdist2: - probdist = HeldoutProbDist(fdist1, fdist2, bins) - self._heldout_probdists.append(probdist) - - def freqdists(self): - """ - Return the list of frequency distributions that this ``ProbDist`` is based on. - - :rtype: list(FreqDist) - """ - return self._freqdists - - def samples(self): - # [xx] nb: this is not too efficient - return set(sum((list(fd) for fd in self._freqdists), [])) - - def prob(self, sample): - # Find the average probability estimate returned by each - # heldout distribution. - prob = 0.0 - for heldout_probdist in self._heldout_probdists: - prob += heldout_probdist.prob(sample) - return prob / len(self._heldout_probdists) - - def discount(self): - raise NotImplementedError() - - def __repr__(self): - """ - Return a string representation of this ``ProbDist``. - - :rtype: str - """ - return "" % len(self._freqdists) - - -class WittenBellProbDist(ProbDistI): - """ - The Witten-Bell estimate of a probability distribution. This distribution - allocates uniform probability mass to as yet unseen events by using the - number of events that have only been seen once. The probability mass - reserved for unseen events is equal to *T / (N + T)* - where *T* is the number of observed event types and *N* is the total - number of observed events. This equates to the maximum likelihood estimate - of a new type event occurring. The remaining probability mass is discounted - such that all probability estimates sum to one, yielding: - - - *p = T / Z (N + T)*, if count = 0 - - *p = c / (N + T)*, otherwise - """ - - def __init__(self, freqdist, bins=None): - """ - Creates a distribution of Witten-Bell probability estimates. This - distribution allocates uniform probability mass to as yet unseen - events by using the number of events that have only been seen once. The - probability mass reserved for unseen events is equal to *T / (N + T)* - where *T* is the number of observed event types and *N* is the total - number of observed events. This equates to the maximum likelihood - estimate of a new type event occurring. The remaining probability mass - is discounted such that all probability estimates sum to one, - yielding: - - - *p = T / Z (N + T)*, if count = 0 - - *p = c / (N + T)*, otherwise - - The parameters *T* and *N* are taken from the ``freqdist`` parameter - (the ``B()`` and ``N()`` values). The normalizing factor *Z* is - calculated using these values along with the ``bins`` parameter. - - :param freqdist: The frequency counts upon which to base the - estimation. - :type freqdist: FreqDist - :param bins: The number of possible event types. 
This must be at least - as large as the number of bins in the ``freqdist``. If None, then - it's assumed to be equal to that of the ``freqdist`` - :type bins: int - """ - assert bins is None or bins >= freqdist.B(), ( - "bins parameter must not be less than %d=freqdist.B()" % freqdist.B() - ) - if bins is None: - bins = freqdist.B() - self._freqdist = freqdist - self._T = self._freqdist.B() - self._Z = bins - self._freqdist.B() - self._N = self._freqdist.N() - # self._P0 is P(0), precalculated for efficiency: - if self._N == 0: - # if freqdist is empty, we approximate P(0) by a UniformProbDist: - self._P0 = 1.0 / self._Z - else: - self._P0 = self._T / (self._Z * (self._N + self._T)) - - def prob(self, sample): - # inherit docs from ProbDistI - c = self._freqdist[sample] - return c / (self._N + self._T) if c != 0 else self._P0 - - def max(self): - return self._freqdist.max() - - def samples(self): - return self._freqdist.keys() - - def freqdist(self): - return self._freqdist - - def discount(self): - raise NotImplementedError() - - def __repr__(self): - """ - Return a string representation of this ``ProbDist``. - - :rtype: str - """ - return "" % self._freqdist.N() - - -##////////////////////////////////////////////////////// -## Good-Turing Probability Distributions -##////////////////////////////////////////////////////// - -# Good-Turing frequency estimation was contributed by Alan Turing and -# his statistical assistant I.J. Good, during their collaboration in -# the WWII. It is a statistical technique for predicting the -# probability of occurrence of objects belonging to an unknown number -# of species, given past observations of such objects and their -# species. (In drawing balls from an urn, the 'objects' would be balls -# and the 'species' would be the distinct colors of the balls (finite -# but unknown in number). -# -# Good-Turing method calculates the probability mass to assign to -# events with zero or low counts based on the number of events with -# higher counts. It does so by using the adjusted count *c\**: -# -# - *c\* = (c + 1) N(c + 1) / N(c)* for c >= 1 -# - *things with frequency zero in training* = N(1) for c == 0 -# -# where *c* is the original count, *N(i)* is the number of event types -# observed with count *i*. We can think the count of unseen as the count -# of frequency one (see Jurafsky & Martin 2nd Edition, p101). -# -# This method is problematic because the situation ``N(c+1) == 0`` -# is quite common in the original Good-Turing estimation; smoothing or -# interpolation of *N(i)* values is essential in practice. -# -# Bill Gale and Geoffrey Sampson present a simple and effective approach, -# Simple Good-Turing. As a smoothing curve they simply use a power curve: -# -# Nr = a*r^b (with b < -1 to give the appropriate hyperbolic -# relationship) -# -# They estimate a and b by simple linear regression technique on the -# logarithmic form of the equation: -# -# log Nr = a + b*log(r) -# -# However, they suggest that such a simple curve is probably only -# appropriate for high values of r. For low values of r, they use the -# measured Nr directly. (see M&S, p.213) -# -# Gale and Sampson propose to use r while the difference between r and -# r* is 1.96 greater than the standard deviation, and switch to r* if -# it is less or equal: -# -# |r - r*| > 1.96 * sqrt((r + 1)^2 (Nr+1 / Nr^2) (1 + Nr+1 / Nr)) -# -# The 1.96 coefficient correspond to a 0.05 significance criterion, -# some implementations can use a coefficient of 1.65 for a 0.1 -# significance criterion. 
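
Before the Simple Good-Turing estimator below, a short sketch of the Witten-Bell estimate described above; the counts and ``bins`` value are arbitrary:

from nltk.probability import FreqDist, WittenBellProbDist

fd = FreqDist('aaabbc')               # T = B() = 3 types, N = 6 tokens
wb = WittenBellProbDist(fd, bins=5)   # Z = bins - T = 2 unseen types

print(wb.prob('a'))   # seen:   c / (N + T)       = 3 / 9
print(wb.prob('z'))   # unseen: T / (Z * (N + T)) = 3 / 18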
-# - -##////////////////////////////////////////////////////// -## Simple Good-Turing Probablity Distributions -##////////////////////////////////////////////////////// - - -class SimpleGoodTuringProbDist(ProbDistI): - """ - SimpleGoodTuring ProbDist approximates from frequency to frequency of - frequency into a linear line under log space by linear regression. - Details of Simple Good-Turing algorithm can be found in: - - - Good Turing smoothing without tears" (Gale & Sampson 1995), - Journal of Quantitative Linguistics, vol. 2 pp. 217-237. - - "Speech and Language Processing (Jurafsky & Martin), - 2nd Edition, Chapter 4.5 p103 (log(Nc) = a + b*log(c)) - - https://www.grsampson.net/RGoodTur.html - - Given a set of pair (xi, yi), where the xi denotes the frequency and - yi denotes the frequency of frequency, we want to minimize their - square variation. E(x) and E(y) represent the mean of xi and yi. - - - slope: b = sigma ((xi-E(x)(yi-E(y))) / sigma ((xi-E(x))(xi-E(x))) - - intercept: a = E(y) - b.E(x) - """ - - SUM_TO_ONE = False - - def __init__(self, freqdist, bins=None): - """ - :param freqdist: The frequency counts upon which to base the - estimation. - :type freqdist: FreqDist - :param bins: The number of possible event types. This must be - larger than the number of bins in the ``freqdist``. If None, - then it's assumed to be equal to ``freqdist``.B() + 1 - :type bins: int - """ - assert ( - bins is None or bins > freqdist.B() - ), "bins parameter must not be less than %d=freqdist.B()+1" % (freqdist.B() + 1) - if bins is None: - bins = freqdist.B() + 1 - self._freqdist = freqdist - self._bins = bins - r, nr = self._r_Nr() - self.find_best_fit(r, nr) - self._switch(r, nr) - self._renormalize(r, nr) - - def _r_Nr_non_zero(self): - r_Nr = self._freqdist.r_Nr() - del r_Nr[0] - return r_Nr - - def _r_Nr(self): - """ - Split the frequency distribution in two list (r, Nr), where Nr(r) > 0 - """ - nonzero = self._r_Nr_non_zero() - - if not nonzero: - return [], [] - return zip(*sorted(nonzero.items())) - - def find_best_fit(self, r, nr): - """ - Use simple linear regression to tune parameters self._slope and - self._intercept in the log-log space based on count and Nr(count) - (Work in log space to avoid floating point underflow.) - """ - # For higher sample frequencies the data points becomes horizontal - # along line Nr=1. To create a more evident linear model in log-log - # space, we average positive Nr values with the surrounding zero - # values. (Church and Gale, 1991) - - if not r or not nr: - # Empty r or nr? - return - - zr = [] - for j in range(len(r)): - i = r[j - 1] if j > 0 else 0 - k = 2 * r[j] - i if j == len(r) - 1 else r[j + 1] - zr_ = 2.0 * nr[j] / (k - i) - zr.append(zr_) - - log_r = [math.log(i) for i in r] - log_zr = [math.log(i) for i in zr] - - xy_cov = x_var = 0.0 - x_mean = sum(log_r) / len(log_r) - y_mean = sum(log_zr) / len(log_zr) - for (x, y) in zip(log_r, log_zr): - xy_cov += (x - x_mean) * (y - y_mean) - x_var += (x - x_mean) ** 2 - self._slope = xy_cov / x_var if x_var != 0 else 0.0 - if self._slope >= -1: - warnings.warn( - "SimpleGoodTuring did not find a proper best fit " - "line for smoothing probabilities of occurrences. " - "The probability estimates are likely to be " - "unreliable." - ) - self._intercept = y_mean - self._slope * x_mean - - def _switch(self, r, nr): - """ - Calculate the r frontier where we must switch from Nr to Sr - when estimating E[Nr]. 
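
A usage sketch for the Simple Good-Turing estimator defined by this class; toy counts like these are far too small for a reliable log-log fit, so this only illustrates the calling convention:

from nltk.probability import FreqDist, SimpleGoodTuringProbDist

fd = FreqDist("the quick brown fox jumped over the lazy dog the".split())
sgt = SimpleGoodTuringProbDist(fd, bins=10)   # bins must exceed fd.B()

print(sgt.prob('the'))        # smoothed estimate for a seen word
print(sgt.prob('platypus'))   # unseen mass Nr(1)/N shared over the empty bins
print(sgt.discount())         # total probability mass moved to unseen events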
- """ - for i, r_ in enumerate(r): - if len(r) == i + 1 or r[i + 1] != r_ + 1: - # We are at the end of r, or there is a gap in r - self._switch_at = r_ - break - - Sr = self.smoothedNr - smooth_r_star = (r_ + 1) * Sr(r_ + 1) / Sr(r_) - unsmooth_r_star = (r_ + 1) * nr[i + 1] / nr[i] - - std = math.sqrt(self._variance(r_, nr[i], nr[i + 1])) - if abs(unsmooth_r_star - smooth_r_star) <= 1.96 * std: - self._switch_at = r_ - break - - def _variance(self, r, nr, nr_1): - r = float(r) - nr = float(nr) - nr_1 = float(nr_1) - return (r + 1.0) ** 2 * (nr_1 / nr**2) * (1.0 + nr_1 / nr) - - def _renormalize(self, r, nr): - """ - It is necessary to renormalize all the probability estimates to - ensure a proper probability distribution results. This can be done - by keeping the estimate of the probability mass for unseen items as - N(1)/N and renormalizing all the estimates for previously seen items - (as Gale and Sampson (1995) propose). (See M&S P.213, 1999) - """ - prob_cov = 0.0 - for r_, nr_ in zip(r, nr): - prob_cov += nr_ * self._prob_measure(r_) - if prob_cov: - self._renormal = (1 - self._prob_measure(0)) / prob_cov - - def smoothedNr(self, r): - """ - Return the number of samples with count r. - - :param r: The amount of frequency. - :type r: int - :rtype: float - """ - - # Nr = a*r^b (with b < -1 to give the appropriate hyperbolic - # relationship) - # Estimate a and b by simple linear regression technique on - # the logarithmic form of the equation: log Nr = a + b*log(r) - - return math.exp(self._intercept + self._slope * math.log(r)) - - def prob(self, sample): - """ - Return the sample's probability. - - :param sample: sample of the event - :type sample: str - :rtype: float - """ - count = self._freqdist[sample] - p = self._prob_measure(count) - if count == 0: - if self._bins == self._freqdist.B(): - p = 0.0 - else: - p = p / (self._bins - self._freqdist.B()) - else: - p = p * self._renormal - return p - - def _prob_measure(self, count): - if count == 0 and self._freqdist.N() == 0: - return 1.0 - elif count == 0 and self._freqdist.N() != 0: - return self._freqdist.Nr(1) / self._freqdist.N() - - if self._switch_at > count: - Er_1 = self._freqdist.Nr(count + 1) - Er = self._freqdist.Nr(count) - else: - Er_1 = self.smoothedNr(count + 1) - Er = self.smoothedNr(count) - - r_star = (count + 1) * Er_1 / Er - return r_star / self._freqdist.N() - - def check(self): - prob_sum = 0.0 - for i in range(0, len(self._Nr)): - prob_sum += self._Nr[i] * self._prob_measure(i) / self._renormal - print("Probability Sum:", prob_sum) - # assert prob_sum != 1.0, "probability sum should be one!" - - def discount(self): - """ - This function returns the total mass of probability transfers from the - seen samples to the unseen samples. - """ - return self.smoothedNr(1) / self._freqdist.N() - - def max(self): - return self._freqdist.max() - - def samples(self): - return self._freqdist.keys() - - def freqdist(self): - return self._freqdist - - def __repr__(self): - """ - Return a string representation of this ``ProbDist``. - - :rtype: str - """ - return "" % self._freqdist.N() - - -class MutableProbDist(ProbDistI): - """ - An mutable probdist where the probabilities may be easily modified. This - simply copies an existing probdist, storing the probability values in a - mutable dictionary and providing an update method. - """ - - def __init__(self, prob_dist, samples, store_logs=True): - """ - Creates the mutable probdist based on the given prob_dist and using - the list of samples given. 
These values are stored as log - probabilities if the store_logs flag is set. - - :param prob_dist: the distribution from which to garner the - probabilities - :type prob_dist: ProbDist - :param samples: the complete set of samples - :type samples: sequence of any - :param store_logs: whether to store the probabilities as logarithms - :type store_logs: bool - """ - self._samples = samples - self._sample_dict = {samples[i]: i for i in range(len(samples))} - self._data = array.array("d", [0.0]) * len(samples) - for i in range(len(samples)): - if store_logs: - self._data[i] = prob_dist.logprob(samples[i]) - else: - self._data[i] = prob_dist.prob(samples[i]) - self._logs = store_logs - - def max(self): - # inherit documentation - return max((p, v) for (v, p) in self._sample_dict.items())[1] - - def samples(self): - # inherit documentation - return self._samples - - def prob(self, sample): - # inherit documentation - i = self._sample_dict.get(sample) - if i is None: - return 0.0 - return 2 ** (self._data[i]) if self._logs else self._data[i] - - def logprob(self, sample): - # inherit documentation - i = self._sample_dict.get(sample) - if i is None: - return float("-inf") - return self._data[i] if self._logs else math.log(self._data[i], 2) - - def update(self, sample, prob, log=True): - """ - Update the probability for the given sample. This may cause the object - to stop being the valid probability distribution - the user must - ensure that they update the sample probabilities such that all samples - have probabilities between 0 and 1 and that all probabilities sum to - one. - - :param sample: the sample for which to update the probability - :type sample: any - :param prob: the new probability - :type prob: float - :param log: is the probability already logged - :type log: bool - """ - i = self._sample_dict.get(sample) - assert i is not None - if self._logs: - self._data[i] = prob if log else math.log(prob, 2) - else: - self._data[i] = 2 ** (prob) if log else prob - - -##///////////////////////////////////////////////////// -## Kneser-Ney Probability Distribution -##////////////////////////////////////////////////////// - -# This method for calculating probabilities was introduced in 1995 by Reinhard -# Kneser and Hermann Ney. It was meant to improve the accuracy of language -# models that use backing-off to deal with sparse data. The authors propose two -# ways of doing so: a marginal distribution constraint on the back-off -# distribution and a leave-one-out distribution. For a start, the first one is -# implemented as a class below. -# -# The idea behind a back-off n-gram model is that we have a series of -# frequency distributions for our n-grams so that in case we have not seen a -# given n-gram during training (and as a result have a 0 probability for it) we -# can 'back off' (hence the name!) and try testing whether we've seen the -# n-1-gram part of the n-gram in training. -# -# The novelty of Kneser and Ney's approach was that they decided to fiddle -# around with the way this latter, backed off probability was being calculated -# whereas their peers seemed to focus on the primary probability. -# -# The implementation below uses one of the techniques described in their paper -# titled "Improved backing-off for n-gram language modeling." In the same paper -# another technique is introduced to attempt to smooth the back-off -# distribution as well as the primary one. There is also a much-cited -# modification of this method proposed by Chen and Goodman. 
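As a concrete illustration of the back-off behaviour described above, a minimal usage sketch of the class defined below; it assumes a standard nltk installation (not the vendored copy being removed in this diff) and a toy corpus.

from nltk import FreqDist
from nltk.probability import KneserNeyProbDist
from nltk.util import trigrams

words = "the cat sat on the mat and the cat sat on the hat".split()
kn = KneserNeyProbDist(FreqDist(trigrams(words)))
print(kn.prob(("the", "cat", "sat")))  # seen trigram: discounted relative frequency
print(kn.prob(("on", "the", "cat")))   # unseen trigram, seen context: backed-off estimate
print(kn.prob(("cat", "mat", "hat")))  # completely unseen environment: 0.0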
-# -# In order for the implementation of Kneser-Ney to be more efficient, some -# changes have been made to the original algorithm. Namely, the calculation of -# the normalizing function gamma has been significantly simplified and -# combined slightly differently with beta. None of these changes affect the -# nature of the algorithm, but instead aim to cut out unnecessary calculations -# and take advantage of storing and retrieving information in dictionaries -# where possible. - - -class KneserNeyProbDist(ProbDistI): - """ - Kneser-Ney estimate of a probability distribution. This is a version of - back-off that counts how likely an n-gram is provided the n-1-gram had - been seen in training. Extends the ProbDistI interface, requires a trigram - FreqDist instance to train on. Optionally, a different from default discount - value can be specified. The default discount is set to 0.75. - - """ - - def __init__(self, freqdist, bins=None, discount=0.75): - """ - :param freqdist: The trigram frequency distribution upon which to base - the estimation - :type freqdist: FreqDist - :param bins: Included for compatibility with nltk.tag.hmm - :type bins: int or float - :param discount: The discount applied when retrieving counts of - trigrams - :type discount: float (preferred, but can be set to int) - """ - - if not bins: - self._bins = freqdist.B() - else: - self._bins = bins - self._D = discount - - # cache for probability calculation - self._cache = {} - - # internal bigram and trigram frequency distributions - self._bigrams = defaultdict(int) - self._trigrams = freqdist - - # helper dictionaries used to calculate probabilities - self._wordtypes_after = defaultdict(float) - self._trigrams_contain = defaultdict(float) - self._wordtypes_before = defaultdict(float) - for w0, w1, w2 in freqdist: - self._bigrams[(w0, w1)] += freqdist[(w0, w1, w2)] - self._wordtypes_after[(w0, w1)] += 1 - self._trigrams_contain[w1] += 1 - self._wordtypes_before[(w1, w2)] += 1 - - def prob(self, trigram): - # sample must be a triple - if len(trigram) != 3: - raise ValueError("Expected an iterable with 3 members.") - trigram = tuple(trigram) - w0, w1, w2 = trigram - - if trigram in self._cache: - return self._cache[trigram] - else: - # if the sample trigram was seen during training - if trigram in self._trigrams: - prob = (self._trigrams[trigram] - self.discount()) / self._bigrams[ - (w0, w1) - ] - - # else if the 'rougher' environment was seen during training - elif (w0, w1) in self._bigrams and (w1, w2) in self._wordtypes_before: - aftr = self._wordtypes_after[(w0, w1)] - bfr = self._wordtypes_before[(w1, w2)] - - # the probability left over from alphas - leftover_prob = (aftr * self.discount()) / self._bigrams[(w0, w1)] - - # the beta (including normalization) - beta = bfr / (self._trigrams_contain[w1] - aftr) - - prob = leftover_prob * beta - - # else the sample was completely unseen during training - else: - prob = 0.0 - - self._cache[trigram] = prob - return prob - - def discount(self): - """ - Return the value by which counts are discounted. By default set to 0.75. - - :rtype: float - """ - return self._D - - def set_discount(self, discount): - """ - Set the value by which counts are discounted to the value of discount. 
- - :param discount: the new value to discount counts by - :type discount: float (preferred, but int possible) - :rtype: None - """ - self._D = discount - - def samples(self): - return self._trigrams.keys() - - def max(self): - return self._trigrams.max() - - def __repr__(self): - """ - Return a string representation of this ProbDist - - :rtype: str - """ - return f">> from nltk.probability import ConditionalFreqDist - >>> from nltk.tokenize import word_tokenize - >>> sent = "the the the dog dog some other words that we do not care about" - >>> cfdist = ConditionalFreqDist() - >>> for word in word_tokenize(sent): - ... condition = len(word) - ... cfdist[condition][word] += 1 - - An equivalent way to do this is with the initializer: - - >>> cfdist = ConditionalFreqDist((len(word), word) for word in word_tokenize(sent)) - - The frequency distribution for each condition is accessed using - the indexing operator: - - >>> cfdist[3] - FreqDist({'the': 3, 'dog': 2, 'not': 1}) - >>> cfdist[3].freq('the') - 0.5 - >>> cfdist[3]['dog'] - 2 - - When the indexing operator is used to access the frequency - distribution for a condition that has not been accessed before, - ``ConditionalFreqDist`` creates a new empty FreqDist for that - condition. - - """ - - def __init__(self, cond_samples=None): - """ - Construct a new empty conditional frequency distribution. In - particular, the count for every sample, under every condition, - is zero. - - :param cond_samples: The samples to initialize the conditional - frequency distribution with - :type cond_samples: Sequence of (condition, sample) tuples - """ - defaultdict.__init__(self, FreqDist) - - if cond_samples: - for (cond, sample) in cond_samples: - self[cond][sample] += 1 - - def __reduce__(self): - kv_pairs = ((cond, self[cond]) for cond in self.conditions()) - return (self.__class__, (), None, None, kv_pairs) - - def conditions(self): - """ - Return a list of the conditions that have been accessed for - this ``ConditionalFreqDist``. Use the indexing operator to - access the frequency distribution for a given condition. - Note that the frequency distributions for some conditions - may contain zero sample outcomes. - - :rtype: list - """ - return list(self.keys()) - - def N(self): - """ - Return the total number of sample outcomes that have been - recorded by this ``ConditionalFreqDist``. - - :rtype: int - """ - return sum(fdist.N() for fdist in self.values()) - - def plot( - self, - *args, - samples=None, - title="", - cumulative=False, - percents=False, - conditions=None, - show=True, - **kwargs, - ): - """ - Plot the given samples from the conditional frequency distribution. - For a cumulative plot, specify cumulative=True. Additional ``*args`` and - ``**kwargs`` are passed to matplotlib's plot function. - (Requires Matplotlib to be installed.) - - :param samples: The samples to plot - :type samples: list - :param title: The title for the graph - :type title: str - :param cumulative: Whether the plot is cumulative. (default = False) - :type cumulative: bool - :param percents: Whether the plot uses percents instead of counts. (default = False) - :type percents: bool - :param conditions: The conditions to plot (default is all) - :type conditions: list - :param show: Whether to show the plot, or only return the ax. - :type show: bool - """ - try: - import matplotlib.pyplot as plt # import statement fix - except ImportError as e: - raise ValueError( - "The plot function requires matplotlib to be installed." 
- "See https://matplotlib.org/" - ) from e - - if not conditions: - conditions = self.conditions() - else: - conditions = [c for c in conditions if c in self] - if not samples: - samples = sorted({v for c in conditions for v in self[c]}) - if "linewidth" not in kwargs: - kwargs["linewidth"] = 2 - ax = plt.gca() - if conditions: - freqs = [] - for condition in conditions: - if cumulative: - # freqs should be a list of list where each sub list will be a frequency of a condition - freq = list(self[condition]._cumulative_frequencies(samples)) - else: - freq = [self[condition][sample] for sample in samples] - - if percents: - freq = [f / self[condition].N() * 100 for f in freq] - - freqs.append(freq) - - if cumulative: - ylabel = "Cumulative " - legend_loc = "lower right" - else: - ylabel = "" - legend_loc = "upper right" - - if percents: - ylabel += "Percents" - else: - ylabel += "Counts" - - i = 0 - for freq in freqs: - kwargs["label"] = conditions[i] # label for each condition - i += 1 - ax.plot(freq, *args, **kwargs) - ax.legend(loc=legend_loc) - ax.grid(True, color="silver") - ax.set_xticks(range(len(samples))) - ax.set_xticklabels([str(s) for s in samples], rotation=90) - if title: - ax.set_title(title) - ax.set_xlabel("Samples") - ax.set_ylabel(ylabel) - - if show: - plt.show() - - return ax - - def tabulate(self, *args, **kwargs): - """ - Tabulate the given samples from the conditional frequency distribution. - - :param samples: The samples to plot - :type samples: list - :param conditions: The conditions to plot (default is all) - :type conditions: list - :param cumulative: A flag to specify whether the freqs are cumulative (default = False) - :type title: bool - """ - - cumulative = _get_kwarg(kwargs, "cumulative", False) - conditions = _get_kwarg(kwargs, "conditions", sorted(self.conditions())) - samples = _get_kwarg( - kwargs, - "samples", - sorted({v for c in conditions if c in self for v in self[c]}), - ) # this computation could be wasted - - width = max(len("%s" % s) for s in samples) - freqs = dict() - for c in conditions: - if cumulative: - freqs[c] = list(self[c]._cumulative_frequencies(samples)) - else: - freqs[c] = [self[c][sample] for sample in samples] - width = max(width, max(len("%d" % f) for f in freqs[c])) - - condition_size = max(len("%s" % c) for c in conditions) - print(" " * condition_size, end=" ") - for s in samples: - print("%*s" % (width, s), end=" ") - print() - for c in conditions: - print("%*s" % (condition_size, c), end=" ") - for f in freqs[c]: - print("%*d" % (width, f), end=" ") - print() - - # Mathematical operators - - def __add__(self, other): - """ - Add counts from two ConditionalFreqDists. - """ - if not isinstance(other, ConditionalFreqDist): - return NotImplemented - result = self.copy() - for cond in other.conditions(): - result[cond] += other[cond] - return result - - def __sub__(self, other): - """ - Subtract count, but keep only results with positive counts. - """ - if not isinstance(other, ConditionalFreqDist): - return NotImplemented - result = self.copy() - for cond in other.conditions(): - result[cond] -= other[cond] - if not result[cond]: - del result[cond] - return result - - def __or__(self, other): - """ - Union is the maximum of value in either of the input counters. 
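The counter-style arithmetic implemented here mirrors the corresponding Counter operations; a small sketch with toy data (standard nltk install assumed):

from nltk.probability import ConditionalFreqDist

cfd_a = ConditionalFreqDist([("vowel", "a"), ("vowel", "a"), ("cons", "b")])
cfd_b = ConditionalFreqDist([("vowel", "a"), ("cons", "c")])
print((cfd_a + cfd_b)["vowel"]["a"])  # 3 -> counts are added per condition
print((cfd_a | cfd_b)["vowel"]["a"])  # 2 -> union keeps the maximum count
print((cfd_a - cfd_b)["vowel"]["a"])  # 1 -> subtraction keeps only positive counts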
- """ - if not isinstance(other, ConditionalFreqDist): - return NotImplemented - result = self.copy() - for cond in other.conditions(): - result[cond] |= other[cond] - return result - - def __and__(self, other): - """ - Intersection is the minimum of corresponding counts. - """ - if not isinstance(other, ConditionalFreqDist): - return NotImplemented - result = ConditionalFreqDist() - for cond in self.conditions(): - newfreqdist = self[cond] & other[cond] - if newfreqdist: - result[cond] = newfreqdist - return result - - # @total_ordering doesn't work here, since the class inherits from a builtin class - def __le__(self, other): - if not isinstance(other, ConditionalFreqDist): - raise_unorderable_types("<=", self, other) - return set(self.conditions()).issubset(other.conditions()) and all( - self[c] <= other[c] for c in self.conditions() - ) - - def __lt__(self, other): - if not isinstance(other, ConditionalFreqDist): - raise_unorderable_types("<", self, other) - return self <= other and self != other - - def __ge__(self, other): - if not isinstance(other, ConditionalFreqDist): - raise_unorderable_types(">=", self, other) - return other <= self - - def __gt__(self, other): - if not isinstance(other, ConditionalFreqDist): - raise_unorderable_types(">", self, other) - return other < self - - def deepcopy(self): - from copy import deepcopy - - return deepcopy(self) - - copy = deepcopy - - def __repr__(self): - """ - Return a string representation of this ``ConditionalFreqDist``. - - :rtype: str - """ - return "" % len(self) - - -class ConditionalProbDistI(dict, metaclass=ABCMeta): - """ - A collection of probability distributions for a single experiment - run under different conditions. Conditional probability - distributions are used to estimate the likelihood of each sample, - given the condition under which the experiment was run. For - example, a conditional probability distribution could be used to - estimate the probability of each word type in a document, given - the length of the word type. Formally, a conditional probability - distribution can be defined as a function that maps from each - condition to the ``ProbDist`` for the experiment under that - condition. - """ - - @abstractmethod - def __init__(self): - """ - Classes inheriting from ConditionalProbDistI should implement __init__. - """ - - def conditions(self): - """ - Return a list of the conditions that are represented by - this ``ConditionalProbDist``. Use the indexing operator to - access the probability distribution for a given condition. - - :rtype: list - """ - return list(self.keys()) - - def __repr__(self): - """ - Return a string representation of this ``ConditionalProbDist``. - - :rtype: str - """ - return "<%s with %d conditions>" % (type(self).__name__, len(self)) - - -class ConditionalProbDist(ConditionalProbDistI): - """ - A conditional probability distribution modeling the experiments - that were used to generate a conditional frequency distribution. - A ConditionalProbDist is constructed from a - ``ConditionalFreqDist`` and a ``ProbDist`` factory: - - - The ``ConditionalFreqDist`` specifies the frequency - distribution for each condition. - - The ``ProbDist`` factory is a function that takes a - condition's frequency distribution, and returns its - probability distribution. A ``ProbDist`` class's name (such as - ``MLEProbDist`` or ``HeldoutProbDist``) can be used to specify - that class's constructor. 
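A factory that needs no extra arguments keeps the pattern easy to see; the sketch below uses MLEProbDist and toy data so it runs without any corpus downloads (standard nltk install assumed), complementing the corpus-based example in the docstring that follows.

from nltk.probability import ConditionalFreqDist, ConditionalProbDist, MLEProbDist

words = "the cat sat on the mat".split()
cfd = ConditionalFreqDist((len(w), w) for w in words)
cpd = ConditionalProbDist(cfd, MLEProbDist)  # MLEProbDist(freqdist) is called per condition
print(cpd[3].prob("the"))  # 0.4 -> P(word = 'the' | word length = 3)
print(cpd[2].max())        # 'on'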
- - The first argument to the ``ProbDist`` factory is the frequency - distribution that it should model; and the remaining arguments are - specified by the ``factory_args`` parameter to the - ``ConditionalProbDist`` constructor. For example, the following - code constructs a ``ConditionalProbDist``, where the probability - distribution for each condition is an ``ELEProbDist`` with 10 bins: - - >>> from nltk.corpus import brown - >>> from nltk.probability import ConditionalFreqDist - >>> from nltk.probability import ConditionalProbDist, ELEProbDist - >>> cfdist = ConditionalFreqDist(brown.tagged_words()[:5000]) - >>> cpdist = ConditionalProbDist(cfdist, ELEProbDist, 10) - >>> cpdist['passed'].max() - 'VBD' - >>> cpdist['passed'].prob('VBD') #doctest: +ELLIPSIS - 0.423... - - """ - - def __init__(self, cfdist, probdist_factory, *factory_args, **factory_kw_args): - """ - Construct a new conditional probability distribution, based on - the given conditional frequency distribution and ``ProbDist`` - factory. - - :type cfdist: ConditionalFreqDist - :param cfdist: The ``ConditionalFreqDist`` specifying the - frequency distribution for each condition. - :type probdist_factory: class or function - :param probdist_factory: The function or class that maps - a condition's frequency distribution to its probability - distribution. The function is called with the frequency - distribution as its first argument, - ``factory_args`` as its remaining arguments, and - ``factory_kw_args`` as keyword arguments. - :type factory_args: (any) - :param factory_args: Extra arguments for ``probdist_factory``. - These arguments are usually used to specify extra - properties for the probability distributions of individual - conditions, such as the number of bins they contain. - :type factory_kw_args: (any) - :param factory_kw_args: Extra keyword arguments for ``probdist_factory``. - """ - self._probdist_factory = probdist_factory - self._factory_args = factory_args - self._factory_kw_args = factory_kw_args - - for condition in cfdist: - self[condition] = probdist_factory( - cfdist[condition], *factory_args, **factory_kw_args - ) - - def __missing__(self, key): - self[key] = self._probdist_factory( - FreqDist(), *self._factory_args, **self._factory_kw_args - ) - return self[key] - - -class DictionaryConditionalProbDist(ConditionalProbDistI): - """ - An alternative ConditionalProbDist that simply wraps a dictionary of - ProbDists rather than creating these from FreqDists. - """ - - def __init__(self, probdist_dict): - """ - :param probdist_dict: a dictionary containing the probdists indexed - by the conditions - :type probdist_dict: dict any -> probdist - """ - self.update(probdist_dict) - - def __missing__(self, key): - self[key] = DictionaryProbDist() - return self[key] - - -##////////////////////////////////////////////////////// -## Adding in log-space. -##////////////////////////////////////////////////////// - -# If the difference is bigger than this, then just take the bigger one: -_ADD_LOGS_MAX_DIFF = math.log(1e-30, 2) - - -def add_logs(logx, logy): - """ - Given two numbers ``logx`` = *log(x)* and ``logy`` = *log(y)*, return - *log(x+y)*. Conceptually, this is the same as returning - ``log(2**(logx)+2**(logy))``, but the actual implementation - avoids overflow errors that could result from direct computation. 
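A quick numeric check of the identity used by add_logs below (log base 2, as elsewhere in this module):

import math

logx, logy = math.log(0.25, 2), math.log(0.125, 2)  # -2.0 and -3.0
base = min(logx, logy)
log_sum = base + math.log(2 ** (logx - base) + 2 ** (logy - base), 2)
print(2 ** log_sum)  # 0.375 == 0.25 + 0.125, computed without leaving log space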
- """ - if logx < logy + _ADD_LOGS_MAX_DIFF: - return logy - if logy < logx + _ADD_LOGS_MAX_DIFF: - return logx - base = min(logx, logy) - return base + math.log(2 ** (logx - base) + 2 ** (logy - base), 2) - - -def sum_logs(logs): - return reduce(add_logs, logs[1:], logs[0]) if len(logs) != 0 else _NINF - - -##////////////////////////////////////////////////////// -## Probabilistic Mix-in -##////////////////////////////////////////////////////// - - -class ProbabilisticMixIn: - """ - A mix-in class to associate probabilities with other classes - (trees, rules, etc.). To use the ``ProbabilisticMixIn`` class, - define a new class that derives from an existing class and from - ProbabilisticMixIn. You will need to define a new constructor for - the new class, which explicitly calls the constructors of both its - parent classes. For example: - - >>> from nltk.probability import ProbabilisticMixIn - >>> class A: - ... def __init__(self, x, y): self.data = (x,y) - ... - >>> class ProbabilisticA(A, ProbabilisticMixIn): - ... def __init__(self, x, y, **prob_kwarg): - ... A.__init__(self, x, y) - ... ProbabilisticMixIn.__init__(self, **prob_kwarg) - - See the documentation for the ProbabilisticMixIn - ``constructor<__init__>`` for information about the arguments it - expects. - - You should generally also redefine the string representation - methods, the comparison methods, and the hashing method. - """ - - def __init__(self, **kwargs): - """ - Initialize this object's probability. This initializer should - be called by subclass constructors. ``prob`` should generally be - the first argument for those constructors. - - :param prob: The probability associated with the object. - :type prob: float - :param logprob: The log of the probability associated with - the object. - :type logprob: float - """ - if "prob" in kwargs: - if "logprob" in kwargs: - raise TypeError("Must specify either prob or logprob " "(not both)") - else: - ProbabilisticMixIn.set_prob(self, kwargs["prob"]) - elif "logprob" in kwargs: - ProbabilisticMixIn.set_logprob(self, kwargs["logprob"]) - else: - self.__prob = self.__logprob = None - - def set_prob(self, prob): - """ - Set the probability associated with this object to ``prob``. - - :param prob: The new probability - :type prob: float - """ - self.__prob = prob - self.__logprob = None - - def set_logprob(self, logprob): - """ - Set the log probability associated with this object to - ``logprob``. I.e., set the probability associated with this - object to ``2**(logprob)``. - - :param logprob: The new log probability - :type logprob: float - """ - self.__logprob = logprob - self.__prob = None - - def prob(self): - """ - Return the probability associated with this object. - - :rtype: float - """ - if self.__prob is None: - if self.__logprob is None: - return None - self.__prob = 2 ** (self.__logprob) - return self.__prob - - def logprob(self): - """ - Return ``log(p)``, where ``p`` is the probability associated - with this object. 
- - :rtype: float - """ - if self.__logprob is None: - if self.__prob is None: - return None - self.__logprob = math.log(self.__prob, 2) - return self.__logprob - - -class ImmutableProbabilisticMixIn(ProbabilisticMixIn): - def set_prob(self, prob): - raise ValueError("%s is immutable" % self.__class__.__name__) - - def set_logprob(self, prob): - raise ValueError("%s is immutable" % self.__class__.__name__) - - -## Helper function for processing keyword arguments - - -def _get_kwarg(kwargs, key, default): - if key in kwargs: - arg = kwargs[key] - del kwargs[key] - else: - arg = default - return arg - - -##////////////////////////////////////////////////////// -## Demonstration -##////////////////////////////////////////////////////// - - -def _create_rand_fdist(numsamples, numoutcomes): - """ - Create a new frequency distribution, with random samples. The - samples are numbers from 1 to ``numsamples``, and are generated by - summing two numbers, each of which has a uniform distribution. - """ - - fdist = FreqDist() - for x in range(numoutcomes): - y = random.randint(1, (1 + numsamples) // 2) + random.randint( - 0, numsamples // 2 - ) - fdist[y] += 1 - return fdist - - -def _create_sum_pdist(numsamples): - """ - Return the true probability distribution for the experiment - ``_create_rand_fdist(numsamples, x)``. - """ - fdist = FreqDist() - for x in range(1, (1 + numsamples) // 2 + 1): - for y in range(0, numsamples // 2 + 1): - fdist[x + y] += 1 - return MLEProbDist(fdist) - - -def demo(numsamples=6, numoutcomes=500): - """ - A demonstration of frequency distributions and probability - distributions. This demonstration creates three frequency - distributions with, and uses them to sample a random process with - ``numsamples`` samples. Each frequency distribution is sampled - ``numoutcomes`` times. These three frequency distributions are - then used to build six probability distributions. Finally, the - probability estimates of these distributions are compared to the - actual probability of each sample. - - :type numsamples: int - :param numsamples: The number of samples to use in each demo - frequency distributions. - :type numoutcomes: int - :param numoutcomes: The total number of outcomes for each - demo frequency distribution. These outcomes are divided into - ``numsamples`` bins. - :rtype: None - """ - - # Randomly sample a stochastic process three times. - fdist1 = _create_rand_fdist(numsamples, numoutcomes) - fdist2 = _create_rand_fdist(numsamples, numoutcomes) - fdist3 = _create_rand_fdist(numsamples, numoutcomes) - - # Use our samples to create probability distributions. - pdists = [ - MLEProbDist(fdist1), - LidstoneProbDist(fdist1, 0.5, numsamples), - HeldoutProbDist(fdist1, fdist2, numsamples), - HeldoutProbDist(fdist2, fdist1, numsamples), - CrossValidationProbDist([fdist1, fdist2, fdist3], numsamples), - SimpleGoodTuringProbDist(fdist1), - SimpleGoodTuringProbDist(fdist1, 7), - _create_sum_pdist(numsamples), - ] - - # Find the probability of each sample. - vals = [] - for n in range(1, numsamples + 1): - vals.append(tuple([n, fdist1.freq(n)] + [pdist.prob(n) for pdist in pdists])) - - # Print the results in a formatted table. 
- print( - "%d samples (1-%d); %d outcomes were sampled for each FreqDist" - % (numsamples, numsamples, numoutcomes) - ) - print("=" * 9 * (len(pdists) + 2)) - FORMATSTR = " FreqDist " + "%8s " * (len(pdists) - 1) + "| Actual" - print(FORMATSTR % tuple(repr(pdist)[1:9] for pdist in pdists[:-1])) - print("-" * 9 * (len(pdists) + 2)) - FORMATSTR = "%3d %8.6f " + "%8.6f " * (len(pdists) - 1) + "| %8.6f" - for val in vals: - print(FORMATSTR % val) - - # Print the totals for each column (should all be 1.0) - zvals = list(zip(*vals)) - sums = [sum(val) for val in zvals[1:]] - print("-" * 9 * (len(pdists) + 2)) - FORMATSTR = "Total " + "%8.6f " * (len(pdists)) + "| %8.6f" - print(FORMATSTR % tuple(sums)) - print("=" * 9 * (len(pdists) + 2)) - - # Display the distributions themselves, if they're short enough. - if len("%s" % fdist1) < 70: - print(" fdist1: %s" % fdist1) - print(" fdist2: %s" % fdist2) - print(" fdist3: %s" % fdist3) - print() - - print("Generating:") - for pdist in pdists: - fdist = FreqDist(pdist.generate() for i in range(5000)) - print("{:>20} {}".format(pdist.__class__.__name__[:20], ("%s" % fdist)[:55])) - print() - - -def gt_demo(): - from nltk import corpus - - emma_words = corpus.gutenberg.words("austen-emma.txt") - fd = FreqDist(emma_words) - sgt = SimpleGoodTuringProbDist(fd) - print("{:>18} {:>8} {:>14}".format("word", "frequency", "SimpleGoodTuring")) - fd_keys_sorted = ( - key for key, value in sorted(fd.items(), key=lambda item: item[1], reverse=True) - ) - for key in fd_keys_sorted: - print("%18s %8d %14e" % (key, fd[key], sgt.prob(key))) - - -if __name__ == "__main__": - demo(6, 10) - demo(5, 5000) - gt_demo() - -__all__ = [ - "ConditionalFreqDist", - "ConditionalProbDist", - "ConditionalProbDistI", - "CrossValidationProbDist", - "DictionaryConditionalProbDist", - "DictionaryProbDist", - "ELEProbDist", - "FreqDist", - "SimpleGoodTuringProbDist", - "HeldoutProbDist", - "ImmutableProbabilisticMixIn", - "LaplaceProbDist", - "LidstoneProbDist", - "MLEProbDist", - "MutableProbDist", - "KneserNeyProbDist", - "ProbDistI", - "ProbabilisticMixIn", - "UniformProbDist", - "WittenBellProbDist", - "add_logs", - "log_likelihood", - "sum_logs", - "entropy", -] diff --git a/pipeline/nltk/sem/__init__.py b/pipeline/nltk/sem/__init__.py deleted file mode 100644 index 5bbb3f032bef5ce79ab7232566bc73aa17ff661b..0000000000000000000000000000000000000000 --- a/pipeline/nltk/sem/__init__.py +++ /dev/null @@ -1,75 +0,0 @@ -# Natural Language Toolkit: Semantic Interpretation -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Ewan Klein -# URL: -# For license information, see LICENSE.TXT - -""" -NLTK Semantic Interpretation Package - -This package contains classes for representing semantic structure in -formulas of first-order logic and for evaluating such formulas in -set-theoretic models. - - >>> from nltk.sem import logic - >>> logic._counter._value = 0 - -The package has two main components: - - - ``logic`` provides support for analyzing expressions of First - Order Logic (FOL). - - ``evaluate`` allows users to recursively determine truth in a - model for formulas of FOL. - -A model consists of a domain of discourse and a valuation function, -which assigns values to non-logical constants. We assume that entities -in the domain are represented as strings such as ``'b1'``, ``'g1'``, -etc. A ``Valuation`` is initialized with a list of (symbol, value) -pairs, where values are entities, sets of entities or sets of tuples -of entities. 
-The domain of discourse can be inferred from the valuation, and model -is then created with domain and valuation as parameters. - - >>> from nltk.sem import Valuation, Model - >>> v = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'), - ... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])), - ... ('dog', set(['d1'])), - ... ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')]))] - >>> val = Valuation(v) - >>> dom = val.domain - >>> m = Model(dom, val) -""" - -from nltk.sem.boxer import Boxer -from nltk.sem.drt import DRS, DrtExpression -from nltk.sem.evaluate import ( - Assignment, - Model, - Undefined, - Valuation, - arity, - is_rel, - read_valuation, - set2rel, -) -from nltk.sem.lfg import FStructure -from nltk.sem.logic import ( - ApplicationExpression, - Expression, - LogicalExpressionException, - Variable, - binding_ops, - boolean_ops, - equality_preds, - read_logic, -) -from nltk.sem.relextract import clause, extract_rels, rtuple -from nltk.sem.skolemize import skolemize -from nltk.sem.util import evaluate_sents, interpret_sents, parse_sents, root_semrep - -# from nltk.sem.glue import Glue -# from nltk.sem.hole import HoleSemantics -# from nltk.sem.cooper_storage import CooperStore - -# don't import chat80 as its names are too generic diff --git a/pipeline/nltk/sem/__pycache__/__init__.cpython-39.pyc b/pipeline/nltk/sem/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index 7e52138eb5221a25d3d4a083afb89d1ebc1fa9f7..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/sem/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/sem/__pycache__/boxer.cpython-39.pyc b/pipeline/nltk/sem/__pycache__/boxer.cpython-39.pyc deleted file mode 100644 index a0eed610d0139657f09dc16eb9ab81831184348c..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/sem/__pycache__/boxer.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/sem/__pycache__/chat80.cpython-39.pyc b/pipeline/nltk/sem/__pycache__/chat80.cpython-39.pyc deleted file mode 100644 index aecbb366bd1dbd57ea7a4a0eed3aa0c30d40da3a..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/sem/__pycache__/chat80.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/sem/__pycache__/cooper_storage.cpython-39.pyc b/pipeline/nltk/sem/__pycache__/cooper_storage.cpython-39.pyc deleted file mode 100644 index 77ae9a2e16c0892af984bd3219b85c4c9bc4009f..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/sem/__pycache__/cooper_storage.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/sem/__pycache__/drt.cpython-39.pyc b/pipeline/nltk/sem/__pycache__/drt.cpython-39.pyc deleted file mode 100644 index e50aefa8acfc4bbb39d49adce6e74212d2f47b03..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/sem/__pycache__/drt.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/sem/__pycache__/drt_glue_demo.cpython-39.pyc b/pipeline/nltk/sem/__pycache__/drt_glue_demo.cpython-39.pyc deleted file mode 100644 index 531bfb7da8daab4a0f126e47ba43065824bcaed5..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/sem/__pycache__/drt_glue_demo.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/sem/__pycache__/evaluate.cpython-39.pyc b/pipeline/nltk/sem/__pycache__/evaluate.cpython-39.pyc deleted file mode 100644 index ead7005c8dfe32bad8fd136b9e0088e17609ac33..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/sem/__pycache__/evaluate.cpython-39.pyc 
and /dev/null differ diff --git a/pipeline/nltk/sem/__pycache__/glue.cpython-39.pyc b/pipeline/nltk/sem/__pycache__/glue.cpython-39.pyc deleted file mode 100644 index c645b5ea7b3029490a198dec187f845b7b568642..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/sem/__pycache__/glue.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/sem/__pycache__/hole.cpython-39.pyc b/pipeline/nltk/sem/__pycache__/hole.cpython-39.pyc deleted file mode 100644 index 9bcffda0bfae8acd4e4a47396596a22558ce2643..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/sem/__pycache__/hole.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/sem/__pycache__/lfg.cpython-39.pyc b/pipeline/nltk/sem/__pycache__/lfg.cpython-39.pyc deleted file mode 100644 index e2259f31033bae50a8f998436f1bab49e5b1d942..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/sem/__pycache__/lfg.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/sem/__pycache__/linearlogic.cpython-39.pyc b/pipeline/nltk/sem/__pycache__/linearlogic.cpython-39.pyc deleted file mode 100644 index bc86c104525ebf98854cb51215dd4746bc2105a1..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/sem/__pycache__/linearlogic.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/sem/__pycache__/logic.cpython-39.pyc b/pipeline/nltk/sem/__pycache__/logic.cpython-39.pyc deleted file mode 100644 index 09f842b6b9155740d9e882a24184cf4e48a571ce..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/sem/__pycache__/logic.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/sem/__pycache__/relextract.cpython-39.pyc b/pipeline/nltk/sem/__pycache__/relextract.cpython-39.pyc deleted file mode 100644 index 0e48d9d9345875059ce94d9e2c56a32825f0aaa8..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/sem/__pycache__/relextract.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/sem/__pycache__/skolemize.cpython-39.pyc b/pipeline/nltk/sem/__pycache__/skolemize.cpython-39.pyc deleted file mode 100644 index cb06ca821b82c676b3a713baceb42c8a5813a35a..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/sem/__pycache__/skolemize.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/sem/__pycache__/util.cpython-39.pyc b/pipeline/nltk/sem/__pycache__/util.cpython-39.pyc deleted file mode 100644 index 53c26330d8f53c83d5b2bb7ff7a16327f23079e9..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/sem/__pycache__/util.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/sem/boxer.py b/pipeline/nltk/sem/boxer.py deleted file mode 100644 index d0acd4a607e3bf3481b3f896e8103a9069870c56..0000000000000000000000000000000000000000 --- a/pipeline/nltk/sem/boxer.py +++ /dev/null @@ -1,1605 +0,0 @@ -# Natural Language Toolkit: Interface to Boxer -# -# -# Author: Dan Garrette -# -# Copyright (C) 2001-2023 NLTK Project -# URL: -# For license information, see LICENSE.TXT - -""" -An interface to Boxer. - -This interface relies on the latest version of the development (subversion) version of -C&C and Boxer. - -Usage -===== - -Set the environment variable CANDC to the bin directory of your CandC installation. -The models directory should be in the CandC root directory. 
-For example:: - - /path/to/candc/ - bin/ - candc - boxer - models/ - boxer/ -""" - -import operator -import os -import re -import subprocess -import tempfile -from functools import reduce -from optparse import OptionParser - -from nltk.internals import find_binary -from nltk.sem.drt import ( - DRS, - DrtApplicationExpression, - DrtEqualityExpression, - DrtNegatedExpression, - DrtOrExpression, - DrtParser, - DrtProposition, - DrtTokens, - DrtVariableExpression, -) -from nltk.sem.logic import ( - ExpectedMoreTokensException, - LogicalExpressionException, - UnexpectedTokenException, - Variable, -) - - -class Boxer: - """ - This class is an interface to Johan Bos's program Boxer, a wide-coverage - semantic parser that produces Discourse Representation Structures (DRSs). - """ - - def __init__( - self, - boxer_drs_interpreter=None, - elimeq=False, - bin_dir=None, - verbose=False, - resolve=True, - ): - """ - :param boxer_drs_interpreter: A class that converts from the - ``AbstractBoxerDrs`` object hierarchy to a different object. The - default is ``NltkDrtBoxerDrsInterpreter``, which converts to the NLTK - DRT hierarchy. - :param elimeq: When set to true, Boxer removes all equalities from the - DRSs and discourse referents standing in the equality relation are - unified, but only if this can be done in a meaning-preserving manner. - :param resolve: When set to true, Boxer will resolve all anaphoric DRSs and perform merge-reduction. - Resolution follows Van der Sandt's theory of binding and accommodation. - """ - if boxer_drs_interpreter is None: - boxer_drs_interpreter = NltkDrtBoxerDrsInterpreter() - self._boxer_drs_interpreter = boxer_drs_interpreter - - self._resolve = resolve - self._elimeq = elimeq - - self.set_bin_dir(bin_dir, verbose) - - def set_bin_dir(self, bin_dir, verbose=False): - self._candc_bin = self._find_binary("candc", bin_dir, verbose) - self._candc_models_path = os.path.normpath( - os.path.join(self._candc_bin[:-5], "../models") - ) - self._boxer_bin = self._find_binary("boxer", bin_dir, verbose) - - def interpret(self, input, discourse_id=None, question=False, verbose=False): - """ - Use Boxer to give a first order representation. - - :param input: str Input sentence to parse - :param occur_index: bool Should predicates be occurrence indexed? - :param discourse_id: str An identifier to be inserted to each occurrence-indexed predicate. - :return: ``drt.DrtExpression`` - """ - discourse_ids = [discourse_id] if discourse_id is not None else None - (d,) = self.interpret_multi_sents([[input]], discourse_ids, question, verbose) - if not d: - raise Exception(f'Unable to interpret: "{input}"') - return d - - def interpret_multi(self, input, discourse_id=None, question=False, verbose=False): - """ - Use Boxer to give a first order representation. - - :param input: list of str Input sentences to parse as a single discourse - :param occur_index: bool Should predicates be occurrence indexed? - :param discourse_id: str An identifier to be inserted to each occurrence-indexed predicate. - :return: ``drt.DrtExpression`` - """ - discourse_ids = [discourse_id] if discourse_id is not None else None - (d,) = self.interpret_multi_sents([input], discourse_ids, question, verbose) - if not d: - raise Exception(f'Unable to interpret: "{input}"') - return d - - def interpret_sents( - self, inputs, discourse_ids=None, question=False, verbose=False - ): - """ - Use Boxer to give a first order representation. 
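For orientation, a minimal usage sketch of the interface above; it assumes a working local C&C/Boxer installation with the CANDC environment variable set as described in the module docstring, and a standard nltk install rather than the vendored copy removed here.

from nltk.sem.boxer import Boxer

boxer = Boxer()                        # locates the candc/boxer binaries via $CANDC
drs = boxer.interpret("John dances.")  # parse a single-sentence discourse
print(drs)                             # the resulting DRS as an nltk.sem.drt expression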
- - :param inputs: list of str Input sentences to parse as individual discourses - :param occur_index: bool Should predicates be occurrence indexed? - :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate. - :return: list of ``drt.DrtExpression`` - """ - return self.interpret_multi_sents( - [[input] for input in inputs], discourse_ids, question, verbose - ) - - def interpret_multi_sents( - self, inputs, discourse_ids=None, question=False, verbose=False - ): - """ - Use Boxer to give a first order representation. - - :param inputs: list of list of str Input discourses to parse - :param occur_index: bool Should predicates be occurrence indexed? - :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate. - :return: ``drt.DrtExpression`` - """ - if discourse_ids is not None: - assert len(inputs) == len(discourse_ids) - assert reduce(operator.and_, (id is not None for id in discourse_ids)) - use_disc_id = True - else: - discourse_ids = list(map(str, range(len(inputs)))) - use_disc_id = False - - candc_out = self._call_candc(inputs, discourse_ids, question, verbose=verbose) - boxer_out = self._call_boxer(candc_out, verbose=verbose) - - # if 'ERROR: input file contains no ccg/2 terms.' in boxer_out: - # raise UnparseableInputException('Could not parse with candc: "%s"' % input_str) - - drs_dict = self._parse_to_drs_dict(boxer_out, use_disc_id) - return [drs_dict.get(id, None) for id in discourse_ids] - - def _call_candc(self, inputs, discourse_ids, question, verbose=False): - """ - Call the ``candc`` binary with the given input. - - :param inputs: list of list of str Input discourses to parse - :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate. - :param filename: str A filename for the output file - :return: stdout - """ - args = [ - "--models", - os.path.join(self._candc_models_path, ["boxer", "questions"][question]), - "--candc-printer", - "boxer", - ] - return self._call( - "\n".join( - sum( - ([f"'{id}'"] + d for d, id in zip(inputs, discourse_ids)), - [], - ) - ), - self._candc_bin, - args, - verbose, - ) - - def _call_boxer(self, candc_out, verbose=False): - """ - Call the ``boxer`` binary with the given input. - - :param candc_out: str output from C&C parser - :return: stdout - """ - f = None - try: - fd, temp_filename = tempfile.mkstemp( - prefix="boxer-", suffix=".in", text=True - ) - f = os.fdopen(fd, "w") - f.write(candc_out.decode("utf-8")) - finally: - if f: - f.close() - - args = [ - "--box", - "false", - "--semantics", - "drs", - #'--flat', 'false', # removed from boxer - "--resolve", - ["false", "true"][self._resolve], - "--elimeq", - ["false", "true"][self._elimeq], - "--format", - "prolog", - "--instantiate", - "true", - "--input", - temp_filename, - ] - stdout = self._call(None, self._boxer_bin, args, verbose) - os.remove(temp_filename) - return stdout - - def _find_binary(self, name, bin_dir, verbose=False): - return find_binary( - name, - path_to_bin=bin_dir, - env_vars=["CANDC"], - url="http://svn.ask.it.usyd.edu.au/trac/candc/", - binary_names=[name, name + ".exe"], - verbose=verbose, - ) - - def _call(self, input_str, binary, args=[], verbose=False): - """ - Call the binary with the given input. - - :param input_str: A string whose contents are used as stdin. - :param binary: The location of the binary to call - :param args: A list of command-line arguments. 
- :return: stdout - """ - if verbose: - print("Calling:", binary) - print("Args:", args) - print("Input:", input_str) - print("Command:", binary + " " + " ".join(args)) - - # Call via a subprocess - if input_str is None: - cmd = [binary] + args - p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - else: - cmd = 'echo "{}" | {} {}'.format(input_str, binary, " ".join(args)) - p = subprocess.Popen( - cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True - ) - stdout, stderr = p.communicate() - - if verbose: - print("Return code:", p.returncode) - if stdout: - print("stdout:\n", stdout, "\n") - if stderr: - print("stderr:\n", stderr, "\n") - if p.returncode != 0: - raise Exception( - "ERROR CALLING: {} {}\nReturncode: {}\n{}".format( - binary, " ".join(args), p.returncode, stderr - ) - ) - - return stdout - - def _parse_to_drs_dict(self, boxer_out, use_disc_id): - lines = boxer_out.decode("utf-8").split("\n") - drs_dict = {} - i = 0 - while i < len(lines): - line = lines[i] - if line.startswith("id("): - comma_idx = line.index(",") - discourse_id = line[3:comma_idx] - if discourse_id[0] == "'" and discourse_id[-1] == "'": - discourse_id = discourse_id[1:-1] - drs_id = line[comma_idx + 1 : line.index(")")] - i += 1 - line = lines[i] - assert line.startswith(f"sem({drs_id},") - if line[-4:] == "').'": - line = line[:-4] + ")." - assert line.endswith(")."), f"can't parse line: {line}" - - search_start = len(f"sem({drs_id},[") - brace_count = 1 - drs_start = -1 - for j, c in enumerate(line[search_start:]): - if c == "[": - brace_count += 1 - if c == "]": - brace_count -= 1 - if brace_count == 0: - drs_start = search_start + j + 1 - if line[drs_start : drs_start + 3] == "','": - drs_start = drs_start + 3 - else: - drs_start = drs_start + 1 - break - assert drs_start > -1 - - drs_input = line[drs_start:-2].strip() - parsed = self._parse_drs(drs_input, discourse_id, use_disc_id) - drs_dict[discourse_id] = self._boxer_drs_interpreter.interpret(parsed) - i += 1 - return drs_dict - - def _parse_drs(self, drs_string, discourse_id, use_disc_id): - return BoxerOutputDrsParser([None, discourse_id][use_disc_id]).parse(drs_string) - - -class BoxerOutputDrsParser(DrtParser): - def __init__(self, discourse_id=None): - """ - This class is used to parse the Prolog DRS output from Boxer into a - hierarchy of python objects. 
- """ - DrtParser.__init__(self) - self.discourse_id = discourse_id - self.sentence_id_offset = None - self.quote_chars = [("'", "'", "\\", False)] - - def parse(self, data, signature=None): - return DrtParser.parse(self, data, signature) - - def get_all_symbols(self): - return ["(", ")", ",", "[", "]", ":"] - - def handle(self, tok, context): - return self.handle_drs(tok) - - def attempt_adjuncts(self, expression, context): - return expression - - def parse_condition(self, indices): - """ - Parse a DRS condition - - :return: list of ``DrtExpression`` - """ - tok = self.token() - accum = self.handle_condition(tok, indices) - if accum is None: - raise UnexpectedTokenException(tok) - return accum - - def handle_drs(self, tok): - if tok == "drs": - return self.parse_drs() - elif tok in ["merge", "smerge"]: - return self._handle_binary_expression(self._make_merge_expression)(None, []) - elif tok in ["alfa"]: - return self._handle_alfa(self._make_merge_expression)(None, []) - - def handle_condition(self, tok, indices): - """ - Handle a DRS condition - - :param indices: list of int - :return: list of ``DrtExpression`` - """ - if tok == "not": - return [self._handle_not()] - - if tok == "or": - conds = [self._handle_binary_expression(self._make_or_expression)] - elif tok == "imp": - conds = [self._handle_binary_expression(self._make_imp_expression)] - elif tok == "eq": - conds = [self._handle_eq()] - elif tok == "prop": - conds = [self._handle_prop()] - - elif tok == "pred": - conds = [self._handle_pred()] - elif tok == "named": - conds = [self._handle_named()] - elif tok == "rel": - conds = [self._handle_rel()] - elif tok == "timex": - conds = self._handle_timex() - elif tok == "card": - conds = [self._handle_card()] - - elif tok == "whq": - conds = [self._handle_whq()] - elif tok == "duplex": - conds = [self._handle_duplex()] - - else: - conds = [] - - return sum( - ( - [cond(sent_index, word_indices) for cond in conds] - for sent_index, word_indices in self._sent_and_word_indices(indices) - ), - [], - ) - - def _handle_not(self): - self.assertToken(self.token(), "(") - drs = self.process_next_expression(None) - self.assertToken(self.token(), ")") - return BoxerNot(drs) - - def _handle_pred(self): - # pred(_G3943, dog, n, 0) - self.assertToken(self.token(), "(") - variable = self.parse_variable() - self.assertToken(self.token(), ",") - name = self.token() - self.assertToken(self.token(), ",") - pos = self.token() - self.assertToken(self.token(), ",") - sense = int(self.token()) - self.assertToken(self.token(), ")") - - def _handle_pred_f(sent_index, word_indices): - return BoxerPred( - self.discourse_id, sent_index, word_indices, variable, name, pos, sense - ) - - return _handle_pred_f - - def _handle_duplex(self): - # duplex(whq, drs(...), var, drs(...)) - self.assertToken(self.token(), "(") - # self.assertToken(self.token(), '[') - ans_types = [] - # while self.token(0) != ']': - # cat = self.token() - # self.assertToken(self.token(), ':') - # if cat == 'des': - # ans_types.append(self.token()) - # elif cat == 'num': - # ans_types.append('number') - # typ = self.token() - # if typ == 'cou': - # ans_types.append('count') - # else: - # ans_types.append(typ) - # else: - # ans_types.append(self.token()) - # self.token() #swallow the ']' - - self.assertToken(self.token(), "whq") - self.assertToken(self.token(), ",") - d1 = self.process_next_expression(None) - self.assertToken(self.token(), ",") - ref = self.parse_variable() - self.assertToken(self.token(), ",") - d2 = 
self.process_next_expression(None) - self.assertToken(self.token(), ")") - return lambda sent_index, word_indices: BoxerWhq( - self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2 - ) - - def _handle_named(self): - # named(x0, john, per, 0) - self.assertToken(self.token(), "(") - variable = self.parse_variable() - self.assertToken(self.token(), ",") - name = self.token() - self.assertToken(self.token(), ",") - type = self.token() - self.assertToken(self.token(), ",") - sense = self.token() # as per boxer rev 2554 - self.assertToken(self.token(), ")") - return lambda sent_index, word_indices: BoxerNamed( - self.discourse_id, sent_index, word_indices, variable, name, type, sense - ) - - def _handle_rel(self): - # rel(_G3993, _G3943, agent, 0) - self.assertToken(self.token(), "(") - var1 = self.parse_variable() - self.assertToken(self.token(), ",") - var2 = self.parse_variable() - self.assertToken(self.token(), ",") - rel = self.token() - self.assertToken(self.token(), ",") - sense = int(self.token()) - self.assertToken(self.token(), ")") - return lambda sent_index, word_indices: BoxerRel( - self.discourse_id, sent_index, word_indices, var1, var2, rel, sense - ) - - def _handle_timex(self): - # timex(_G18322, date([]: (+), []:'XXXX', [1004]:'04', []:'XX')) - self.assertToken(self.token(), "(") - arg = self.parse_variable() - self.assertToken(self.token(), ",") - new_conds = self._handle_time_expression(arg) - self.assertToken(self.token(), ")") - return new_conds - - def _handle_time_expression(self, arg): - # date([]: (+), []:'XXXX', [1004]:'04', []:'XX') - tok = self.token() - self.assertToken(self.token(), "(") - if tok == "date": - conds = self._handle_date(arg) - elif tok == "time": - conds = self._handle_time(arg) - else: - return None - self.assertToken(self.token(), ")") - return [ - lambda sent_index, word_indices: BoxerPred( - self.discourse_id, sent_index, word_indices, arg, tok, "n", 0 - ) - ] + [lambda sent_index, word_indices: cond for cond in conds] - - def _handle_date(self, arg): - # []: (+), []:'XXXX', [1004]:'04', []:'XX' - conds = [] - ((sent_index, word_indices),) = self._sent_and_word_indices( - self._parse_index_list() - ) - self.assertToken(self.token(), "(") - pol = self.token() - self.assertToken(self.token(), ")") - conds.append( - BoxerPred( - self.discourse_id, - sent_index, - word_indices, - arg, - f"date_pol_{pol}", - "a", - 0, - ) - ) - self.assertToken(self.token(), ",") - - ((sent_index, word_indices),) = self._sent_and_word_indices( - self._parse_index_list() - ) - year = self.token() - if year != "XXXX": - year = year.replace(":", "_") - conds.append( - BoxerPred( - self.discourse_id, - sent_index, - word_indices, - arg, - f"date_year_{year}", - "a", - 0, - ) - ) - self.assertToken(self.token(), ",") - - ((sent_index, word_indices),) = self._sent_and_word_indices( - self._parse_index_list() - ) - month = self.token() - if month != "XX": - conds.append( - BoxerPred( - self.discourse_id, - sent_index, - word_indices, - arg, - f"date_month_{month}", - "a", - 0, - ) - ) - self.assertToken(self.token(), ",") - - ((sent_index, word_indices),) = self._sent_and_word_indices( - self._parse_index_list() - ) - day = self.token() - if day != "XX": - conds.append( - BoxerPred( - self.discourse_id, - sent_index, - word_indices, - arg, - f"date_day_{day}", - "a", - 0, - ) - ) - - return conds - - def _handle_time(self, arg): - # time([1018]:'18', []:'XX', []:'XX') - conds = [] - self._parse_index_list() - hour = self.token() - if hour != "XX": - 
conds.append(self._make_atom("r_hour_2", arg, hour)) - self.assertToken(self.token(), ",") - - self._parse_index_list() - min = self.token() - if min != "XX": - conds.append(self._make_atom("r_min_2", arg, min)) - self.assertToken(self.token(), ",") - - self._parse_index_list() - sec = self.token() - if sec != "XX": - conds.append(self._make_atom("r_sec_2", arg, sec)) - - return conds - - def _handle_card(self): - # card(_G18535, 28, ge) - self.assertToken(self.token(), "(") - variable = self.parse_variable() - self.assertToken(self.token(), ",") - value = self.token() - self.assertToken(self.token(), ",") - type = self.token() - self.assertToken(self.token(), ")") - return lambda sent_index, word_indices: BoxerCard( - self.discourse_id, sent_index, word_indices, variable, value, type - ) - - def _handle_prop(self): - # prop(_G15949, drs(...)) - self.assertToken(self.token(), "(") - variable = self.parse_variable() - self.assertToken(self.token(), ",") - drs = self.process_next_expression(None) - self.assertToken(self.token(), ")") - return lambda sent_index, word_indices: BoxerProp( - self.discourse_id, sent_index, word_indices, variable, drs - ) - - def _parse_index_list(self): - # [1001,1002]: - indices = [] - self.assertToken(self.token(), "[") - while self.token(0) != "]": - indices.append(self.parse_index()) - if self.token(0) == ",": - self.token() # swallow ',' - self.token() # swallow ']' - self.assertToken(self.token(), ":") - return indices - - def parse_drs(self): - # drs([[1001]:_G3943], - # [[1002]:pred(_G3943, dog, n, 0)] - # ) - self.assertToken(self.token(), "(") - self.assertToken(self.token(), "[") - refs = set() - while self.token(0) != "]": - indices = self._parse_index_list() - refs.add(self.parse_variable()) - if self.token(0) == ",": - self.token() # swallow ',' - self.token() # swallow ']' - self.assertToken(self.token(), ",") - self.assertToken(self.token(), "[") - conds = [] - while self.token(0) != "]": - indices = self._parse_index_list() - conds.extend(self.parse_condition(indices)) - if self.token(0) == ",": - self.token() # swallow ',' - self.token() # swallow ']' - self.assertToken(self.token(), ")") - return BoxerDrs(list(refs), conds) - - def _handle_binary_expression(self, make_callback): - self.assertToken(self.token(), "(") - drs1 = self.process_next_expression(None) - self.assertToken(self.token(), ",") - drs2 = self.process_next_expression(None) - self.assertToken(self.token(), ")") - return lambda sent_index, word_indices: make_callback( - sent_index, word_indices, drs1, drs2 - ) - - def _handle_alfa(self, make_callback): - self.assertToken(self.token(), "(") - type = self.token() - self.assertToken(self.token(), ",") - drs1 = self.process_next_expression(None) - self.assertToken(self.token(), ",") - drs2 = self.process_next_expression(None) - self.assertToken(self.token(), ")") - return lambda sent_index, word_indices: make_callback( - sent_index, word_indices, drs1, drs2 - ) - - def _handle_eq(self): - self.assertToken(self.token(), "(") - var1 = self.parse_variable() - self.assertToken(self.token(), ",") - var2 = self.parse_variable() - self.assertToken(self.token(), ")") - return lambda sent_index, word_indices: BoxerEq( - self.discourse_id, sent_index, word_indices, var1, var2 - ) - - def _handle_whq(self): - self.assertToken(self.token(), "(") - self.assertToken(self.token(), "[") - ans_types = [] - while self.token(0) != "]": - cat = self.token() - self.assertToken(self.token(), ":") - if cat == "des": - ans_types.append(self.token()) - elif 
cat == "num": - ans_types.append("number") - typ = self.token() - if typ == "cou": - ans_types.append("count") - else: - ans_types.append(typ) - else: - ans_types.append(self.token()) - self.token() # swallow the ']' - - self.assertToken(self.token(), ",") - d1 = self.process_next_expression(None) - self.assertToken(self.token(), ",") - ref = self.parse_variable() - self.assertToken(self.token(), ",") - d2 = self.process_next_expression(None) - self.assertToken(self.token(), ")") - return lambda sent_index, word_indices: BoxerWhq( - self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2 - ) - - def _make_merge_expression(self, sent_index, word_indices, drs1, drs2): - return BoxerDrs(drs1.refs + drs2.refs, drs1.conds + drs2.conds) - - def _make_or_expression(self, sent_index, word_indices, drs1, drs2): - return BoxerOr(self.discourse_id, sent_index, word_indices, drs1, drs2) - - def _make_imp_expression(self, sent_index, word_indices, drs1, drs2): - return BoxerDrs(drs1.refs, drs1.conds, drs2) - - def parse_variable(self): - var = self.token() - assert re.match(r"^[exps]\d+$", var), var - return var - - def parse_index(self): - return int(self.token()) - - def _sent_and_word_indices(self, indices): - """ - :return: list of (sent_index, word_indices) tuples - """ - sent_indices = {(i / 1000) - 1 for i in indices if i >= 0} - if sent_indices: - pairs = [] - for sent_index in sent_indices: - word_indices = [ - (i % 1000) - 1 for i in indices if sent_index == (i / 1000) - 1 - ] - pairs.append((sent_index, word_indices)) - return pairs - else: - word_indices = [(i % 1000) - 1 for i in indices] - return [(None, word_indices)] - - -class BoxerDrsParser(DrtParser): - """ - Reparse the str form of subclasses of ``AbstractBoxerDrs`` - """ - - def __init__(self, discourse_id=None): - DrtParser.__init__(self) - self.discourse_id = discourse_id - - def get_all_symbols(self): - return [ - DrtTokens.OPEN, - DrtTokens.CLOSE, - DrtTokens.COMMA, - DrtTokens.OPEN_BRACKET, - DrtTokens.CLOSE_BRACKET, - ] - - def attempt_adjuncts(self, expression, context): - return expression - - def handle(self, tok, context): - try: - # if tok == 'drs': - # self.assertNextToken(DrtTokens.OPEN) - # label = int(self.token()) - # self.assertNextToken(DrtTokens.COMMA) - # refs = list(map(int, self.handle_refs())) - # self.assertNextToken(DrtTokens.COMMA) - # conds = self.handle_conds(None) - # self.assertNextToken(DrtTokens.CLOSE) - # return BoxerDrs(label, refs, conds) - if tok == "pred": - self.assertNextToken(DrtTokens.OPEN) - disc_id = ( - self.discourse_id if self.discourse_id is not None else self.token() - ) - self.assertNextToken(DrtTokens.COMMA) - sent_id = self.nullableIntToken() - self.assertNextToken(DrtTokens.COMMA) - word_ids = list(map(int, self.handle_refs())) - self.assertNextToken(DrtTokens.COMMA) - variable = int(self.token()) - self.assertNextToken(DrtTokens.COMMA) - name = self.token() - self.assertNextToken(DrtTokens.COMMA) - pos = self.token() - self.assertNextToken(DrtTokens.COMMA) - sense = int(self.token()) - self.assertNextToken(DrtTokens.CLOSE) - return BoxerPred(disc_id, sent_id, word_ids, variable, name, pos, sense) - elif tok == "named": - self.assertNextToken(DrtTokens.OPEN) - disc_id = ( - self.discourse_id if self.discourse_id is not None else self.token() - ) - self.assertNextToken(DrtTokens.COMMA) - sent_id = int(self.token()) - self.assertNextToken(DrtTokens.COMMA) - word_ids = map(int, self.handle_refs()) - self.assertNextToken(DrtTokens.COMMA) - variable = int(self.token()) - 
self.assertNextToken(DrtTokens.COMMA) - name = self.token() - self.assertNextToken(DrtTokens.COMMA) - type = self.token() - self.assertNextToken(DrtTokens.COMMA) - sense = int(self.token()) - self.assertNextToken(DrtTokens.CLOSE) - return BoxerNamed( - disc_id, sent_id, word_ids, variable, name, type, sense - ) - elif tok == "rel": - self.assertNextToken(DrtTokens.OPEN) - disc_id = ( - self.discourse_id if self.discourse_id is not None else self.token() - ) - self.assertNextToken(DrtTokens.COMMA) - sent_id = self.nullableIntToken() - self.assertNextToken(DrtTokens.COMMA) - word_ids = list(map(int, self.handle_refs())) - self.assertNextToken(DrtTokens.COMMA) - var1 = int(self.token()) - self.assertNextToken(DrtTokens.COMMA) - var2 = int(self.token()) - self.assertNextToken(DrtTokens.COMMA) - rel = self.token() - self.assertNextToken(DrtTokens.COMMA) - sense = int(self.token()) - self.assertNextToken(DrtTokens.CLOSE) - return BoxerRel(disc_id, sent_id, word_ids, var1, var2, rel, sense) - elif tok == "prop": - self.assertNextToken(DrtTokens.OPEN) - disc_id = ( - self.discourse_id if self.discourse_id is not None else self.token() - ) - self.assertNextToken(DrtTokens.COMMA) - sent_id = int(self.token()) - self.assertNextToken(DrtTokens.COMMA) - word_ids = list(map(int, self.handle_refs())) - self.assertNextToken(DrtTokens.COMMA) - variable = int(self.token()) - self.assertNextToken(DrtTokens.COMMA) - drs = self.process_next_expression(None) - self.assertNextToken(DrtTokens.CLOSE) - return BoxerProp(disc_id, sent_id, word_ids, variable, drs) - elif tok == "not": - self.assertNextToken(DrtTokens.OPEN) - drs = self.process_next_expression(None) - self.assertNextToken(DrtTokens.CLOSE) - return BoxerNot(drs) - elif tok == "imp": - self.assertNextToken(DrtTokens.OPEN) - drs1 = self.process_next_expression(None) - self.assertNextToken(DrtTokens.COMMA) - drs2 = self.process_next_expression(None) - self.assertNextToken(DrtTokens.CLOSE) - return BoxerDrs(drs1.refs, drs1.conds, drs2) - elif tok == "or": - self.assertNextToken(DrtTokens.OPEN) - disc_id = ( - self.discourse_id if self.discourse_id is not None else self.token() - ) - self.assertNextToken(DrtTokens.COMMA) - sent_id = self.nullableIntToken() - self.assertNextToken(DrtTokens.COMMA) - word_ids = map(int, self.handle_refs()) - self.assertNextToken(DrtTokens.COMMA) - drs1 = self.process_next_expression(None) - self.assertNextToken(DrtTokens.COMMA) - drs2 = self.process_next_expression(None) - self.assertNextToken(DrtTokens.CLOSE) - return BoxerOr(disc_id, sent_id, word_ids, drs1, drs2) - elif tok == "eq": - self.assertNextToken(DrtTokens.OPEN) - disc_id = ( - self.discourse_id if self.discourse_id is not None else self.token() - ) - self.assertNextToken(DrtTokens.COMMA) - sent_id = self.nullableIntToken() - self.assertNextToken(DrtTokens.COMMA) - word_ids = list(map(int, self.handle_refs())) - self.assertNextToken(DrtTokens.COMMA) - var1 = int(self.token()) - self.assertNextToken(DrtTokens.COMMA) - var2 = int(self.token()) - self.assertNextToken(DrtTokens.CLOSE) - return BoxerEq(disc_id, sent_id, word_ids, var1, var2) - elif tok == "card": - self.assertNextToken(DrtTokens.OPEN) - disc_id = ( - self.discourse_id if self.discourse_id is not None else self.token() - ) - self.assertNextToken(DrtTokens.COMMA) - sent_id = self.nullableIntToken() - self.assertNextToken(DrtTokens.COMMA) - word_ids = map(int, self.handle_refs()) - self.assertNextToken(DrtTokens.COMMA) - var = int(self.token()) - self.assertNextToken(DrtTokens.COMMA) - value = self.token() 
- self.assertNextToken(DrtTokens.COMMA) - type = self.token() - self.assertNextToken(DrtTokens.CLOSE) - return BoxerCard(disc_id, sent_id, word_ids, var, value, type) - elif tok == "whq": - self.assertNextToken(DrtTokens.OPEN) - disc_id = ( - self.discourse_id if self.discourse_id is not None else self.token() - ) - self.assertNextToken(DrtTokens.COMMA) - sent_id = self.nullableIntToken() - self.assertNextToken(DrtTokens.COMMA) - word_ids = list(map(int, self.handle_refs())) - self.assertNextToken(DrtTokens.COMMA) - ans_types = self.handle_refs() - self.assertNextToken(DrtTokens.COMMA) - drs1 = self.process_next_expression(None) - self.assertNextToken(DrtTokens.COMMA) - var = int(self.token()) - self.assertNextToken(DrtTokens.COMMA) - drs2 = self.process_next_expression(None) - self.assertNextToken(DrtTokens.CLOSE) - return BoxerWhq(disc_id, sent_id, word_ids, ans_types, drs1, var, drs2) - except Exception as e: - raise LogicalExpressionException(self._currentIndex, str(e)) from e - assert False, repr(tok) - - def nullableIntToken(self): - t = self.token() - return int(t) if t != "None" else None - - def get_next_token_variable(self, description): - try: - return self.token() - except ExpectedMoreTokensException as e: - raise ExpectedMoreTokensException(e.index, "Variable expected.") from e - - -class AbstractBoxerDrs: - def variables(self): - """ - :return: (set, set, set) - """ - variables, events, propositions = self._variables() - return (variables - (events | propositions), events, propositions - events) - - def variable_types(self): - vartypes = {} - for t, vars in zip(("z", "e", "p"), self.variables()): - for v in vars: - vartypes[v] = t - return vartypes - - def _variables(self): - """ - :return: (set, set, set) - """ - return (set(), set(), set()) - - def atoms(self): - return set() - - def clean(self): - return self - - def _clean_name(self, name): - return name.replace("-", "_").replace("'", "_") - - def renumber_sentences(self, f): - return self - - def __hash__(self): - return hash(f"{self}") - - -class BoxerDrs(AbstractBoxerDrs): - def __init__(self, refs, conds, consequent=None): - AbstractBoxerDrs.__init__(self) - self.refs = refs - self.conds = conds - self.consequent = consequent - - def _variables(self): - variables = (set(), set(), set()) - for cond in self.conds: - for s, v in zip(variables, cond._variables()): - s.update(v) - if self.consequent is not None: - for s, v in zip(variables, self.consequent._variables()): - s.update(v) - return variables - - def atoms(self): - atoms = reduce(operator.or_, (cond.atoms() for cond in self.conds), set()) - if self.consequent is not None: - atoms.update(self.consequent.atoms()) - return atoms - - def clean(self): - consequent = self.consequent.clean() if self.consequent else None - return BoxerDrs(self.refs, [c.clean() for c in self.conds], consequent) - - def renumber_sentences(self, f): - consequent = self.consequent.renumber_sentences(f) if self.consequent else None - return BoxerDrs( - self.refs, [c.renumber_sentences(f) for c in self.conds], consequent - ) - - def __repr__(self): - s = "drs([{}], [{}])".format( - ", ".join("%s" % r for r in self.refs), - ", ".join("%s" % c for c in self.conds), - ) - if self.consequent is not None: - s = f"imp({s}, {self.consequent})" - return s - - def __eq__(self, other): - return ( - self.__class__ == other.__class__ - and self.refs == other.refs - and len(self.conds) == len(other.conds) - and reduce( - operator.and_, (c1 == c2 for c1, c2 in zip(self.conds, other.conds)) - ) - and 
self.consequent == other.consequent - ) - - def __ne__(self, other): - return not self == other - - __hash__ = AbstractBoxerDrs.__hash__ - - -class BoxerNot(AbstractBoxerDrs): - def __init__(self, drs): - AbstractBoxerDrs.__init__(self) - self.drs = drs - - def _variables(self): - return self.drs._variables() - - def atoms(self): - return self.drs.atoms() - - def clean(self): - return BoxerNot(self.drs.clean()) - - def renumber_sentences(self, f): - return BoxerNot(self.drs.renumber_sentences(f)) - - def __repr__(self): - return "not(%s)" % (self.drs) - - def __eq__(self, other): - return self.__class__ == other.__class__ and self.drs == other.drs - - def __ne__(self, other): - return not self == other - - __hash__ = AbstractBoxerDrs.__hash__ - - -class BoxerIndexed(AbstractBoxerDrs): - def __init__(self, discourse_id, sent_index, word_indices): - AbstractBoxerDrs.__init__(self) - self.discourse_id = discourse_id - self.sent_index = sent_index - self.word_indices = word_indices - - def atoms(self): - return {self} - - def __eq__(self, other): - return ( - self.__class__ == other.__class__ - and self.discourse_id == other.discourse_id - and self.sent_index == other.sent_index - and self.word_indices == other.word_indices - and reduce(operator.and_, (s == o for s, o in zip(self, other))) - ) - - def __ne__(self, other): - return not self == other - - __hash__ = AbstractBoxerDrs.__hash__ - - def __repr__(self): - s = "{}({}, {}, [{}]".format( - self._pred(), - self.discourse_id, - self.sent_index, - ", ".join("%s" % wi for wi in self.word_indices), - ) - for v in self: - s += ", %s" % v - return s + ")" - - -class BoxerPred(BoxerIndexed): - def __init__(self, discourse_id, sent_index, word_indices, var, name, pos, sense): - BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) - self.var = var - self.name = name - self.pos = pos - self.sense = sense - - def _variables(self): - return ({self.var}, set(), set()) - - def change_var(self, var): - return BoxerPred( - self.discourse_id, - self.sent_index, - self.word_indices, - var, - self.name, - self.pos, - self.sense, - ) - - def clean(self): - return BoxerPred( - self.discourse_id, - self.sent_index, - self.word_indices, - self.var, - self._clean_name(self.name), - self.pos, - self.sense, - ) - - def renumber_sentences(self, f): - new_sent_index = f(self.sent_index) - return BoxerPred( - self.discourse_id, - new_sent_index, - self.word_indices, - self.var, - self.name, - self.pos, - self.sense, - ) - - def __iter__(self): - return iter((self.var, self.name, self.pos, self.sense)) - - def _pred(self): - return "pred" - - -class BoxerNamed(BoxerIndexed): - def __init__(self, discourse_id, sent_index, word_indices, var, name, type, sense): - BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) - self.var = var - self.name = name - self.type = type - self.sense = sense - - def _variables(self): - return ({self.var}, set(), set()) - - def change_var(self, var): - return BoxerNamed( - self.discourse_id, - self.sent_index, - self.word_indices, - var, - self.name, - self.type, - self.sense, - ) - - def clean(self): - return BoxerNamed( - self.discourse_id, - self.sent_index, - self.word_indices, - self.var, - self._clean_name(self.name), - self.type, - self.sense, - ) - - def renumber_sentences(self, f): - return BoxerNamed( - self.discourse_id, - f(self.sent_index), - self.word_indices, - self.var, - self.name, - self.type, - self.sense, - ) - - def __iter__(self): - return iter((self.var, self.name, self.type, 
self.sense)) - - def _pred(self): - return "named" - - -class BoxerRel(BoxerIndexed): - def __init__(self, discourse_id, sent_index, word_indices, var1, var2, rel, sense): - BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) - self.var1 = var1 - self.var2 = var2 - self.rel = rel - self.sense = sense - - def _variables(self): - return ({self.var1, self.var2}, set(), set()) - - def clean(self): - return BoxerRel( - self.discourse_id, - self.sent_index, - self.word_indices, - self.var1, - self.var2, - self._clean_name(self.rel), - self.sense, - ) - - def renumber_sentences(self, f): - return BoxerRel( - self.discourse_id, - f(self.sent_index), - self.word_indices, - self.var1, - self.var2, - self.rel, - self.sense, - ) - - def __iter__(self): - return iter((self.var1, self.var2, self.rel, self.sense)) - - def _pred(self): - return "rel" - - -class BoxerProp(BoxerIndexed): - def __init__(self, discourse_id, sent_index, word_indices, var, drs): - BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) - self.var = var - self.drs = drs - - def _variables(self): - return tuple( - map(operator.or_, (set(), set(), {self.var}), self.drs._variables()) - ) - - def referenced_labels(self): - return {self.drs} - - def atoms(self): - return self.drs.atoms() - - def clean(self): - return BoxerProp( - self.discourse_id, - self.sent_index, - self.word_indices, - self.var, - self.drs.clean(), - ) - - def renumber_sentences(self, f): - return BoxerProp( - self.discourse_id, - f(self.sent_index), - self.word_indices, - self.var, - self.drs.renumber_sentences(f), - ) - - def __iter__(self): - return iter((self.var, self.drs)) - - def _pred(self): - return "prop" - - -class BoxerEq(BoxerIndexed): - def __init__(self, discourse_id, sent_index, word_indices, var1, var2): - BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) - self.var1 = var1 - self.var2 = var2 - - def _variables(self): - return ({self.var1, self.var2}, set(), set()) - - def atoms(self): - return set() - - def renumber_sentences(self, f): - return BoxerEq( - self.discourse_id, - f(self.sent_index), - self.word_indices, - self.var1, - self.var2, - ) - - def __iter__(self): - return iter((self.var1, self.var2)) - - def _pred(self): - return "eq" - - -class BoxerCard(BoxerIndexed): - def __init__(self, discourse_id, sent_index, word_indices, var, value, type): - BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) - self.var = var - self.value = value - self.type = type - - def _variables(self): - return ({self.var}, set(), set()) - - def renumber_sentences(self, f): - return BoxerCard( - self.discourse_id, - f(self.sent_index), - self.word_indices, - self.var, - self.value, - self.type, - ) - - def __iter__(self): - return iter((self.var, self.value, self.type)) - - def _pred(self): - return "card" - - -class BoxerOr(BoxerIndexed): - def __init__(self, discourse_id, sent_index, word_indices, drs1, drs2): - BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) - self.drs1 = drs1 - self.drs2 = drs2 - - def _variables(self): - return tuple(map(operator.or_, self.drs1._variables(), self.drs2._variables())) - - def atoms(self): - return self.drs1.atoms() | self.drs2.atoms() - - def clean(self): - return BoxerOr( - self.discourse_id, - self.sent_index, - self.word_indices, - self.drs1.clean(), - self.drs2.clean(), - ) - - def renumber_sentences(self, f): - return BoxerOr( - self.discourse_id, - f(self.sent_index), - self.word_indices, - self.drs1, - self.drs2, - ) - - def 
__iter__(self): - return iter((self.drs1, self.drs2)) - - def _pred(self): - return "or" - - -class BoxerWhq(BoxerIndexed): - def __init__( - self, discourse_id, sent_index, word_indices, ans_types, drs1, variable, drs2 - ): - BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) - self.ans_types = ans_types - self.drs1 = drs1 - self.variable = variable - self.drs2 = drs2 - - def _variables(self): - return tuple( - map( - operator.or_, - ({self.variable}, set(), set()), - self.drs1._variables(), - self.drs2._variables(), - ) - ) - - def atoms(self): - return self.drs1.atoms() | self.drs2.atoms() - - def clean(self): - return BoxerWhq( - self.discourse_id, - self.sent_index, - self.word_indices, - self.ans_types, - self.drs1.clean(), - self.variable, - self.drs2.clean(), - ) - - def renumber_sentences(self, f): - return BoxerWhq( - self.discourse_id, - f(self.sent_index), - self.word_indices, - self.ans_types, - self.drs1, - self.variable, - self.drs2, - ) - - def __iter__(self): - return iter( - ("[" + ",".join(self.ans_types) + "]", self.drs1, self.variable, self.drs2) - ) - - def _pred(self): - return "whq" - - -class PassthroughBoxerDrsInterpreter: - def interpret(self, ex): - return ex - - -class NltkDrtBoxerDrsInterpreter: - def __init__(self, occur_index=False): - self._occur_index = occur_index - - def interpret(self, ex): - """ - :param ex: ``AbstractBoxerDrs`` - :return: ``DrtExpression`` - """ - if isinstance(ex, BoxerDrs): - drs = DRS( - [Variable(r) for r in ex.refs], list(map(self.interpret, ex.conds)) - ) - if ex.consequent is not None: - drs.consequent = self.interpret(ex.consequent) - return drs - elif isinstance(ex, BoxerNot): - return DrtNegatedExpression(self.interpret(ex.drs)) - elif isinstance(ex, BoxerPred): - pred = self._add_occur_indexing(f"{ex.pos}_{ex.name}", ex) - return self._make_atom(pred, ex.var) - elif isinstance(ex, BoxerNamed): - pred = self._add_occur_indexing(f"ne_{ex.type}_{ex.name}", ex) - return self._make_atom(pred, ex.var) - elif isinstance(ex, BoxerRel): - pred = self._add_occur_indexing("%s" % (ex.rel), ex) - return self._make_atom(pred, ex.var1, ex.var2) - elif isinstance(ex, BoxerProp): - return DrtProposition(Variable(ex.var), self.interpret(ex.drs)) - elif isinstance(ex, BoxerEq): - return DrtEqualityExpression( - DrtVariableExpression(Variable(ex.var1)), - DrtVariableExpression(Variable(ex.var2)), - ) - elif isinstance(ex, BoxerCard): - pred = self._add_occur_indexing(f"card_{ex.type}_{ex.value}", ex) - return self._make_atom(pred, ex.var) - elif isinstance(ex, BoxerOr): - return DrtOrExpression(self.interpret(ex.drs1), self.interpret(ex.drs2)) - elif isinstance(ex, BoxerWhq): - drs1 = self.interpret(ex.drs1) - drs2 = self.interpret(ex.drs2) - return DRS(drs1.refs + drs2.refs, drs1.conds + drs2.conds) - assert False, f"{ex.__class__.__name__}: {ex}" - - def _make_atom(self, pred, *args): - accum = DrtVariableExpression(Variable(pred)) - for arg in args: - accum = DrtApplicationExpression( - accum, DrtVariableExpression(Variable(arg)) - ) - return accum - - def _add_occur_indexing(self, base, ex): - if self._occur_index and ex.sent_index is not None: - if ex.discourse_id: - base += "_%s" % ex.discourse_id - base += "_s%s" % ex.sent_index - base += "_w%s" % sorted(ex.word_indices)[0] - return base - - -class UnparseableInputException(Exception): - pass - - -if __name__ == "__main__": - opts = OptionParser("usage: %prog TEXT [options]") - opts.add_option( - "--verbose", - "-v", - help="display verbose logs", - action="store_true", 
- default=False, - dest="verbose", - ) - opts.add_option( - "--fol", "-f", help="output FOL", action="store_true", default=False, dest="fol" - ) - opts.add_option( - "--question", - "-q", - help="input is a question", - action="store_true", - default=False, - dest="question", - ) - opts.add_option( - "--occur", - "-o", - help="occurrence index", - action="store_true", - default=False, - dest="occur_index", - ) - (options, args) = opts.parse_args() - - if len(args) != 1: - opts.error("incorrect number of arguments") - - interpreter = NltkDrtBoxerDrsInterpreter(occur_index=options.occur_index) - drs = Boxer(interpreter).interpret_multi( - args[0].split(r"\n"), question=options.question, verbose=options.verbose - ) - if drs is None: - print(None) - else: - drs = drs.simplify().eliminate_equality() - if options.fol: - print(drs.fol().normalize()) - else: - drs.pretty_print() diff --git a/pipeline/nltk/sem/chat80.py b/pipeline/nltk/sem/chat80.py deleted file mode 100644 index 3d1e77a49f19b2e4414f66741570cdb033ec7ca6..0000000000000000000000000000000000000000 --- a/pipeline/nltk/sem/chat80.py +++ /dev/null @@ -1,857 +0,0 @@ -# Natural Language Toolkit: Chat-80 KB Reader -# See https://www.w3.org/TR/swbp-skos-core-guide/ -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Ewan Klein , -# URL: -# For license information, see LICENSE.TXT - -r""" -Overview -======== - -Chat-80 was a natural language system which allowed the user to -interrogate a Prolog knowledge base in the domain of world -geography. It was developed in the early '80s by Warren and Pereira; see -``https://www.aclweb.org/anthology/J82-3002.pdf`` for a description and -``http://www.cis.upenn.edu/~pereira/oldies.html`` for the source -files. - -This module contains functions to extract data from the Chat-80 -relation files ('the world database'), and convert then into a format -that can be incorporated in the FOL models of -``nltk.sem.evaluate``. The code assumes that the Prolog -input files are available in the NLTK corpora directory. - -The Chat-80 World Database consists of the following files:: - - world0.pl - rivers.pl - cities.pl - countries.pl - contain.pl - borders.pl - -This module uses a slightly modified version of ``world0.pl``, in which -a set of Prolog rules have been omitted. The modified file is named -``world1.pl``. Currently, the file ``rivers.pl`` is not read in, since -it uses a list rather than a string in the second field. - -Reading Chat-80 Files -===================== - -Chat-80 relations are like tables in a relational database. The -relation acts as the name of the table; the first argument acts as the -'primary key'; and subsequent arguments are further fields in the -table. In general, the name of the table provides a label for a unary -predicate whose extension is all the primary keys. For example, -relations in ``cities.pl`` are of the following form:: - - 'city(athens,greece,1368).' - -Here, ``'athens'`` is the key, and will be mapped to a member of the -unary predicate *city*. - -The fields in the table are mapped to binary predicates. The first -argument of the predicate is the primary key, while the second -argument is the data in the relevant field. Thus, in the above -example, the third field is mapped to the binary predicate -*population_of*, whose extension is a set of pairs such as -``'(athens, 1368)'``. - -An exception to this general framework is required by the relations in -the files ``borders.pl`` and ``contains.pl``. 
These contain facts of the -following form:: - - 'borders(albania,greece).' - - 'contains0(africa,central_africa).' - -We do not want to form a unary concept out the element in -the first field of these records, and we want the label of the binary -relation just to be ``'border'``/``'contain'`` respectively. - -In order to drive the extraction process, we use 'relation metadata bundles' -which are Python dictionaries such as the following:: - - city = {'label': 'city', - 'closures': [], - 'schema': ['city', 'country', 'population'], - 'filename': 'cities.pl'} - -According to this, the file ``city['filename']`` contains a list of -relational tuples (or more accurately, the corresponding strings in -Prolog form) whose predicate symbol is ``city['label']`` and whose -relational schema is ``city['schema']``. The notion of a ``closure`` is -discussed in the next section. - -Concepts -======== -In order to encapsulate the results of the extraction, a class of -``Concept`` objects is introduced. A ``Concept`` object has a number of -attributes, in particular a ``prefLabel`` and ``extension``, which make -it easier to inspect the output of the extraction. In addition, the -``extension`` can be further processed: in the case of the ``'border'`` -relation, we check that the relation is symmetric, and in the case -of the ``'contain'`` relation, we carry out the transitive -closure. The closure properties associated with a concept is -indicated in the relation metadata, as indicated earlier. - -The ``extension`` of a ``Concept`` object is then incorporated into a -``Valuation`` object. - -Persistence -=========== -The functions ``val_dump`` and ``val_load`` are provided to allow a -valuation to be stored in a persistent database and re-loaded, rather -than having to be re-computed each time. - -Individuals and Lexical Items -============================= -As well as deriving relations from the Chat-80 data, we also create a -set of individual constants, one for each entity in the domain. The -individual constants are string-identical to the entities. For -example, given a data item such as ``'zloty'``, we add to the valuation -a pair ``('zloty', 'zloty')``. In order to parse English sentences that -refer to these entities, we also create a lexical item such as the -following for each individual constant:: - - PropN[num=sg, sem=<\P.(P zloty)>] -> 'Zloty' - -The set of rules is written to the file ``chat_pnames.cfg`` in the -current directory. 
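For illustration, a minimal sketch of how one of these metadata bundles drives the extraction, written against the upstream nltk.sem.chat80 module that this vendored copy mirrors (assumes nltk is installed and the Chat-80 corpus has been fetched, e.g. via nltk.download('chat80'); that download name is an assumption):

from nltk.sem import chat80

# cities.pl holds facts such as city(athens,greece,1368).  The schema below
# yields one unary concept ('city', over the primary keys) plus binary
# concepts for the remaining fields ('country_of', 'population_of').
concepts = chat80.clause2concepts(
    "cities.pl", "city", ["city", "country", "population"]
)
for c in concepts:
    print(c.prefLabel, c.arity, c.extension[:3])

# The full set of bundles can be folded into a single Valuation:
val = chat80.make_valuation(chat80.process_bundle(chat80.rels).values(), read=True)
print(len(val))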
- -""" - -import os -import re -import shelve -import sys - -import nltk.data - -########################################################################### -# Chat-80 relation metadata bundles needed to build the valuation -########################################################################### - -borders = { - "rel_name": "borders", - "closures": ["symmetric"], - "schema": ["region", "border"], - "filename": "borders.pl", -} - -contains = { - "rel_name": "contains0", - "closures": ["transitive"], - "schema": ["region", "contain"], - "filename": "contain.pl", -} - -city = { - "rel_name": "city", - "closures": [], - "schema": ["city", "country", "population"], - "filename": "cities.pl", -} - -country = { - "rel_name": "country", - "closures": [], - "schema": [ - "country", - "region", - "latitude", - "longitude", - "area", - "population", - "capital", - "currency", - ], - "filename": "countries.pl", -} - -circle_of_lat = { - "rel_name": "circle_of_latitude", - "closures": [], - "schema": ["circle_of_latitude", "degrees"], - "filename": "world1.pl", -} - -circle_of_long = { - "rel_name": "circle_of_longitude", - "closures": [], - "schema": ["circle_of_longitude", "degrees"], - "filename": "world1.pl", -} - -continent = { - "rel_name": "continent", - "closures": [], - "schema": ["continent"], - "filename": "world1.pl", -} - -region = { - "rel_name": "in_continent", - "closures": [], - "schema": ["region", "continent"], - "filename": "world1.pl", -} - -ocean = { - "rel_name": "ocean", - "closures": [], - "schema": ["ocean"], - "filename": "world1.pl", -} - -sea = {"rel_name": "sea", "closures": [], "schema": ["sea"], "filename": "world1.pl"} - - -items = [ - "borders", - "contains", - "city", - "country", - "circle_of_lat", - "circle_of_long", - "continent", - "region", - "ocean", - "sea", -] -items = tuple(sorted(items)) - -item_metadata = { - "borders": borders, - "contains": contains, - "city": city, - "country": country, - "circle_of_lat": circle_of_lat, - "circle_of_long": circle_of_long, - "continent": continent, - "region": region, - "ocean": ocean, - "sea": sea, -} - -rels = item_metadata.values() - -not_unary = ["borders.pl", "contain.pl"] - -########################################################################### - - -class Concept: - """ - A Concept class, loosely based on SKOS - (https://www.w3.org/TR/swbp-skos-core-guide/). 
- """ - - def __init__(self, prefLabel, arity, altLabels=[], closures=[], extension=set()): - """ - :param prefLabel: the preferred label for the concept - :type prefLabel: str - :param arity: the arity of the concept - :type arity: int - :param altLabels: other (related) labels - :type altLabels: list - :param closures: closure properties of the extension - (list items can be ``symmetric``, ``reflexive``, ``transitive``) - :type closures: list - :param extension: the extensional value of the concept - :type extension: set - """ - self.prefLabel = prefLabel - self.arity = arity - self.altLabels = altLabels - self.closures = closures - # keep _extension internally as a set - self._extension = extension - # public access is via a list (for slicing) - self.extension = sorted(list(extension)) - - def __str__(self): - # _extension = '' - # for element in sorted(self.extension): - # if isinstance(element, tuple): - # element = '(%s, %s)' % (element) - # _extension += element + ', ' - # _extension = _extension[:-1] - - return "Label = '{}'\nArity = {}\nExtension = {}".format( - self.prefLabel, - self.arity, - self.extension, - ) - - def __repr__(self): - return "Concept('%s')" % self.prefLabel - - def augment(self, data): - """ - Add more data to the ``Concept``'s extension set. - - :param data: a new semantic value - :type data: string or pair of strings - :rtype: set - - """ - self._extension.add(data) - self.extension = sorted(list(self._extension)) - return self._extension - - def _make_graph(self, s): - """ - Convert a set of pairs into an adjacency linked list encoding of a graph. - """ - g = {} - for (x, y) in s: - if x in g: - g[x].append(y) - else: - g[x] = [y] - return g - - def _transclose(self, g): - """ - Compute the transitive closure of a graph represented as a linked list. - """ - for x in g: - for adjacent in g[x]: - # check that adjacent is a key - if adjacent in g: - for y in g[adjacent]: - if y not in g[x]: - g[x].append(y) - return g - - def _make_pairs(self, g): - """ - Convert an adjacency linked list back into a set of pairs. - """ - pairs = [] - for node in g: - for adjacent in g[node]: - pairs.append((node, adjacent)) - return set(pairs) - - def close(self): - """ - Close a binary relation in the ``Concept``'s extension set. - - :return: a new extension for the ``Concept`` in which the - relation is closed under a given property - """ - from nltk.sem import is_rel - - assert is_rel(self._extension) - if "symmetric" in self.closures: - pairs = [] - for (x, y) in self._extension: - pairs.append((y, x)) - sym = set(pairs) - self._extension = self._extension.union(sym) - if "transitive" in self.closures: - all = self._make_graph(self._extension) - closed = self._transclose(all) - trans = self._make_pairs(closed) - self._extension = self._extension.union(trans) - self.extension = sorted(list(self._extension)) - - -def clause2concepts(filename, rel_name, schema, closures=[]): - """ - Convert a file of Prolog clauses into a list of ``Concept`` objects. 
- - :param filename: filename containing the relations - :type filename: str - :param rel_name: name of the relation - :type rel_name: str - :param schema: the schema used in a set of relational tuples - :type schema: list - :param closures: closure properties for the extension of the concept - :type closures: list - :return: a list of ``Concept`` objects - :rtype: list - """ - concepts = [] - # position of the subject of a binary relation - subj = 0 - # label of the 'primary key' - pkey = schema[0] - # fields other than the primary key - fields = schema[1:] - - # convert a file into a list of lists - records = _str2records(filename, rel_name) - - # add a unary concept corresponding to the set of entities - # in the primary key position - # relations in 'not_unary' are more like ordinary binary relations - if not filename in not_unary: - concepts.append(unary_concept(pkey, subj, records)) - - # add a binary concept for each non-key field - for field in fields: - obj = schema.index(field) - concepts.append(binary_concept(field, closures, subj, obj, records)) - - return concepts - - -def cities2table(filename, rel_name, dbname, verbose=False, setup=False): - """ - Convert a file of Prolog clauses into a database table. - - This is not generic, since it doesn't allow arbitrary - schemas to be set as a parameter. - - Intended usage:: - - cities2table('cities.pl', 'city', 'city.db', verbose=True, setup=True) - - :param filename: filename containing the relations - :type filename: str - :param rel_name: name of the relation - :type rel_name: str - :param dbname: filename of persistent store - :type schema: str - """ - import sqlite3 - - records = _str2records(filename, rel_name) - connection = sqlite3.connect(dbname) - cur = connection.cursor() - if setup: - cur.execute( - """CREATE TABLE city_table - (City text, Country text, Population int)""" - ) - - table_name = "city_table" - for t in records: - cur.execute("insert into %s values (?,?,?)" % table_name, t) - if verbose: - print("inserting values into %s: " % table_name, t) - connection.commit() - if verbose: - print("Committing update to %s" % dbname) - cur.close() - - -def sql_query(dbname, query): - """ - Execute an SQL query over a database. - :param dbname: filename of persistent store - :type schema: str - :param query: SQL query - :type rel_name: str - """ - import sqlite3 - - try: - path = nltk.data.find(dbname) - connection = sqlite3.connect(str(path)) - cur = connection.cursor() - return cur.execute(query) - except (ValueError, sqlite3.OperationalError): - import warnings - - warnings.warn( - "Make sure the database file %s is installed and uncompressed." % dbname - ) - raise - - -def _str2records(filename, rel): - """ - Read a file into memory and convert each relation clause into a list. - """ - recs = [] - contents = nltk.data.load("corpora/chat80/%s" % filename, format="text") - for line in contents.splitlines(): - if line.startswith(rel): - line = re.sub(rel + r"\(", "", line) - line = re.sub(r"\)\.$", "", line) - record = line.split(",") - recs.append(record) - return recs - - -def unary_concept(label, subj, records): - """ - Make a unary concept out of the primary key in a record. - - A record is a list of entities in some relation, such as - ``['france', 'paris']``, where ``'france'`` is acting as the primary - key. 
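The clause-to-record step performed by _str2records above can be seen in isolation with a short, self-contained snippet (the sample clause comes from the module docstring):

import re

line = "city(athens,greece,1368)."
rel = "city"
# strip the 'city(' prefix and the trailing ').', then split the fields on commas
record = re.sub(r"\)\.$", "", re.sub(rel + r"\(", "", line)).split(",")
print(record)  # ['athens', 'greece', '1368']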
- - :param label: the preferred label for the concept - :type label: string - :param subj: position in the record of the subject of the predicate - :type subj: int - :param records: a list of records - :type records: list of lists - :return: ``Concept`` of arity 1 - :rtype: Concept - """ - c = Concept(label, arity=1, extension=set()) - for record in records: - c.augment(record[subj]) - return c - - -def binary_concept(label, closures, subj, obj, records): - """ - Make a binary concept out of the primary key and another field in a record. - - A record is a list of entities in some relation, such as - ``['france', 'paris']``, where ``'france'`` is acting as the primary - key, and ``'paris'`` stands in the ``'capital_of'`` relation to - ``'france'``. - - More generally, given a record such as ``['a', 'b', 'c']``, where - label is bound to ``'B'``, and ``obj`` bound to 1, the derived - binary concept will have label ``'B_of'``, and its extension will - be a set of pairs such as ``('a', 'b')``. - - - :param label: the base part of the preferred label for the concept - :type label: str - :param closures: closure properties for the extension of the concept - :type closures: list - :param subj: position in the record of the subject of the predicate - :type subj: int - :param obj: position in the record of the object of the predicate - :type obj: int - :param records: a list of records - :type records: list of lists - :return: ``Concept`` of arity 2 - :rtype: Concept - """ - if not label == "border" and not label == "contain": - label = label + "_of" - c = Concept(label, arity=2, closures=closures, extension=set()) - for record in records: - c.augment((record[subj], record[obj])) - # close the concept's extension according to the properties in closures - c.close() - return c - - -def process_bundle(rels): - """ - Given a list of relation metadata bundles, make a corresponding - dictionary of concepts, indexed by the relation name. - - :param rels: bundle of metadata needed for constructing a concept - :type rels: list(dict) - :return: a dictionary of concepts, indexed by the relation name. - :rtype: dict(str): Concept - """ - concepts = {} - for rel in rels: - rel_name = rel["rel_name"] - closures = rel["closures"] - schema = rel["schema"] - filename = rel["filename"] - - concept_list = clause2concepts(filename, rel_name, schema, closures) - for c in concept_list: - label = c.prefLabel - if label in concepts: - for data in c.extension: - concepts[label].augment(data) - concepts[label].close() - else: - concepts[label] = c - return concepts - - -def make_valuation(concepts, read=False, lexicon=False): - """ - Convert a list of ``Concept`` objects into a list of (label, extension) pairs; - optionally create a ``Valuation`` object. - - :param concepts: concepts - :type concepts: list(Concept) - :param read: if ``True``, ``(symbol, set)`` pairs are read into a ``Valuation`` - :type read: bool - :rtype: list or Valuation - """ - vals = [] - - for c in concepts: - vals.append((c.prefLabel, c.extension)) - if lexicon: - read = True - if read: - from nltk.sem import Valuation - - val = Valuation({}) - val.update(vals) - # add labels for individuals - val = label_indivs(val, lexicon=lexicon) - return val - else: - return vals - - -def val_dump(rels, db): - """ - Make a ``Valuation`` from a list of relation metadata bundles and dump to - persistent database. - - :param rels: bundle of metadata needed for constructing a concept - :type rels: list of dict - :param db: name of file to which data is written. 
- The suffix '.db' will be automatically appended. - :type db: str - """ - concepts = process_bundle(rels).values() - valuation = make_valuation(concepts, read=True) - db_out = shelve.open(db, "n") - - db_out.update(valuation) - - db_out.close() - - -def val_load(db): - """ - Load a ``Valuation`` from a persistent database. - - :param db: name of file from which data is read. - The suffix '.db' should be omitted from the name. - :type db: str - """ - dbname = db + ".db" - - if not os.access(dbname, os.R_OK): - sys.exit("Cannot read file: %s" % dbname) - else: - db_in = shelve.open(db) - from nltk.sem import Valuation - - val = Valuation(db_in) - # val.read(db_in.items()) - return val - - -# def alpha(str): -# """ -# Utility to filter out non-alphabetic constants. - -#:param str: candidate constant -#:type str: string -#:rtype: bool -# """ -# try: -# int(str) -# return False -# except ValueError: -## some unknown values in records are labeled '?' -# if not str == '?': -# return True - - -def label_indivs(valuation, lexicon=False): - """ - Assign individual constants to the individuals in the domain of a ``Valuation``. - - Given a valuation with an entry of the form ``{'rel': {'a': True}}``, - add a new entry ``{'a': 'a'}``. - - :type valuation: Valuation - :rtype: Valuation - """ - # collect all the individuals into a domain - domain = valuation.domain - # convert the domain into a sorted list of alphabetic terms - # use the same string as a label - pairs = [(e, e) for e in domain] - if lexicon: - lex = make_lex(domain) - with open("chat_pnames.cfg", "w") as outfile: - outfile.writelines(lex) - # read the pairs into the valuation - valuation.update(pairs) - return valuation - - -def make_lex(symbols): - """ - Create lexical CFG rules for each individual symbol. - - Given a valuation with an entry of the form ``{'zloty': 'zloty'}``, - create a lexical rule for the proper name 'Zloty'. - - :param symbols: a list of individual constants in the semantic representation - :type symbols: sequence -- set(str) - :rtype: list(str) - """ - lex = [] - header = """ -################################################################## -# Lexical rules automatically generated by running 'chat80.py -x'. -################################################################## - -""" - lex.append(header) - template = r"PropN[num=sg, sem=<\P.(P %s)>] -> '%s'\n" - - for s in symbols: - parts = s.split("_") - caps = [p.capitalize() for p in parts] - pname = "_".join(caps) - rule = template % (s, pname) - lex.append(rule) - return lex - - -########################################################################### -# Interface function to emulate other corpus readers -########################################################################### - - -def concepts(items=items): - """ - Build a list of concepts corresponding to the relation names in ``items``. - - :param items: names of the Chat-80 relations to extract - :type items: list(str) - :return: the ``Concept`` objects which are extracted from the relations - :rtype: list(Concept) - """ - if isinstance(items, str): - items = (items,) - - rels = [item_metadata[r] for r in items] - - concept_map = process_bundle(rels) - return concept_map.values() - - -########################################################################### - - -def main(): - import sys - from optparse import OptionParser - - description = """ -Extract data from the Chat-80 Prolog files and convert them into a -Valuation object for use in the NLTK semantics package. 
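To make the lexical-rule generation in make_lex concrete, a small standalone sketch of the same template logic (rule_for is a hypothetical helper, not part of the module):

def rule_for(symbol):
    # mirror make_lex: capitalise each underscore-separated part for the surface form
    pname = "_".join(p.capitalize() for p in symbol.split("_"))
    return r"PropN[num=sg, sem=<\P.(P %s)>] -> '%s'" % (symbol, pname)

print(rule_for("zloty"))         # PropN[num=sg, sem=<\P.(P zloty)>] -> 'Zloty'
print(rule_for("south_africa"))  # PropN[num=sg, sem=<\P.(P south_africa)>] -> 'South_Africa'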
- """ - - opts = OptionParser(description=description) - opts.set_defaults(verbose=True, lex=False, vocab=False) - opts.add_option( - "-s", "--store", dest="outdb", help="store a valuation in DB", metavar="DB" - ) - opts.add_option( - "-l", - "--load", - dest="indb", - help="load a stored valuation from DB", - metavar="DB", - ) - opts.add_option( - "-c", - "--concepts", - action="store_true", - help="print concepts instead of a valuation", - ) - opts.add_option( - "-r", - "--relation", - dest="label", - help="print concept with label REL (check possible labels with '-v' option)", - metavar="REL", - ) - opts.add_option( - "-q", - "--quiet", - action="store_false", - dest="verbose", - help="don't print out progress info", - ) - opts.add_option( - "-x", - "--lex", - action="store_true", - dest="lex", - help="write a file of lexical entries for country names, then exit", - ) - opts.add_option( - "-v", - "--vocab", - action="store_true", - dest="vocab", - help="print out the vocabulary of concept labels and their arity, then exit", - ) - - (options, args) = opts.parse_args() - if options.outdb and options.indb: - opts.error("Options --store and --load are mutually exclusive") - - if options.outdb: - # write the valuation to a persistent database - if options.verbose: - outdb = options.outdb + ".db" - print("Dumping a valuation to %s" % outdb) - val_dump(rels, options.outdb) - sys.exit(0) - else: - # try to read in a valuation from a database - if options.indb is not None: - dbname = options.indb + ".db" - if not os.access(dbname, os.R_OK): - sys.exit("Cannot read file: %s" % dbname) - else: - valuation = val_load(options.indb) - # we need to create the valuation from scratch - else: - # build some concepts - concept_map = process_bundle(rels) - concepts = concept_map.values() - # just print out the vocabulary - if options.vocab: - items = sorted((c.arity, c.prefLabel) for c in concepts) - for (arity, label) in items: - print(label, arity) - sys.exit(0) - # show all the concepts - if options.concepts: - for c in concepts: - print(c) - print() - if options.label: - print(concept_map[options.label]) - sys.exit(0) - else: - # turn the concepts into a Valuation - if options.lex: - if options.verbose: - print("Writing out lexical rules") - make_valuation(concepts, lexicon=True) - else: - valuation = make_valuation(concepts, read=True) - print(valuation) - - -def sql_demo(): - """ - Print out every row from the 'city.db' database. - """ - print() - print("Using SQL to extract rows from 'city.db' RDB.") - for row in sql_query("corpora/city_database/city.db", "SELECT * FROM city_table"): - print(row) - - -if __name__ == "__main__": - main() - sql_demo() diff --git a/pipeline/nltk/sem/cooper_storage.py b/pipeline/nltk/sem/cooper_storage.py deleted file mode 100644 index a41502187ed1dfbfae5bc21bdf7c29624cab1e0f..0000000000000000000000000000000000000000 --- a/pipeline/nltk/sem/cooper_storage.py +++ /dev/null @@ -1,124 +0,0 @@ -# Natural Language Toolkit: Cooper storage for Quantifier Ambiguity -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Ewan Klein -# URL: -# For license information, see LICENSE.TXT - -from nltk.parse import load_parser -from nltk.parse.featurechart import InstantiateVarsChart -from nltk.sem.logic import ApplicationExpression, LambdaExpression, Variable - - -class CooperStore: - """ - A container for handling quantifier ambiguity via Cooper storage. 
- """ - - def __init__(self, featstruct): - """ - :param featstruct: The value of the ``sem`` node in a tree from - ``parse_with_bindops()`` - :type featstruct: FeatStruct (with features ``core`` and ``store``) - - """ - self.featstruct = featstruct - self.readings = [] - try: - self.core = featstruct["CORE"] - self.store = featstruct["STORE"] - except KeyError: - print("%s is not a Cooper storage structure" % featstruct) - - def _permute(self, lst): - """ - :return: An iterator over the permutations of the input list - :type lst: list - :rtype: iter - """ - remove = lambda lst0, index: lst0[:index] + lst0[index + 1 :] - if lst: - for index, x in enumerate(lst): - for y in self._permute(remove(lst, index)): - yield (x,) + y - else: - yield () - - def s_retrieve(self, trace=False): - r""" - Carry out S-Retrieval of binding operators in store. If hack=True, - serialize the bindop and core as strings and reparse. Ugh. - - Each permutation of the store (i.e. list of binding operators) is - taken to be a possible scoping of quantifiers. We iterate through the - binding operators in each permutation, and successively apply them to - the current term, starting with the core semantic representation, - working from the inside out. - - Binding operators are of the form:: - - bo(\P.all x.(man(x) -> P(x)),z1) - """ - for perm, store_perm in enumerate(self._permute(self.store)): - if trace: - print("Permutation %s" % (perm + 1)) - term = self.core - for bindop in store_perm: - # we just want the arguments that are wrapped by the 'bo' predicate - quant, varex = tuple(bindop.args) - # use var to make an abstraction over the current term and then - # apply the quantifier to it - term = ApplicationExpression( - quant, LambdaExpression(varex.variable, term) - ) - if trace: - print(" ", term) - term = term.simplify() - self.readings.append(term) - - -def parse_with_bindops(sentence, grammar=None, trace=0): - """ - Use a grammar with Binding Operators to parse a sentence. - """ - if not grammar: - grammar = "grammars/book_grammars/storage.fcfg" - parser = load_parser(grammar, trace=trace, chart_class=InstantiateVarsChart) - # Parse the sentence. 
- tokens = sentence.split() - return list(parser.parse(tokens)) - - -def demo(): - from nltk.sem import cooper_storage as cs - - sentence = "every girl chases a dog" - # sentence = "a man gives a bone to every dog" - print() - print("Analysis of sentence '%s'" % sentence) - print("=" * 50) - trees = cs.parse_with_bindops(sentence, trace=0) - for tree in trees: - semrep = cs.CooperStore(tree.label()["SEM"]) - print() - print("Binding operators:") - print("-" * 15) - for s in semrep.store: - print(s) - print() - print("Core:") - print("-" * 15) - print(semrep.core) - print() - print("S-Retrieval:") - print("-" * 15) - semrep.s_retrieve(trace=True) - print("Readings:") - print("-" * 15) - - for i, reading in enumerate(semrep.readings): - print(f"{i + 1}: {reading}") - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/sem/drt.py b/pipeline/nltk/sem/drt.py deleted file mode 100644 index 53441d6617310683bab97bb7abd84f656ebc28af..0000000000000000000000000000000000000000 --- a/pipeline/nltk/sem/drt.py +++ /dev/null @@ -1,1460 +0,0 @@ -# Natural Language Toolkit: Discourse Representation Theory (DRT) -# -# Author: Dan Garrette -# -# Copyright (C) 2001-2023 NLTK Project -# URL: -# For license information, see LICENSE.TXT - -import operator -from functools import reduce -from itertools import chain - -from nltk.sem.logic import ( - APP, - AbstractVariableExpression, - AllExpression, - AndExpression, - ApplicationExpression, - BinaryExpression, - BooleanExpression, - ConstantExpression, - EqualityExpression, - EventVariableExpression, - ExistsExpression, - Expression, - FunctionVariableExpression, - ImpExpression, - IndividualVariableExpression, - LambdaExpression, - LogicParser, - NegatedExpression, - OrExpression, - Tokens, - Variable, - is_eventvar, - is_funcvar, - is_indvar, - unique_variable, -) - -# Import Tkinter-based modules if they are available -try: - from tkinter import Canvas, Tk - from tkinter.font import Font - - from nltk.util import in_idle - -except ImportError: - # No need to print a warning here, nltk.draw has already printed one. 
- pass - - -class DrtTokens(Tokens): - DRS = "DRS" - DRS_CONC = "+" - PRONOUN = "PRO" - OPEN_BRACKET = "[" - CLOSE_BRACKET = "]" - COLON = ":" - - PUNCT = [DRS_CONC, OPEN_BRACKET, CLOSE_BRACKET, COLON] - - SYMBOLS = Tokens.SYMBOLS + PUNCT - - TOKENS = Tokens.TOKENS + [DRS] + PUNCT - - -class DrtParser(LogicParser): - """A lambda calculus expression parser.""" - - def __init__(self): - LogicParser.__init__(self) - - self.operator_precedence = dict( - [(x, 1) for x in DrtTokens.LAMBDA_LIST] - + [(x, 2) for x in DrtTokens.NOT_LIST] - + [(APP, 3)] - + [(x, 4) for x in DrtTokens.EQ_LIST + Tokens.NEQ_LIST] - + [(DrtTokens.COLON, 5)] - + [(DrtTokens.DRS_CONC, 6)] - + [(x, 7) for x in DrtTokens.OR_LIST] - + [(x, 8) for x in DrtTokens.IMP_LIST] - + [(None, 9)] - ) - - def get_all_symbols(self): - """This method exists to be overridden""" - return DrtTokens.SYMBOLS - - def isvariable(self, tok): - return tok not in DrtTokens.TOKENS - - def handle(self, tok, context): - """This method is intended to be overridden for logics that - use different operators or expressions""" - if tok in DrtTokens.NOT_LIST: - return self.handle_negation(tok, context) - - elif tok in DrtTokens.LAMBDA_LIST: - return self.handle_lambda(tok, context) - - elif tok == DrtTokens.OPEN: - if self.inRange(0) and self.token(0) == DrtTokens.OPEN_BRACKET: - return self.handle_DRS(tok, context) - else: - return self.handle_open(tok, context) - - elif tok.upper() == DrtTokens.DRS: - self.assertNextToken(DrtTokens.OPEN) - return self.handle_DRS(tok, context) - - elif self.isvariable(tok): - if self.inRange(0) and self.token(0) == DrtTokens.COLON: - return self.handle_prop(tok, context) - else: - return self.handle_variable(tok, context) - - def make_NegatedExpression(self, expression): - return DrtNegatedExpression(expression) - - def handle_DRS(self, tok, context): - # a DRS - refs = self.handle_refs() - if ( - self.inRange(0) and self.token(0) == DrtTokens.COMMA - ): # if there is a comma (it's optional) - self.token() # swallow the comma - conds = self.handle_conds(context) - self.assertNextToken(DrtTokens.CLOSE) - return DRS(refs, conds, None) - - def handle_refs(self): - self.assertNextToken(DrtTokens.OPEN_BRACKET) - refs = [] - while self.inRange(0) and self.token(0) != DrtTokens.CLOSE_BRACKET: - # Support expressions like: DRS([x y],C) == DRS([x,y],C) - if refs and self.token(0) == DrtTokens.COMMA: - self.token() # swallow the comma - refs.append(self.get_next_token_variable("quantified")) - self.assertNextToken(DrtTokens.CLOSE_BRACKET) - return refs - - def handle_conds(self, context): - self.assertNextToken(DrtTokens.OPEN_BRACKET) - conds = [] - while self.inRange(0) and self.token(0) != DrtTokens.CLOSE_BRACKET: - # Support expressions like: DRS([x y],C) == DRS([x, y],C) - if conds and self.token(0) == DrtTokens.COMMA: - self.token() # swallow the comma - conds.append(self.process_next_expression(context)) - self.assertNextToken(DrtTokens.CLOSE_BRACKET) - return conds - - def handle_prop(self, tok, context): - variable = self.make_VariableExpression(tok) - self.assertNextToken(":") - drs = self.process_next_expression(DrtTokens.COLON) - return DrtProposition(variable, drs) - - def make_EqualityExpression(self, first, second): - """This method serves as a hook for other logic parsers that - have different equality expression classes""" - return DrtEqualityExpression(first, second) - - def get_BooleanExpression_factory(self, tok): - """This method serves as a hook for other logic parsers that - have different boolean operators""" 
- if tok == DrtTokens.DRS_CONC: - return lambda first, second: DrtConcatenation(first, second, None) - elif tok in DrtTokens.OR_LIST: - return DrtOrExpression - elif tok in DrtTokens.IMP_LIST: - - def make_imp_expression(first, second): - if isinstance(first, DRS): - return DRS(first.refs, first.conds, second) - if isinstance(first, DrtConcatenation): - return DrtConcatenation(first.first, first.second, second) - raise Exception("Antecedent of implication must be a DRS") - - return make_imp_expression - else: - return None - - def make_BooleanExpression(self, factory, first, second): - return factory(first, second) - - def make_ApplicationExpression(self, function, argument): - return DrtApplicationExpression(function, argument) - - def make_VariableExpression(self, name): - return DrtVariableExpression(Variable(name)) - - def make_LambdaExpression(self, variables, term): - return DrtLambdaExpression(variables, term) - - -class DrtExpression: - """ - This is the base abstract DRT Expression from which every DRT - Expression extends. - """ - - _drt_parser = DrtParser() - - @classmethod - def fromstring(cls, s): - return cls._drt_parser.parse(s) - - def applyto(self, other): - return DrtApplicationExpression(self, other) - - def __neg__(self): - return DrtNegatedExpression(self) - - def __and__(self, other): - return NotImplemented - - def __or__(self, other): - assert isinstance(other, DrtExpression) - return DrtOrExpression(self, other) - - def __gt__(self, other): - assert isinstance(other, DrtExpression) - if isinstance(self, DRS): - return DRS(self.refs, self.conds, other) - if isinstance(self, DrtConcatenation): - return DrtConcatenation(self.first, self.second, other) - raise Exception("Antecedent of implication must be a DRS") - - def equiv(self, other, prover=None): - """ - Check for logical equivalence. - Pass the expression (self <-> other) to the theorem prover. - If the prover says it is valid, then the self and other are equal. - - :param other: an ``DrtExpression`` to check equality against - :param prover: a ``nltk.inference.api.Prover`` - """ - assert isinstance(other, DrtExpression) - - f1 = self.simplify().fol() - f2 = other.simplify().fol() - return f1.equiv(f2, prover) - - @property - def type(self): - raise AttributeError( - "'%s' object has no attribute 'type'" % self.__class__.__name__ - ) - - def typecheck(self, signature=None): - raise NotImplementedError() - - def __add__(self, other): - return DrtConcatenation(self, other, None) - - def get_refs(self, recursive=False): - """ - Return the set of discourse referents in this DRS. - :param recursive: bool Also find discourse referents in subterms? 
- :return: list of ``Variable`` objects - """ - raise NotImplementedError() - - def is_pronoun_function(self): - """Is self of the form "PRO(x)"?""" - return ( - isinstance(self, DrtApplicationExpression) - and isinstance(self.function, DrtAbstractVariableExpression) - and self.function.variable.name == DrtTokens.PRONOUN - and isinstance(self.argument, DrtIndividualVariableExpression) - ) - - def make_EqualityExpression(self, first, second): - return DrtEqualityExpression(first, second) - - def make_VariableExpression(self, variable): - return DrtVariableExpression(variable) - - def resolve_anaphora(self): - return resolve_anaphora(self) - - def eliminate_equality(self): - return self.visit_structured(lambda e: e.eliminate_equality(), self.__class__) - - def pretty_format(self): - """ - Draw the DRS - :return: the pretty print string - """ - return "\n".join(self._pretty()) - - def pretty_print(self): - print(self.pretty_format()) - - def draw(self): - DrsDrawer(self).draw() - - -class DRS(DrtExpression, Expression): - """A Discourse Representation Structure.""" - - def __init__(self, refs, conds, consequent=None): - """ - :param refs: list of ``DrtIndividualVariableExpression`` for the - discourse referents - :param conds: list of ``Expression`` for the conditions - """ - self.refs = refs - self.conds = conds - self.consequent = consequent - - def replace(self, variable, expression, replace_bound=False, alpha_convert=True): - """Replace all instances of variable v with expression E in self, - where v is free in self.""" - if variable in self.refs: - # if a bound variable is the thing being replaced - if not replace_bound: - return self - else: - i = self.refs.index(variable) - if self.consequent: - consequent = self.consequent.replace( - variable, expression, True, alpha_convert - ) - else: - consequent = None - return DRS( - self.refs[:i] + [expression.variable] + self.refs[i + 1 :], - [ - cond.replace(variable, expression, True, alpha_convert) - for cond in self.conds - ], - consequent, - ) - else: - if alpha_convert: - # any bound variable that appears in the expression must - # be alpha converted to avoid a conflict - for ref in set(self.refs) & expression.free(): - newvar = unique_variable(ref) - newvarex = DrtVariableExpression(newvar) - i = self.refs.index(ref) - if self.consequent: - consequent = self.consequent.replace( - ref, newvarex, True, alpha_convert - ) - else: - consequent = None - self = DRS( - self.refs[:i] + [newvar] + self.refs[i + 1 :], - [ - cond.replace(ref, newvarex, True, alpha_convert) - for cond in self.conds - ], - consequent, - ) - - # replace in the conditions - if self.consequent: - consequent = self.consequent.replace( - variable, expression, replace_bound, alpha_convert - ) - else: - consequent = None - return DRS( - self.refs, - [ - cond.replace(variable, expression, replace_bound, alpha_convert) - for cond in self.conds - ], - consequent, - ) - - def free(self): - """:see: Expression.free()""" - conds_free = reduce(operator.or_, [c.free() for c in self.conds], set()) - if self.consequent: - conds_free.update(self.consequent.free()) - return conds_free - set(self.refs) - - def get_refs(self, recursive=False): - """:see: AbstractExpression.get_refs()""" - if recursive: - conds_refs = self.refs + list( - chain.from_iterable(c.get_refs(True) for c in self.conds) - ) - if self.consequent: - conds_refs.extend(self.consequent.get_refs(True)) - return conds_refs - else: - return self.refs - - def visit(self, function, combinator): - """:see: 
Expression.visit()""" - parts = list(map(function, self.conds)) - if self.consequent: - parts.append(function(self.consequent)) - return combinator(parts) - - def visit_structured(self, function, combinator): - """:see: Expression.visit_structured()""" - consequent = function(self.consequent) if self.consequent else None - return combinator(self.refs, list(map(function, self.conds)), consequent) - - def eliminate_equality(self): - drs = self - i = 0 - while i < len(drs.conds): - cond = drs.conds[i] - if ( - isinstance(cond, EqualityExpression) - and isinstance(cond.first, AbstractVariableExpression) - and isinstance(cond.second, AbstractVariableExpression) - ): - drs = DRS( - list(set(drs.refs) - {cond.second.variable}), - drs.conds[:i] + drs.conds[i + 1 :], - drs.consequent, - ) - if cond.second.variable != cond.first.variable: - drs = drs.replace(cond.second.variable, cond.first, False, False) - i = 0 - i -= 1 - i += 1 - - conds = [] - for cond in drs.conds: - new_cond = cond.eliminate_equality() - new_cond_simp = new_cond.simplify() - if ( - not isinstance(new_cond_simp, DRS) - or new_cond_simp.refs - or new_cond_simp.conds - or new_cond_simp.consequent - ): - conds.append(new_cond) - - consequent = drs.consequent.eliminate_equality() if drs.consequent else None - return DRS(drs.refs, conds, consequent) - - def fol(self): - if self.consequent: - accum = None - if self.conds: - accum = reduce(AndExpression, [c.fol() for c in self.conds]) - - if accum: - accum = ImpExpression(accum, self.consequent.fol()) - else: - accum = self.consequent.fol() - - for ref in self.refs[::-1]: - accum = AllExpression(ref, accum) - - return accum - - else: - if not self.conds: - raise Exception("Cannot convert DRS with no conditions to FOL.") - accum = reduce(AndExpression, [c.fol() for c in self.conds]) - for ref in map(Variable, self._order_ref_strings(self.refs)[::-1]): - accum = ExistsExpression(ref, accum) - return accum - - def _pretty(self): - refs_line = " ".join(self._order_ref_strings(self.refs)) - - cond_lines = [ - cond - for cond_line in [ - filter(lambda s: s.strip(), cond._pretty()) for cond in self.conds - ] - for cond in cond_line - ] - length = max([len(refs_line)] + list(map(len, cond_lines))) - drs = ( - [ - " _" + "_" * length + "_ ", - "| " + refs_line.ljust(length) + " |", - "|-" + "-" * length + "-|", - ] - + ["| " + line.ljust(length) + " |" for line in cond_lines] - + ["|_" + "_" * length + "_|"] - ) - if self.consequent: - return DrtBinaryExpression._assemble_pretty( - drs, DrtTokens.IMP, self.consequent._pretty() - ) - return drs - - def _order_ref_strings(self, refs): - strings = ["%s" % ref for ref in refs] - ind_vars = [] - func_vars = [] - event_vars = [] - other_vars = [] - for s in strings: - if is_indvar(s): - ind_vars.append(s) - elif is_funcvar(s): - func_vars.append(s) - elif is_eventvar(s): - event_vars.append(s) - else: - other_vars.append(s) - return ( - sorted(other_vars) - + sorted(event_vars, key=lambda v: int([v[2:], -1][len(v[2:]) == 0])) - + sorted(func_vars, key=lambda v: (v[0], int([v[1:], -1][len(v[1:]) == 0]))) - + sorted(ind_vars, key=lambda v: (v[0], int([v[1:], -1][len(v[1:]) == 0]))) - ) - - def __eq__(self, other): - r"""Defines equality modulo alphabetic variance. 
- If we are comparing \x.M and \y.N, then check equality of M and N[x/y].""" - if isinstance(other, DRS): - if len(self.refs) == len(other.refs): - converted_other = other - for (r1, r2) in zip(self.refs, converted_other.refs): - varex = self.make_VariableExpression(r1) - converted_other = converted_other.replace(r2, varex, True) - if self.consequent == converted_other.consequent and len( - self.conds - ) == len(converted_other.conds): - for c1, c2 in zip(self.conds, converted_other.conds): - if not (c1 == c2): - return False - return True - return False - - def __ne__(self, other): - return not self == other - - __hash__ = Expression.__hash__ - - def __str__(self): - drs = "([{}],[{}])".format( - ",".join(self._order_ref_strings(self.refs)), - ", ".join("%s" % cond for cond in self.conds), - ) # map(str, self.conds))) - if self.consequent: - return ( - DrtTokens.OPEN - + drs - + " " - + DrtTokens.IMP - + " " - + "%s" % self.consequent - + DrtTokens.CLOSE - ) - return drs - - -def DrtVariableExpression(variable): - """ - This is a factory method that instantiates and returns a subtype of - ``DrtAbstractVariableExpression`` appropriate for the given variable. - """ - if is_indvar(variable.name): - return DrtIndividualVariableExpression(variable) - elif is_funcvar(variable.name): - return DrtFunctionVariableExpression(variable) - elif is_eventvar(variable.name): - return DrtEventVariableExpression(variable) - else: - return DrtConstantExpression(variable) - - -class DrtAbstractVariableExpression(DrtExpression, AbstractVariableExpression): - def fol(self): - return self - - def get_refs(self, recursive=False): - """:see: AbstractExpression.get_refs()""" - return [] - - def _pretty(self): - s = "%s" % self - blank = " " * len(s) - return [blank, blank, s, blank] - - def eliminate_equality(self): - return self - - -class DrtIndividualVariableExpression( - DrtAbstractVariableExpression, IndividualVariableExpression -): - pass - - -class DrtFunctionVariableExpression( - DrtAbstractVariableExpression, FunctionVariableExpression -): - pass - - -class DrtEventVariableExpression( - DrtIndividualVariableExpression, EventVariableExpression -): - pass - - -class DrtConstantExpression(DrtAbstractVariableExpression, ConstantExpression): - pass - - -class DrtProposition(DrtExpression, Expression): - def __init__(self, variable, drs): - self.variable = variable - self.drs = drs - - def replace(self, variable, expression, replace_bound=False, alpha_convert=True): - if self.variable == variable: - assert isinstance( - expression, DrtAbstractVariableExpression - ), "Can only replace a proposition label with a variable" - return DrtProposition( - expression.variable, - self.drs.replace(variable, expression, replace_bound, alpha_convert), - ) - else: - return DrtProposition( - self.variable, - self.drs.replace(variable, expression, replace_bound, alpha_convert), - ) - - def eliminate_equality(self): - return DrtProposition(self.variable, self.drs.eliminate_equality()) - - def get_refs(self, recursive=False): - return self.drs.get_refs(True) if recursive else [] - - def __eq__(self, other): - return ( - self.__class__ == other.__class__ - and self.variable == other.variable - and self.drs == other.drs - ) - - def __ne__(self, other): - return not self == other - - __hash__ = Expression.__hash__ - - def fol(self): - return self.drs.fol() - - def _pretty(self): - drs_s = self.drs._pretty() - blank = " " * len("%s" % self.variable) - return ( - [blank + " " + line for line in drs_s[:1]] - + ["%s" % self.variable + 
":" + line for line in drs_s[1:2]] - + [blank + " " + line for line in drs_s[2:]] - ) - - def visit(self, function, combinator): - """:see: Expression.visit()""" - return combinator([function(self.drs)]) - - def visit_structured(self, function, combinator): - """:see: Expression.visit_structured()""" - return combinator(self.variable, function(self.drs)) - - def __str__(self): - return f"prop({self.variable}, {self.drs})" - - -class DrtNegatedExpression(DrtExpression, NegatedExpression): - def fol(self): - return NegatedExpression(self.term.fol()) - - def get_refs(self, recursive=False): - """:see: AbstractExpression.get_refs()""" - return self.term.get_refs(recursive) - - def _pretty(self): - term_lines = self.term._pretty() - return ( - [" " + line for line in term_lines[:2]] - + ["__ " + line for line in term_lines[2:3]] - + [" | " + line for line in term_lines[3:4]] - + [" " + line for line in term_lines[4:]] - ) - - -class DrtLambdaExpression(DrtExpression, LambdaExpression): - def alpha_convert(self, newvar): - """Rename all occurrences of the variable introduced by this variable - binder in the expression to ``newvar``. - :param newvar: ``Variable``, for the new variable - """ - return self.__class__( - newvar, - self.term.replace(self.variable, DrtVariableExpression(newvar), True), - ) - - def fol(self): - return LambdaExpression(self.variable, self.term.fol()) - - def _pretty(self): - variables = [self.variable] - term = self.term - while term.__class__ == self.__class__: - variables.append(term.variable) - term = term.term - var_string = " ".join("%s" % v for v in variables) + DrtTokens.DOT - term_lines = term._pretty() - blank = " " * len(var_string) - return ( - [" " + blank + line for line in term_lines[:1]] - + [r" \ " + blank + line for line in term_lines[1:2]] - + [r" /\ " + var_string + line for line in term_lines[2:3]] - + [" " + blank + line for line in term_lines[3:]] - ) - - def get_refs(self, recursive=False): - """:see: AbstractExpression.get_refs()""" - return ( - [self.variable] + self.term.get_refs(True) if recursive else [self.variable] - ) - - -class DrtBinaryExpression(DrtExpression, BinaryExpression): - def get_refs(self, recursive=False): - """:see: AbstractExpression.get_refs()""" - return ( - self.first.get_refs(True) + self.second.get_refs(True) if recursive else [] - ) - - def _pretty(self): - return DrtBinaryExpression._assemble_pretty( - self._pretty_subex(self.first), - self.getOp(), - self._pretty_subex(self.second), - ) - - @staticmethod - def _assemble_pretty(first_lines, op, second_lines): - max_lines = max(len(first_lines), len(second_lines)) - first_lines = _pad_vertically(first_lines, max_lines) - second_lines = _pad_vertically(second_lines, max_lines) - blank = " " * len(op) - first_second_lines = list(zip(first_lines, second_lines)) - return ( - [ - " " + first_line + " " + blank + " " + second_line + " " - for first_line, second_line in first_second_lines[:2] - ] - + [ - "(" + first_line + " " + op + " " + second_line + ")" - for first_line, second_line in first_second_lines[2:3] - ] - + [ - " " + first_line + " " + blank + " " + second_line + " " - for first_line, second_line in first_second_lines[3:] - ] - ) - - def _pretty_subex(self, subex): - return subex._pretty() - - -class DrtBooleanExpression(DrtBinaryExpression, BooleanExpression): - pass - - -class DrtOrExpression(DrtBooleanExpression, OrExpression): - def fol(self): - return OrExpression(self.first.fol(), self.second.fol()) - - def _pretty_subex(self, subex): - if 
isinstance(subex, DrtOrExpression): - return [line[1:-1] for line in subex._pretty()] - return DrtBooleanExpression._pretty_subex(self, subex) - - -class DrtEqualityExpression(DrtBinaryExpression, EqualityExpression): - def fol(self): - return EqualityExpression(self.first.fol(), self.second.fol()) - - -class DrtConcatenation(DrtBooleanExpression): - """DRS of the form '(DRS + DRS)'""" - - def __init__(self, first, second, consequent=None): - DrtBooleanExpression.__init__(self, first, second) - self.consequent = consequent - - def replace(self, variable, expression, replace_bound=False, alpha_convert=True): - """Replace all instances of variable v with expression E in self, - where v is free in self.""" - first = self.first - second = self.second - consequent = self.consequent - - # If variable is bound - if variable in self.get_refs(): - if replace_bound: - first = first.replace( - variable, expression, replace_bound, alpha_convert - ) - second = second.replace( - variable, expression, replace_bound, alpha_convert - ) - if consequent: - consequent = consequent.replace( - variable, expression, replace_bound, alpha_convert - ) - else: - if alpha_convert: - # alpha convert every ref that is free in 'expression' - for ref in set(self.get_refs(True)) & expression.free(): - v = DrtVariableExpression(unique_variable(ref)) - first = first.replace(ref, v, True, alpha_convert) - second = second.replace(ref, v, True, alpha_convert) - if consequent: - consequent = consequent.replace(ref, v, True, alpha_convert) - - first = first.replace(variable, expression, replace_bound, alpha_convert) - second = second.replace(variable, expression, replace_bound, alpha_convert) - if consequent: - consequent = consequent.replace( - variable, expression, replace_bound, alpha_convert - ) - - return self.__class__(first, second, consequent) - - def eliminate_equality(self): - # TODO: at some point. for now, simplify. - drs = self.simplify() - assert not isinstance(drs, DrtConcatenation) - return drs.eliminate_equality() - - def simplify(self): - first = self.first.simplify() - second = self.second.simplify() - consequent = self.consequent.simplify() if self.consequent else None - - if isinstance(first, DRS) and isinstance(second, DRS): - # For any ref that is in both 'first' and 'second' - for ref in set(first.get_refs(True)) & set(second.get_refs(True)): - # alpha convert the ref in 'second' to prevent collision - newvar = DrtVariableExpression(unique_variable(ref)) - second = second.replace(ref, newvar, True) - - return DRS(first.refs + second.refs, first.conds + second.conds, consequent) - else: - return self.__class__(first, second, consequent) - - def get_refs(self, recursive=False): - """:see: AbstractExpression.get_refs()""" - refs = self.first.get_refs(recursive) + self.second.get_refs(recursive) - if self.consequent and recursive: - refs.extend(self.consequent.get_refs(True)) - return refs - - def getOp(self): - return DrtTokens.DRS_CONC - - def __eq__(self, other): - r"""Defines equality modulo alphabetic variance. 
- If we are comparing \x.M and \y.N, then check equality of M and N[x/y].""" - if isinstance(other, DrtConcatenation): - self_refs = self.get_refs() - other_refs = other.get_refs() - if len(self_refs) == len(other_refs): - converted_other = other - for (r1, r2) in zip(self_refs, other_refs): - varex = self.make_VariableExpression(r1) - converted_other = converted_other.replace(r2, varex, True) - return ( - self.first == converted_other.first - and self.second == converted_other.second - and self.consequent == converted_other.consequent - ) - return False - - def __ne__(self, other): - return not self == other - - __hash__ = DrtBooleanExpression.__hash__ - - def fol(self): - e = AndExpression(self.first.fol(), self.second.fol()) - if self.consequent: - e = ImpExpression(e, self.consequent.fol()) - return e - - def _pretty(self): - drs = DrtBinaryExpression._assemble_pretty( - self._pretty_subex(self.first), - self.getOp(), - self._pretty_subex(self.second), - ) - if self.consequent: - drs = DrtBinaryExpression._assemble_pretty( - drs, DrtTokens.IMP, self.consequent._pretty() - ) - return drs - - def _pretty_subex(self, subex): - if isinstance(subex, DrtConcatenation): - return [line[1:-1] for line in subex._pretty()] - return DrtBooleanExpression._pretty_subex(self, subex) - - def visit(self, function, combinator): - """:see: Expression.visit()""" - if self.consequent: - return combinator( - [function(self.first), function(self.second), function(self.consequent)] - ) - else: - return combinator([function(self.first), function(self.second)]) - - def __str__(self): - first = self._str_subex(self.first) - second = self._str_subex(self.second) - drs = Tokens.OPEN + first + " " + self.getOp() + " " + second + Tokens.CLOSE - if self.consequent: - return ( - DrtTokens.OPEN - + drs - + " " - + DrtTokens.IMP - + " " - + "%s" % self.consequent - + DrtTokens.CLOSE - ) - return drs - - def _str_subex(self, subex): - s = "%s" % subex - if isinstance(subex, DrtConcatenation) and subex.consequent is None: - return s[1:-1] - return s - - -class DrtApplicationExpression(DrtExpression, ApplicationExpression): - def fol(self): - return ApplicationExpression(self.function.fol(), self.argument.fol()) - - def get_refs(self, recursive=False): - """:see: AbstractExpression.get_refs()""" - return ( - self.function.get_refs(True) + self.argument.get_refs(True) - if recursive - else [] - ) - - def _pretty(self): - function, args = self.uncurry() - function_lines = function._pretty() - args_lines = [arg._pretty() for arg in args] - max_lines = max(map(len, [function_lines] + args_lines)) - function_lines = _pad_vertically(function_lines, max_lines) - args_lines = [_pad_vertically(arg_lines, max_lines) for arg_lines in args_lines] - func_args_lines = list(zip(function_lines, list(zip(*args_lines)))) - return ( - [ - func_line + " " + " ".join(args_line) + " " - for func_line, args_line in func_args_lines[:2] - ] - + [ - func_line + "(" + ",".join(args_line) + ")" - for func_line, args_line in func_args_lines[2:3] - ] - + [ - func_line + " " + " ".join(args_line) + " " - for func_line, args_line in func_args_lines[3:] - ] - ) - - -def _pad_vertically(lines, max_lines): - pad_line = [" " * len(lines[0])] - return lines + pad_line * (max_lines - len(lines)) - - -class PossibleAntecedents(list, DrtExpression, Expression): - def free(self): - """Set of free variables.""" - return set(self) - - def replace(self, variable, expression, replace_bound=False, alpha_convert=True): - """Replace all instances of variable v with 
expression E in self, - where v is free in self.""" - result = PossibleAntecedents() - for item in self: - if item == variable: - self.append(expression) - else: - self.append(item) - return result - - def _pretty(self): - s = "%s" % self - blank = " " * len(s) - return [blank, blank, s] - - def __str__(self): - return "[" + ",".join("%s" % it for it in self) + "]" - - -class AnaphoraResolutionException(Exception): - pass - - -def resolve_anaphora(expression, trail=[]): - if isinstance(expression, ApplicationExpression): - if expression.is_pronoun_function(): - possible_antecedents = PossibleAntecedents() - for ancestor in trail: - for ref in ancestor.get_refs(): - refex = expression.make_VariableExpression(ref) - - # ========================================================== - # Don't allow resolution to itself or other types - # ========================================================== - if refex.__class__ == expression.argument.__class__ and not ( - refex == expression.argument - ): - possible_antecedents.append(refex) - - if len(possible_antecedents) == 1: - resolution = possible_antecedents[0] - else: - resolution = possible_antecedents - return expression.make_EqualityExpression(expression.argument, resolution) - else: - r_function = resolve_anaphora(expression.function, trail + [expression]) - r_argument = resolve_anaphora(expression.argument, trail + [expression]) - return expression.__class__(r_function, r_argument) - - elif isinstance(expression, DRS): - r_conds = [] - for cond in expression.conds: - r_cond = resolve_anaphora(cond, trail + [expression]) - - # if the condition is of the form '(x = [])' then raise exception - if isinstance(r_cond, EqualityExpression): - if isinstance(r_cond.first, PossibleAntecedents): - # Reverse the order so that the variable is on the left - temp = r_cond.first - r_cond.first = r_cond.second - r_cond.second = temp - if isinstance(r_cond.second, PossibleAntecedents): - if not r_cond.second: - raise AnaphoraResolutionException( - "Variable '%s' does not " - "resolve to anything." 
% r_cond.first - ) - - r_conds.append(r_cond) - if expression.consequent: - consequent = resolve_anaphora(expression.consequent, trail + [expression]) - else: - consequent = None - return expression.__class__(expression.refs, r_conds, consequent) - - elif isinstance(expression, AbstractVariableExpression): - return expression - - elif isinstance(expression, NegatedExpression): - return expression.__class__( - resolve_anaphora(expression.term, trail + [expression]) - ) - - elif isinstance(expression, DrtConcatenation): - if expression.consequent: - consequent = resolve_anaphora(expression.consequent, trail + [expression]) - else: - consequent = None - return expression.__class__( - resolve_anaphora(expression.first, trail + [expression]), - resolve_anaphora(expression.second, trail + [expression]), - consequent, - ) - - elif isinstance(expression, BinaryExpression): - return expression.__class__( - resolve_anaphora(expression.first, trail + [expression]), - resolve_anaphora(expression.second, trail + [expression]), - ) - - elif isinstance(expression, LambdaExpression): - return expression.__class__( - expression.variable, resolve_anaphora(expression.term, trail + [expression]) - ) - - -class DrsDrawer: - BUFFER = 3 # Space between elements - TOPSPACE = 10 # Space above whole DRS - OUTERSPACE = 6 # Space to the left, right, and bottom of the while DRS - - def __init__(self, drs, size_canvas=True, canvas=None): - """ - :param drs: ``DrtExpression``, The DRS to be drawn - :param size_canvas: bool, True if the canvas size should be the exact size of the DRS - :param canvas: ``Canvas`` The canvas on which to draw the DRS. If none is given, create a new canvas. - """ - master = None - if not canvas: - master = Tk() - master.title("DRT") - - font = Font(family="helvetica", size=12) - - if size_canvas: - canvas = Canvas(master, width=0, height=0) - canvas.font = font - self.canvas = canvas - (right, bottom) = self._visit(drs, self.OUTERSPACE, self.TOPSPACE) - - width = max(right + self.OUTERSPACE, 100) - height = bottom + self.OUTERSPACE - canvas = Canvas(master, width=width, height=height) # , bg='white') - else: - canvas = Canvas(master, width=300, height=300) - - canvas.pack() - canvas.font = font - - self.canvas = canvas - self.drs = drs - self.master = master - - def _get_text_height(self): - """Get the height of a line of text""" - return self.canvas.font.metrics("linespace") - - def draw(self, x=OUTERSPACE, y=TOPSPACE): - """Draw the DRS""" - self._handle(self.drs, self._draw_command, x, y) - - if self.master and not in_idle(): - self.master.mainloop() - else: - return self._visit(self.drs, x, y) - - def _visit(self, expression, x, y): - """ - Return the bottom-rightmost point without actually drawing the item - - :param expression: the item to visit - :param x: the top of the current drawing area - :param y: the left side of the current drawing area - :return: the bottom-rightmost point - """ - return self._handle(expression, self._visit_command, x, y) - - def _draw_command(self, item, x, y): - """ - Draw the given item at the given location - - :param item: the item to draw - :param x: the top of the current drawing area - :param y: the left side of the current drawing area - :return: the bottom-rightmost point - """ - if isinstance(item, str): - self.canvas.create_text(x, y, anchor="nw", font=self.canvas.font, text=item) - elif isinstance(item, tuple): - # item is the lower-right of a box - (right, bottom) = item - self.canvas.create_rectangle(x, y, right, bottom) - horiz_line_y = ( - y + 
self._get_text_height() + (self.BUFFER * 2) - ) # the line separating refs from conds - self.canvas.create_line(x, horiz_line_y, right, horiz_line_y) - - return self._visit_command(item, x, y) - - def _visit_command(self, item, x, y): - """ - Return the bottom-rightmost point without actually drawing the item - - :param item: the item to visit - :param x: the top of the current drawing area - :param y: the left side of the current drawing area - :return: the bottom-rightmost point - """ - if isinstance(item, str): - return (x + self.canvas.font.measure(item), y + self._get_text_height()) - elif isinstance(item, tuple): - return item - - def _handle(self, expression, command, x=0, y=0): - """ - :param expression: the expression to handle - :param command: the function to apply, either _draw_command or _visit_command - :param x: the top of the current drawing area - :param y: the left side of the current drawing area - :return: the bottom-rightmost point - """ - if command == self._visit_command: - # if we don't need to draw the item, then we can use the cached values - try: - # attempt to retrieve cached values - right = expression._drawing_width + x - bottom = expression._drawing_height + y - return (right, bottom) - except AttributeError: - # the values have not been cached yet, so compute them - pass - - if isinstance(expression, DrtAbstractVariableExpression): - factory = self._handle_VariableExpression - elif isinstance(expression, DRS): - factory = self._handle_DRS - elif isinstance(expression, DrtNegatedExpression): - factory = self._handle_NegatedExpression - elif isinstance(expression, DrtLambdaExpression): - factory = self._handle_LambdaExpression - elif isinstance(expression, BinaryExpression): - factory = self._handle_BinaryExpression - elif isinstance(expression, DrtApplicationExpression): - factory = self._handle_ApplicationExpression - elif isinstance(expression, PossibleAntecedents): - factory = self._handle_VariableExpression - elif isinstance(expression, DrtProposition): - factory = self._handle_DrtProposition - else: - raise Exception(expression.__class__.__name__) - - (right, bottom) = factory(expression, command, x, y) - - # cache the values - expression._drawing_width = right - x - expression._drawing_height = bottom - y - - return (right, bottom) - - def _handle_VariableExpression(self, expression, command, x, y): - return command("%s" % expression, x, y) - - def _handle_NegatedExpression(self, expression, command, x, y): - # Find the width of the negation symbol - right = self._visit_command(DrtTokens.NOT, x, y)[0] - - # Handle term - (right, bottom) = self._handle(expression.term, command, right, y) - - # Handle variables now that we know the y-coordinate - command( - DrtTokens.NOT, - x, - self._get_centered_top(y, bottom - y, self._get_text_height()), - ) - - return (right, bottom) - - def _handle_DRS(self, expression, command, x, y): - left = x + self.BUFFER # indent the left side - bottom = y + self.BUFFER # indent the top - - # Handle Discourse Referents - if expression.refs: - refs = " ".join("%s" % r for r in expression.refs) - else: - refs = " " - (max_right, bottom) = command(refs, left, bottom) - bottom += self.BUFFER * 2 - - # Handle Conditions - if expression.conds: - for cond in expression.conds: - (right, bottom) = self._handle(cond, command, left, bottom) - max_right = max(max_right, right) - bottom += self.BUFFER - else: - bottom += self._get_text_height() + self.BUFFER - - # Handle Box - max_right += self.BUFFER - return command((max_right, bottom), 
x, y) - - def _handle_ApplicationExpression(self, expression, command, x, y): - function, args = expression.uncurry() - if not isinstance(function, DrtAbstractVariableExpression): - # It's not a predicate expression ("P(x,y)"), so leave arguments curried - function = expression.function - args = [expression.argument] - - # Get the max bottom of any element on the line - function_bottom = self._visit(function, x, y)[1] - max_bottom = max( - [function_bottom] + [self._visit(arg, x, y)[1] for arg in args] - ) - - line_height = max_bottom - y - - # Handle 'function' - function_drawing_top = self._get_centered_top( - y, line_height, function._drawing_height - ) - right = self._handle(function, command, x, function_drawing_top)[0] - - # Handle open paren - centred_string_top = self._get_centered_top( - y, line_height, self._get_text_height() - ) - right = command(DrtTokens.OPEN, right, centred_string_top)[0] - - # Handle each arg - for (i, arg) in enumerate(args): - arg_drawing_top = self._get_centered_top( - y, line_height, arg._drawing_height - ) - right = self._handle(arg, command, right, arg_drawing_top)[0] - - if i + 1 < len(args): - # since it's not the last arg, add a comma - right = command(DrtTokens.COMMA + " ", right, centred_string_top)[0] - - # Handle close paren - right = command(DrtTokens.CLOSE, right, centred_string_top)[0] - - return (right, max_bottom) - - def _handle_LambdaExpression(self, expression, command, x, y): - # Find the width of the lambda symbol and abstracted variables - variables = DrtTokens.LAMBDA + "%s" % expression.variable + DrtTokens.DOT - right = self._visit_command(variables, x, y)[0] - - # Handle term - (right, bottom) = self._handle(expression.term, command, right, y) - - # Handle variables now that we know the y-coordinate - command( - variables, x, self._get_centered_top(y, bottom - y, self._get_text_height()) - ) - - return (right, bottom) - - def _handle_BinaryExpression(self, expression, command, x, y): - # Get the full height of the line, based on the operands - first_height = self._visit(expression.first, 0, 0)[1] - second_height = self._visit(expression.second, 0, 0)[1] - line_height = max(first_height, second_height) - - # Handle open paren - centred_string_top = self._get_centered_top( - y, line_height, self._get_text_height() - ) - right = command(DrtTokens.OPEN, x, centred_string_top)[0] - - # Handle the first operand - first_height = expression.first._drawing_height - (right, first_bottom) = self._handle( - expression.first, - command, - right, - self._get_centered_top(y, line_height, first_height), - ) - - # Handle the operator - right = command(" %s " % expression.getOp(), right, centred_string_top)[0] - - # Handle the second operand - second_height = expression.second._drawing_height - (right, second_bottom) = self._handle( - expression.second, - command, - right, - self._get_centered_top(y, line_height, second_height), - ) - - # Handle close paren - right = command(DrtTokens.CLOSE, right, centred_string_top)[0] - - return (right, max(first_bottom, second_bottom)) - - def _handle_DrtProposition(self, expression, command, x, y): - # Find the width of the negation symbol - right = command(expression.variable, x, y)[0] - - # Handle term - (right, bottom) = self._handle(expression.term, command, right, y) - - return (right, bottom) - - def _get_centered_top(self, top, full_height, item_height): - """Get the y-coordinate of the point that a figure should start at if - its height is 'item_height' and it needs to be centered in an area that - 
starts at 'top' and is 'full_height' tall.""" - return top + (full_height - item_height) / 2 - - -def demo(): - print("=" * 20 + "TEST PARSE" + "=" * 20) - dexpr = DrtExpression.fromstring - print(dexpr(r"([x,y],[sees(x,y)])")) - print(dexpr(r"([x],[man(x), walks(x)])")) - print(dexpr(r"\x.\y.([],[sees(x,y)])")) - print(dexpr(r"\x.([],[walks(x)])(john)")) - print(dexpr(r"(([x],[walks(x)]) + ([y],[runs(y)]))")) - print(dexpr(r"(([],[walks(x)]) -> ([],[runs(x)]))")) - print(dexpr(r"([x],[PRO(x), sees(John,x)])")) - print(dexpr(r"([x],[man(x), -([],[walks(x)])])")) - print(dexpr(r"([],[(([x],[man(x)]) -> ([],[walks(x)]))])")) - - print("=" * 20 + "Test fol()" + "=" * 20) - print(dexpr(r"([x,y],[sees(x,y)])").fol()) - - print("=" * 20 + "Test alpha conversion and lambda expression equality" + "=" * 20) - e1 = dexpr(r"\x.([],[P(x)])") - print(e1) - e2 = e1.alpha_convert(Variable("z")) - print(e2) - print(e1 == e2) - - print("=" * 20 + "Test resolve_anaphora()" + "=" * 20) - print(resolve_anaphora(dexpr(r"([x,y,z],[dog(x), cat(y), walks(z), PRO(z)])"))) - print( - resolve_anaphora(dexpr(r"([],[(([x],[dog(x)]) -> ([y],[walks(y), PRO(y)]))])")) - ) - print(resolve_anaphora(dexpr(r"(([x,y],[]) + ([],[PRO(x)]))"))) - - print("=" * 20 + "Test pretty_print()" + "=" * 20) - dexpr(r"([],[])").pretty_print() - dexpr( - r"([],[([x],[big(x), dog(x)]) -> ([],[bark(x)]) -([x],[walk(x)])])" - ).pretty_print() - dexpr(r"([x,y],[x=y]) + ([z],[dog(z), walk(z)])").pretty_print() - dexpr(r"([],[([x],[]) | ([y],[]) | ([z],[dog(z), walk(z)])])").pretty_print() - dexpr(r"\P.\Q.(([x],[]) + P(x) + Q(x))(\x.([],[dog(x)]))").pretty_print() - - -def test_draw(): - try: - from tkinter import Tk - except ImportError as e: - raise ValueError("tkinter is required, but it's not available.") - - expressions = [ - r"x", - r"([],[])", - r"([x],[])", - r"([x],[man(x)])", - r"([x,y],[sees(x,y)])", - r"([x],[man(x), walks(x)])", - r"\x.([],[man(x), walks(x)])", - r"\x y.([],[sees(x,y)])", - r"([],[(([],[walks(x)]) + ([],[runs(x)]))])", - r"([x],[man(x), -([],[walks(x)])])", - r"([],[(([x],[man(x)]) -> ([],[walks(x)]))])", - ] - - for e in expressions: - d = DrtExpression.fromstring(e) - d.draw() - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/sem/drt_glue_demo.py b/pipeline/nltk/sem/drt_glue_demo.py deleted file mode 100644 index fe27c9fc66f92600ebdcb13eb622d3d07db36985..0000000000000000000000000000000000000000 --- a/pipeline/nltk/sem/drt_glue_demo.py +++ /dev/null @@ -1,553 +0,0 @@ -# Natural Language Toolkit: GUI Demo for Glue Semantics with Discourse -# Representation Theory (DRT) as meaning language -# -# Author: Dan Garrette -# -# Copyright (C) 2001-2023 NLTK Project -# URL: -# For license information, see LICENSE.TXT - -try: - from tkinter import Button, Frame, IntVar, Label, Listbox, Menu, Scrollbar, Tk - from tkinter.font import Font - - from nltk.draw.util import CanvasFrame, ShowText - -except ImportError: - """Ignore ImportError because tkinter might not be available.""" - -from nltk.parse import MaltParser -from nltk.sem.drt import DrsDrawer, DrtVariableExpression -from nltk.sem.glue import DrtGlue -from nltk.sem.logic import Variable -from nltk.tag import RegexpTagger -from nltk.util import in_idle - - -class DrtGlueDemo: - def __init__(self, examples): - # Set up the main window. - self._top = Tk() - self._top.title("DRT Glue Demo") - - # Set up key bindings. 
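For context, the class defined here is driven by the demo() function at the end of this file; a minimal launch sketch (it assumes a locally installed and configured MaltParser, which the _init_glue helper below relies on):

# Minimal sketch of launching the GUI demo; MaltParser must be configured,
# since DrtGlue is built on a Malt dependency parse.
from nltk.sem.drt_glue_demo import DrtGlueDemo

DrtGlueDemo(["John walks", "David sees Mary", "every man chases a dog"]).mainloop()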
- self._init_bindings() - - # Initialize the fonts.self._error = None - self._init_fonts(self._top) - - self._examples = examples - self._readingCache = [None for example in examples] - - # The user can hide the grammar. - self._show_grammar = IntVar(self._top) - self._show_grammar.set(1) - - # Set the data to None - self._curExample = -1 - self._readings = [] - self._drs = None - self._drsWidget = None - self._error = None - - self._init_glue() - - # Create the basic frames. - self._init_menubar(self._top) - self._init_buttons(self._top) - self._init_exampleListbox(self._top) - self._init_readingListbox(self._top) - self._init_canvas(self._top) - - # Resize callback - self._canvas.bind("", self._configure) - - ######################################### - ## Initialization Helpers - ######################################### - - def _init_glue(self): - tagger = RegexpTagger( - [ - ("^(David|Mary|John)$", "NNP"), - ( - "^(walks|sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$", - "VB", - ), - ("^(go|order|vanish|find|approach)$", "VB"), - ("^(a)$", "ex_quant"), - ("^(every)$", "univ_quant"), - ("^(sandwich|man|dog|pizza|unicorn|cat|senator)$", "NN"), - ("^(big|gray|former)$", "JJ"), - ("^(him|himself)$", "PRP"), - ] - ) - - depparser = MaltParser(tagger=tagger) - self._glue = DrtGlue(depparser=depparser, remove_duplicates=False) - - def _init_fonts(self, root): - # See: - self._sysfont = Font(font=Button()["font"]) - root.option_add("*Font", self._sysfont) - - # TWhat's our font size (default=same as sysfont) - self._size = IntVar(root) - self._size.set(self._sysfont.cget("size")) - - self._boldfont = Font(family="helvetica", weight="bold", size=self._size.get()) - self._font = Font(family="helvetica", size=self._size.get()) - if self._size.get() < 0: - big = self._size.get() - 2 - else: - big = self._size.get() + 2 - self._bigfont = Font(family="helvetica", weight="bold", size=big) - - def _init_exampleListbox(self, parent): - self._exampleFrame = listframe = Frame(parent) - self._exampleFrame.pack(fill="both", side="left", padx=2) - self._exampleList_label = Label( - self._exampleFrame, font=self._boldfont, text="Examples" - ) - self._exampleList_label.pack() - self._exampleList = Listbox( - self._exampleFrame, - selectmode="single", - relief="groove", - background="white", - foreground="#909090", - font=self._font, - selectforeground="#004040", - selectbackground="#c0f0c0", - ) - - self._exampleList.pack(side="right", fill="both", expand=1) - - for example in self._examples: - self._exampleList.insert("end", (" %s" % example)) - self._exampleList.config(height=min(len(self._examples), 25), width=40) - - # Add a scrollbar if there are more than 25 examples. - if len(self._examples) > 25: - listscroll = Scrollbar(self._exampleFrame, orient="vertical") - self._exampleList.config(yscrollcommand=listscroll.set) - listscroll.config(command=self._exampleList.yview) - listscroll.pack(side="left", fill="y") - - # If they select a example, apply it. 
- self._exampleList.bind("<>", self._exampleList_select) - - def _init_readingListbox(self, parent): - self._readingFrame = listframe = Frame(parent) - self._readingFrame.pack(fill="both", side="left", padx=2) - self._readingList_label = Label( - self._readingFrame, font=self._boldfont, text="Readings" - ) - self._readingList_label.pack() - self._readingList = Listbox( - self._readingFrame, - selectmode="single", - relief="groove", - background="white", - foreground="#909090", - font=self._font, - selectforeground="#004040", - selectbackground="#c0f0c0", - ) - - self._readingList.pack(side="right", fill="both", expand=1) - - # Add a scrollbar if there are more than 25 examples. - listscroll = Scrollbar(self._readingFrame, orient="vertical") - self._readingList.config(yscrollcommand=listscroll.set) - listscroll.config(command=self._readingList.yview) - listscroll.pack(side="right", fill="y") - - self._populate_readingListbox() - - def _populate_readingListbox(self): - # Populate the listbox with integers - self._readingList.delete(0, "end") - for i in range(len(self._readings)): - self._readingList.insert("end", (" %s" % (i + 1))) - self._readingList.config(height=min(len(self._readings), 25), width=5) - - # If they select a example, apply it. - self._readingList.bind("<>", self._readingList_select) - - def _init_bindings(self): - # Key bindings are a good thing. - self._top.bind("", self.destroy) - self._top.bind("", self.destroy) - self._top.bind("", self.destroy) - self._top.bind("n", self.next) - self._top.bind("", self.next) - self._top.bind("p", self.prev) - self._top.bind("", self.prev) - - def _init_buttons(self, parent): - # Set up the frames. - self._buttonframe = buttonframe = Frame(parent) - buttonframe.pack(fill="none", side="bottom", padx=3, pady=2) - Button( - buttonframe, - text="Prev", - background="#90c0d0", - foreground="black", - command=self.prev, - ).pack(side="left") - Button( - buttonframe, - text="Next", - background="#90c0d0", - foreground="black", - command=self.next, - ).pack(side="left") - - def _configure(self, event): - self._autostep = 0 - (x1, y1, x2, y2) = self._cframe.scrollregion() - y2 = event.height - 6 - self._canvas["scrollregion"] = "%d %d %d %d" % (x1, y1, x2, y2) - self._redraw() - - def _init_canvas(self, parent): - self._cframe = CanvasFrame( - parent, - background="white", - # width=525, height=250, - closeenough=10, - border=2, - relief="sunken", - ) - self._cframe.pack(expand=1, fill="both", side="top", pady=2) - canvas = self._canvas = self._cframe.canvas() - - # Initially, there's no tree or text - self._tree = None - self._textwidgets = [] - self._textline = None - - def _init_menubar(self, parent): - menubar = Menu(parent) - - filemenu = Menu(menubar, tearoff=0) - filemenu.add_command( - label="Exit", underline=1, command=self.destroy, accelerator="q" - ) - menubar.add_cascade(label="File", underline=0, menu=filemenu) - - actionmenu = Menu(menubar, tearoff=0) - actionmenu.add_command( - label="Next", underline=0, command=self.next, accelerator="n, Space" - ) - actionmenu.add_command( - label="Previous", underline=0, command=self.prev, accelerator="p, Backspace" - ) - menubar.add_cascade(label="Action", underline=0, menu=actionmenu) - - optionmenu = Menu(menubar, tearoff=0) - optionmenu.add_checkbutton( - label="Remove Duplicates", - underline=0, - variable=self._glue.remove_duplicates, - command=self._toggle_remove_duplicates, - accelerator="r", - ) - menubar.add_cascade(label="Options", underline=0, menu=optionmenu) - - viewmenu = 
Menu(menubar, tearoff=0) - viewmenu.add_radiobutton( - label="Tiny", - variable=self._size, - underline=0, - value=10, - command=self.resize, - ) - viewmenu.add_radiobutton( - label="Small", - variable=self._size, - underline=0, - value=12, - command=self.resize, - ) - viewmenu.add_radiobutton( - label="Medium", - variable=self._size, - underline=0, - value=14, - command=self.resize, - ) - viewmenu.add_radiobutton( - label="Large", - variable=self._size, - underline=0, - value=18, - command=self.resize, - ) - viewmenu.add_radiobutton( - label="Huge", - variable=self._size, - underline=0, - value=24, - command=self.resize, - ) - menubar.add_cascade(label="View", underline=0, menu=viewmenu) - - helpmenu = Menu(menubar, tearoff=0) - helpmenu.add_command(label="About", underline=0, command=self.about) - menubar.add_cascade(label="Help", underline=0, menu=helpmenu) - - parent.config(menu=menubar) - - ######################################### - ## Main draw procedure - ######################################### - - def _redraw(self): - canvas = self._canvas - - # Delete the old DRS, widgets, etc. - if self._drsWidget is not None: - self._drsWidget.clear() - - if self._drs: - self._drsWidget = DrsWidget(self._canvas, self._drs) - self._drsWidget.draw() - - if self._error: - self._drsWidget = DrsWidget(self._canvas, self._error) - self._drsWidget.draw() - - ######################################### - ## Button Callbacks - ######################################### - - def destroy(self, *e): - self._autostep = 0 - if self._top is None: - return - self._top.destroy() - self._top = None - - def prev(self, *e): - selection = self._readingList.curselection() - readingListSize = self._readingList.size() - - # there are readings - if readingListSize > 0: - # if one reading is currently selected - if len(selection) == 1: - index = int(selection[0]) - - # if it's on (or before) the first item - if index <= 0: - self._select_previous_example() - else: - self._readingList_store_selection(index - 1) - - else: - # select its first reading - self._readingList_store_selection(readingListSize - 1) - - else: - self._select_previous_example() - - def _select_previous_example(self): - # if the current example is not the first example - if self._curExample > 0: - self._exampleList_store_selection(self._curExample - 1) - else: - # go to the last example - self._exampleList_store_selection(len(self._examples) - 1) - - def next(self, *e): - selection = self._readingList.curselection() - readingListSize = self._readingList.size() - - # if there are readings - if readingListSize > 0: - # if one reading is currently selected - if len(selection) == 1: - index = int(selection[0]) - - # if it's on (or past) the last item - if index >= (readingListSize - 1): - self._select_next_example() - else: - self._readingList_store_selection(index + 1) - - else: - # select its first reading - self._readingList_store_selection(0) - - else: - self._select_next_example() - - def _select_next_example(self): - # if the current example is not the last example - if self._curExample < len(self._examples) - 1: - self._exampleList_store_selection(self._curExample + 1) - else: - # go to the first example - self._exampleList_store_selection(0) - - def about(self, *e): - ABOUT = ( - "NLTK Discourse Representation Theory (DRT) Glue Semantics Demo\n" - + "Written by Daniel H. 
Garrette" - ) - TITLE = "About: NLTK DRT Glue Demo" - try: - from tkinter.messagebox import Message - - Message(message=ABOUT, title=TITLE).show() - except: - ShowText(self._top, TITLE, ABOUT) - - def postscript(self, *e): - self._autostep = 0 - self._cframe.print_to_file() - - def mainloop(self, *args, **kwargs): - """ - Enter the Tkinter mainloop. This function must be called if - this demo is created from a non-interactive program (e.g. - from a secript); otherwise, the demo will close as soon as - the script completes. - """ - if in_idle(): - return - self._top.mainloop(*args, **kwargs) - - def resize(self, size=None): - if size is not None: - self._size.set(size) - size = self._size.get() - self._font.configure(size=-(abs(size))) - self._boldfont.configure(size=-(abs(size))) - self._sysfont.configure(size=-(abs(size))) - self._bigfont.configure(size=-(abs(size + 2))) - self._redraw() - - def _toggle_remove_duplicates(self): - self._glue.remove_duplicates = not self._glue.remove_duplicates - - self._exampleList.selection_clear(0, "end") - self._readings = [] - self._populate_readingListbox() - self._readingCache = [None for ex in self._examples] - self._curExample = -1 - self._error = None - - self._drs = None - self._redraw() - - def _exampleList_select(self, event): - selection = self._exampleList.curselection() - if len(selection) != 1: - return - self._exampleList_store_selection(int(selection[0])) - - def _exampleList_store_selection(self, index): - self._curExample = index - example = self._examples[index] - - self._exampleList.selection_clear(0, "end") - if example: - cache = self._readingCache[index] - if cache: - if isinstance(cache, list): - self._readings = cache - self._error = None - else: - self._readings = [] - self._error = cache - else: - try: - self._readings = self._glue.parse_to_meaning(example) - self._error = None - self._readingCache[index] = self._readings - except Exception as e: - self._readings = [] - self._error = DrtVariableExpression(Variable("Error: " + str(e))) - self._readingCache[index] = self._error - - # add a star to the end of the example - self._exampleList.delete(index) - self._exampleList.insert(index, (" %s *" % example)) - self._exampleList.config( - height=min(len(self._examples), 25), width=40 - ) - - self._populate_readingListbox() - - self._exampleList.selection_set(index) - - self._drs = None - self._redraw() - - def _readingList_select(self, event): - selection = self._readingList.curselection() - if len(selection) != 1: - return - self._readingList_store_selection(int(selection[0])) - - def _readingList_store_selection(self, index): - reading = self._readings[index] - - self._readingList.selection_clear(0, "end") - if reading: - self._readingList.selection_set(index) - - self._drs = reading.simplify().normalize().resolve_anaphora() - - self._redraw() - - -class DrsWidget: - def __init__(self, canvas, drs, **attribs): - self._drs = drs - self._canvas = canvas - canvas.font = Font( - font=canvas.itemcget(canvas.create_text(0, 0, text=""), "font") - ) - canvas._BUFFER = 3 - self.bbox = (0, 0, 0, 0) - - def draw(self): - (right, bottom) = DrsDrawer(self._drs, canvas=self._canvas).draw() - self.bbox = (0, 0, right + 1, bottom + 1) - - def clear(self): - self._canvas.create_rectangle(self.bbox, fill="white", width="0") - - -def demo(): - examples = [ - "John walks", - "David sees Mary", - "David eats a sandwich", - "every man chases a dog", - # 'every man believes a dog yawns', - # 'John gives David a sandwich', - "John chases himself", - # 
'John persuades David to order a pizza', - # 'John tries to go', - # 'John tries to find a unicorn', - # 'John seems to vanish', - # 'a unicorn seems to approach', - # 'every big cat leaves', - # 'every gray cat leaves', - # 'every big gray cat leaves', - # 'a former senator leaves', - # 'John likes a cat', - # 'John likes every cat', - # 'he walks', - # 'John walks and he leaves' - ] - DrtGlueDemo(examples).mainloop() - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/sem/evaluate.py b/pipeline/nltk/sem/evaluate.py deleted file mode 100644 index bbff44f70b34ce0ac4de038b83a95a325d44abaf..0000000000000000000000000000000000000000 --- a/pipeline/nltk/sem/evaluate.py +++ /dev/null @@ -1,829 +0,0 @@ -# Natural Language Toolkit: Models for first-order languages with lambda -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Ewan Klein , -# URL: -# For license information, see LICENSE.TXT - -# TODO: -# - fix tracing -# - fix iterator-based approach to existentials - -""" -This module provides data structures for representing first-order -models. -""" - -import inspect -import re -import sys -import textwrap -from pprint import pformat - -from nltk.decorators import decorator # this used in code that is commented out -from nltk.sem.logic import ( - AbstractVariableExpression, - AllExpression, - AndExpression, - ApplicationExpression, - EqualityExpression, - ExistsExpression, - Expression, - IffExpression, - ImpExpression, - IndividualVariableExpression, - IotaExpression, - LambdaExpression, - NegatedExpression, - OrExpression, - Variable, - is_indvar, -) - - -class Error(Exception): - pass - - -class Undefined(Error): - pass - - -def trace(f, *args, **kw): - argspec = inspect.getfullargspec(f) - d = dict(zip(argspec[0], args)) - if d.pop("trace", None): - print() - for item in d.items(): - print("%s => %s" % item) - return f(*args, **kw) - - -def is_rel(s): - """ - Check whether a set represents a relation (of any arity). - - :param s: a set containing tuples of str elements - :type s: set - :rtype: bool - """ - # we have the empty relation, i.e. set() - if len(s) == 0: - return True - # all the elements are tuples of the same length - elif all(isinstance(el, tuple) for el in s) and len(max(s)) == len(min(s)): - return True - else: - raise ValueError("Set %r contains sequences of different lengths" % s) - - -def set2rel(s): - """ - Convert a set containing individuals (strings or numbers) into a set of - unary tuples. Any tuples of strings already in the set are passed through - unchanged. - - For example: - - set(['a', 'b']) => set([('a',), ('b',)]) - - set([3, 27]) => set([('3',), ('27',)]) - - :type s: set - :rtype: set of tuple of str - """ - new = set() - for elem in s: - if isinstance(elem, str): - new.add((elem,)) - elif isinstance(elem, int): - new.add(str(elem)) - else: - new.add(elem) - return new - - -def arity(rel): - """ - Check the arity of a relation. - :type rel: set of tuples - :rtype: int of tuple of str - """ - if len(rel) == 0: - return 0 - return len(list(rel)[0]) - - -class Valuation(dict): - """ - A dictionary which represents a model-theoretic Valuation of non-logical constants. - Keys are strings representing the constants to be interpreted, and values correspond - to individuals (represented as strings) and n-ary relations (represented as sets of tuples - of strings). - - An instance of ``Valuation`` will raise a KeyError exception (i.e., - just behave like a standard dictionary) if indexed with an expression that - is not in its list of symbols. 
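To make the Valuation / Assignment / Model workflow described in this module concrete, a short usage sketch (the symbols "john", "girl" and "love" are invented for illustration; the calls follow the docstrings in this file):

# Sketch of the first-order evaluation workflow provided by this module.
# Symbol names are illustrative only.
from nltk.sem.evaluate import Assignment, Model, Valuation

val = Valuation([("john", "b1"),
                 ("girl", {"g1", "g2"}),       # stored internally as {('g1',), ('g2',)}
                 ("love", {("b1", "g1")})])
dom = val.domain                               # {'b1', 'g1', 'g2'}

m = Model(dom, val)
g = Assignment(dom)

print(m.evaluate("love(john, x)", g.add("x", "g1")))    # True
print(m.evaluate("exists y.(love(john, y))", g))        # True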
- """ - - def __init__(self, xs): - """ - :param xs: a list of (symbol, value) pairs. - """ - super().__init__() - for (sym, val) in xs: - if isinstance(val, str) or isinstance(val, bool): - self[sym] = val - elif isinstance(val, set): - self[sym] = set2rel(val) - else: - msg = textwrap.fill( - "Error in initializing Valuation. " - "Unrecognized value for symbol '%s':\n%s" % (sym, val), - width=66, - ) - - raise ValueError(msg) - - def __getitem__(self, key): - if key in self: - return dict.__getitem__(self, key) - else: - raise Undefined("Unknown expression: '%s'" % key) - - def __str__(self): - return pformat(self) - - @property - def domain(self): - """Set-theoretic domain of the value-space of a Valuation.""" - dom = [] - for val in self.values(): - if isinstance(val, str): - dom.append(val) - elif not isinstance(val, bool): - dom.extend( - [elem for tuple_ in val for elem in tuple_ if elem is not None] - ) - return set(dom) - - @property - def symbols(self): - """The non-logical constants which the Valuation recognizes.""" - return sorted(self.keys()) - - @classmethod - def fromstring(cls, s): - return read_valuation(s) - - -########################################## -# REs used by the _read_valuation function -########################################## -_VAL_SPLIT_RE = re.compile(r"\s*=+>\s*") -_ELEMENT_SPLIT_RE = re.compile(r"\s*,\s*") -_TUPLES_RE = re.compile( - r"""\s* - (\([^)]+\)) # tuple-expression - \s*""", - re.VERBOSE, -) - - -def _read_valuation_line(s): - """ - Read a line in a valuation file. - - Lines are expected to be of the form:: - - noosa => n - girl => {g1, g2} - chase => {(b1, g1), (b2, g1), (g1, d1), (g2, d2)} - - :param s: input line - :type s: str - :return: a pair (symbol, value) - :rtype: tuple - """ - pieces = _VAL_SPLIT_RE.split(s) - symbol = pieces[0] - value = pieces[1] - # check whether the value is meant to be a set - if value.startswith("{"): - value = value[1:-1] - tuple_strings = _TUPLES_RE.findall(value) - # are the set elements tuples? - if tuple_strings: - set_elements = [] - for ts in tuple_strings: - ts = ts[1:-1] - element = tuple(_ELEMENT_SPLIT_RE.split(ts)) - set_elements.append(element) - else: - set_elements = _ELEMENT_SPLIT_RE.split(value) - value = set(set_elements) - return symbol, value - - -def read_valuation(s, encoding=None): - """ - Convert a valuation string into a valuation. - - :param s: a valuation string - :type s: str - :param encoding: the encoding of the input string, if it is binary - :type encoding: str - :return: a ``nltk.sem`` valuation - :rtype: Valuation - """ - if encoding is not None: - s = s.decode(encoding) - statements = [] - for linenum, line in enumerate(s.splitlines()): - line = line.strip() - if line.startswith("#") or line == "": - continue - try: - statements.append(_read_valuation_line(line)) - except ValueError as e: - raise ValueError(f"Unable to parse line {linenum}: {line}") from e - return Valuation(statements) - - -class Assignment(dict): - r""" - A dictionary which represents an assignment of values to variables. - - An assignment can only assign values from its domain. - - If an unknown expression *a* is passed to a model *M*\ 's - interpretation function *i*, *i* will first check whether *M*\ 's - valuation assigns an interpretation to *a* as a constant, and if - this fails, *i* will delegate the interpretation of *a* to - *g*. *g* only assigns values to individual variables (i.e., - members of the class ``IndividualVariableExpression`` in the ``logic`` - module. 
If a variable is not assigned a value by *g*, it will raise - an ``Undefined`` exception. - - A variable *Assignment* is a mapping from individual variables to - entities in the domain. Individual variables are usually indicated - with the letters ``'x'``, ``'y'``, ``'w'`` and ``'z'``, optionally - followed by an integer (e.g., ``'x0'``, ``'y332'``). Assignments are - created using the ``Assignment`` constructor, which also takes the - domain as a parameter. - - >>> from nltk.sem.evaluate import Assignment - >>> dom = set(['u1', 'u2', 'u3', 'u4']) - >>> g3 = Assignment(dom, [('x', 'u1'), ('y', 'u2')]) - >>> g3 == {'x': 'u1', 'y': 'u2'} - True - - There is also a ``print`` format for assignments which uses a notation - closer to that in logic textbooks: - - >>> print(g3) - g[u1/x][u2/y] - - It is also possible to update an assignment using the ``add`` method: - - >>> dom = set(['u1', 'u2', 'u3', 'u4']) - >>> g4 = Assignment(dom) - >>> g4.add('x', 'u1') - {'x': 'u1'} - - With no arguments, ``purge()`` is equivalent to ``clear()`` on a dictionary: - - >>> g4.purge() - >>> g4 - {} - - :param domain: the domain of discourse - :type domain: set - :param assign: a list of (varname, value) associations - :type assign: list - """ - - def __init__(self, domain, assign=None): - super().__init__() - self.domain = domain - if assign: - for (var, val) in assign: - assert val in self.domain, "'{}' is not in the domain: {}".format( - val, - self.domain, - ) - assert is_indvar(var), ( - "Wrong format for an Individual Variable: '%s'" % var - ) - self[var] = val - self.variant = None - self._addvariant() - - def __getitem__(self, key): - if key in self: - return dict.__getitem__(self, key) - else: - raise Undefined("Not recognized as a variable: '%s'" % key) - - def copy(self): - new = Assignment(self.domain) - new.update(self) - return new - - def purge(self, var=None): - """ - Remove one or all keys (i.e. logic variables) from an - assignment, and update ``self.variant``. - - :param var: a Variable acting as a key for the assignment. - """ - if var: - del self[var] - else: - self.clear() - self._addvariant() - return None - - def __str__(self): - """ - Pretty printing for assignments. {'x', 'u'} appears as 'g[u/x]' - """ - gstring = "g" - # Deterministic output for unit testing. - variant = sorted(self.variant) - for (val, var) in variant: - gstring += f"[{val}/{var}]" - return gstring - - def _addvariant(self): - """ - Create a more pretty-printable version of the assignment. - """ - list_ = [] - for item in self.items(): - pair = (item[1], item[0]) - list_.append(pair) - self.variant = list_ - return None - - def add(self, var, val): - """ - Add a new variable-value pair to the assignment, and update - ``self.variant``. - - """ - assert val in self.domain, f"{val} is not in the domain {self.domain}" - assert is_indvar(var), "Wrong format for an Individual Variable: '%s'" % var - self[var] = val - self._addvariant() - return self - - -class Model: - """ - A first order model is a domain *D* of discourse and a valuation *V*. - - A domain *D* is a set, and a valuation *V* is a map that associates - expressions with values in the model. - The domain of *V* should be a subset of *D*. - - Construct a new ``Model``. - - :type domain: set - :param domain: A set of entities representing the domain of discourse of the model. - :type valuation: Valuation - :param valuation: the valuation of the model. 
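The Assignment doctests above translate directly into a usage sketch (upstream nltk assumed):

from nltk.sem.evaluate import Assignment

dom = {"u1", "u2", "u3", "u4"}
g = Assignment(dom, [("x", "u1"), ("y", "u2")])
print(g)           # g[u1/x][u2/y]
g.add("z", "u3")   # values must come from dom; names must be individual variables
g.purge("z")       # drop a single variable ...
g.purge()          # ... or clear the whole assignment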
- :param prop: If this is set, then we are building a propositional\ - model and don't require the domain of *V* to be subset of *D*. - """ - - def __init__(self, domain, valuation): - assert isinstance(domain, set) - self.domain = domain - self.valuation = valuation - if not domain.issuperset(valuation.domain): - raise Error( - "The valuation domain, %s, must be a subset of the model's domain, %s" - % (valuation.domain, domain) - ) - - def __repr__(self): - return f"({self.domain!r}, {self.valuation!r})" - - def __str__(self): - return f"Domain = {self.domain},\nValuation = \n{self.valuation}" - - def evaluate(self, expr, g, trace=None): - """ - Read input expressions, and provide a handler for ``satisfy`` - that blocks further propagation of the ``Undefined`` error. - :param expr: An ``Expression`` of ``logic``. - :type g: Assignment - :param g: an assignment to individual variables. - :rtype: bool or 'Undefined' - """ - try: - parsed = Expression.fromstring(expr) - value = self.satisfy(parsed, g, trace=trace) - if trace: - print() - print(f"'{expr}' evaluates to {value} under M, {g}") - return value - except Undefined: - if trace: - print() - print(f"'{expr}' is undefined under M, {g}") - return "Undefined" - - def satisfy(self, parsed, g, trace=None): - """ - Recursive interpretation function for a formula of first-order logic. - - Raises an ``Undefined`` error when ``parsed`` is an atomic string - but is not a symbol or an individual variable. - - :return: Returns a truth value or ``Undefined`` if ``parsed`` is\ - complex, and calls the interpretation function ``i`` if ``parsed``\ - is atomic. - - :param parsed: An expression of ``logic``. - :type g: Assignment - :param g: an assignment to individual variables. - """ - - if isinstance(parsed, ApplicationExpression): - function, arguments = parsed.uncurry() - if isinstance(function, AbstractVariableExpression): - # It's a predicate expression ("P(x,y)"), so used uncurried arguments - funval = self.satisfy(function, g) - argvals = tuple(self.satisfy(arg, g) for arg in arguments) - return argvals in funval - else: - # It must be a lambda expression, so use curried form - funval = self.satisfy(parsed.function, g) - argval = self.satisfy(parsed.argument, g) - return funval[argval] - elif isinstance(parsed, NegatedExpression): - return not self.satisfy(parsed.term, g) - elif isinstance(parsed, AndExpression): - return self.satisfy(parsed.first, g) and self.satisfy(parsed.second, g) - elif isinstance(parsed, OrExpression): - return self.satisfy(parsed.first, g) or self.satisfy(parsed.second, g) - elif isinstance(parsed, ImpExpression): - return (not self.satisfy(parsed.first, g)) or self.satisfy(parsed.second, g) - elif isinstance(parsed, IffExpression): - return self.satisfy(parsed.first, g) == self.satisfy(parsed.second, g) - elif isinstance(parsed, EqualityExpression): - return self.satisfy(parsed.first, g) == self.satisfy(parsed.second, g) - elif isinstance(parsed, AllExpression): - new_g = g.copy() - for u in self.domain: - new_g.add(parsed.variable.name, u) - if not self.satisfy(parsed.term, new_g): - return False - return True - elif isinstance(parsed, ExistsExpression): - new_g = g.copy() - for u in self.domain: - new_g.add(parsed.variable.name, u) - if self.satisfy(parsed.term, new_g): - return True - return False - elif isinstance(parsed, IotaExpression): - new_g = g.copy() - for u in self.domain: - new_g.add(parsed.variable.name, u) - if self.satisfy(parsed.term, new_g): - return True - return False - elif isinstance(parsed, 
LambdaExpression): - cf = {} - var = parsed.variable.name - for u in self.domain: - val = self.satisfy(parsed.term, g.add(var, u)) - # NB the dict would be a lot smaller if we do this: - # if val: cf[u] = val - # But then need to deal with cases where f(a) should yield - # a function rather than just False. - cf[u] = val - return cf - else: - return self.i(parsed, g, trace) - - # @decorator(trace_eval) - def i(self, parsed, g, trace=False): - """ - An interpretation function. - - Assuming that ``parsed`` is atomic: - - - if ``parsed`` is a non-logical constant, calls the valuation *V* - - else if ``parsed`` is an individual variable, calls assignment *g* - - else returns ``Undefined``. - - :param parsed: an ``Expression`` of ``logic``. - :type g: Assignment - :param g: an assignment to individual variables. - :return: a semantic value - """ - # If parsed is a propositional letter 'p', 'q', etc, it could be in valuation.symbols - # and also be an IndividualVariableExpression. We want to catch this first case. - # So there is a procedural consequence to the ordering of clauses here: - if parsed.variable.name in self.valuation.symbols: - return self.valuation[parsed.variable.name] - elif isinstance(parsed, IndividualVariableExpression): - return g[parsed.variable.name] - - else: - raise Undefined("Can't find a value for %s" % parsed) - - def satisfiers(self, parsed, varex, g, trace=None, nesting=0): - """ - Generate the entities from the model's domain that satisfy an open formula. - - :param parsed: an open formula - :type parsed: Expression - :param varex: the relevant free individual variable in ``parsed``. - :type varex: VariableExpression or str - :param g: a variable assignment - :type g: Assignment - :return: a set of the entities that satisfy ``parsed``. - """ - - spacer = " " - indent = spacer + (spacer * nesting) - candidates = [] - - if isinstance(varex, str): - var = Variable(varex) - else: - var = varex - - if var in parsed.free(): - if trace: - print() - print( - (spacer * nesting) - + f"Open formula is '{parsed}' with assignment {g}" - ) - for u in self.domain: - new_g = g.copy() - new_g.add(var.name, u) - if trace and trace > 1: - lowtrace = trace - 1 - else: - lowtrace = 0 - value = self.satisfy(parsed, new_g, lowtrace) - - if trace: - print(indent + "(trying assignment %s)" % new_g) - - # parsed == False under g[u/var]? - if value == False: - if trace: - print(indent + f"value of '{parsed}' under {new_g} is False") - - # so g[u/var] is a satisfying assignment - else: - candidates.append(u) - if trace: - print(indent + f"value of '{parsed}' under {new_g} is {value}") - - result = {c for c in candidates} - # var isn't free in parsed - else: - raise Undefined(f"{var.name} is not free in {parsed}") - - return result - - -# ////////////////////////////////////////////////////////////////////// -# Demo.. 
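Putting Valuation, Assignment, and Model together, the recursive satisfy/i machinery above supports both evaluation of closed formulas and satisfier search over open ones. A hedged sketch mirroring the demos below (upstream nltk assumed):

from nltk.sem.evaluate import Assignment, Model, Valuation
from nltk.sem.logic import Expression

v = Valuation([
    ("adam", "b1"), ("betty", "g1"),
    ("girl", {"g1", "g2"}), ("boy", {"b1", "b2"}),
    ("love", {("b1", "g1"), ("b2", "g2"), ("g1", "b1"), ("g2", "b1")}),
])
m = Model(v.domain, v)
g = Assignment(v.domain)

print(m.evaluate("exists x. (girl(x) & love(x, adam))", g))  # True
open_formula = Expression.fromstring("love(adam, x)")
print(m.satisfiers(open_formula, "x", g))                    # {'g1'}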
-# ////////////////////////////////////////////////////////////////////// -# number of spacer chars -mult = 30 - -# Demo 1: Propositional Logic -################# -def propdemo(trace=None): - """Example of a propositional model.""" - - global val1, dom1, m1, g1 - val1 = Valuation([("P", True), ("Q", True), ("R", False)]) - dom1 = set() - m1 = Model(dom1, val1) - g1 = Assignment(dom1) - - print() - print("*" * mult) - print("Propositional Formulas Demo") - print("*" * mult) - print("(Propositional constants treated as nullary predicates)") - print() - print("Model m1:\n", m1) - print("*" * mult) - sentences = [ - "(P & Q)", - "(P & R)", - "- P", - "- R", - "- - P", - "- (P & R)", - "(P | R)", - "(R | P)", - "(R | R)", - "(- P | R)", - "(P | - P)", - "(P -> Q)", - "(P -> R)", - "(R -> P)", - "(P <-> P)", - "(R <-> R)", - "(P <-> R)", - ] - - for sent in sentences: - if trace: - print() - m1.evaluate(sent, g1, trace) - else: - print(f"The value of '{sent}' is: {m1.evaluate(sent, g1)}") - - -# Demo 2: FOL Model -############# - - -def folmodel(quiet=False, trace=None): - """Example of a first-order model.""" - - global val2, v2, dom2, m2, g2 - - v2 = [ - ("adam", "b1"), - ("betty", "g1"), - ("fido", "d1"), - ("girl", {"g1", "g2"}), - ("boy", {"b1", "b2"}), - ("dog", {"d1"}), - ("love", {("b1", "g1"), ("b2", "g2"), ("g1", "b1"), ("g2", "b1")}), - ] - val2 = Valuation(v2) - dom2 = val2.domain - m2 = Model(dom2, val2) - g2 = Assignment(dom2, [("x", "b1"), ("y", "g2")]) - - if not quiet: - print() - print("*" * mult) - print("Models Demo") - print("*" * mult) - print("Model m2:\n", "-" * 14, "\n", m2) - print("Variable assignment = ", g2) - - exprs = ["adam", "boy", "love", "walks", "x", "y", "z"] - parsed_exprs = [Expression.fromstring(e) for e in exprs] - - print() - for parsed in parsed_exprs: - try: - print( - "The interpretation of '%s' in m2 is %s" - % (parsed, m2.i(parsed, g2)) - ) - except Undefined: - print("The interpretation of '%s' in m2 is Undefined" % parsed) - - applications = [ - ("boy", ("adam")), - ("walks", ("adam",)), - ("love", ("adam", "y")), - ("love", ("y", "adam")), - ] - - for (fun, args) in applications: - try: - funval = m2.i(Expression.fromstring(fun), g2) - argsval = tuple(m2.i(Expression.fromstring(arg), g2) for arg in args) - print(f"{fun}({args}) evaluates to {argsval in funval}") - except Undefined: - print(f"{fun}({args}) evaluates to Undefined") - - -# Demo 3: FOL -######### - - -def foldemo(trace=None): - """ - Interpretation of closed expressions in a first-order model. - """ - folmodel(quiet=True) - - print() - print("*" * mult) - print("FOL Formulas Demo") - print("*" * mult) - - formulas = [ - "love (adam, betty)", - "(adam = mia)", - "\\x. (boy(x) | girl(x))", - "\\x. boy(x)(adam)", - "\\x y. love(x, y)", - "\\x y. love(x, y)(adam)(betty)", - "\\x y. love(x, y)(adam, betty)", - "\\x y. (boy(x) & love(x, y))", - "\\x. exists y. (boy(x) & love(x, y))", - "exists z1. boy(z1)", - "exists x. (boy(x) & -(x = adam))", - "exists x. (boy(x) & all y. love(y, x))", - "all x. (boy(x) | girl(x))", - "all x. (girl(x) -> exists y. boy(y) & love(x, y))", # Every girl loves exists boy. - "exists x. (boy(x) & all y. (girl(y) -> love(y, x)))", # There is exists boy that every girl loves. - "exists x. (boy(x) & all y. (girl(y) -> love(x, y)))", # exists boy loves every girl. - "all x. (dog(x) -> - girl(x))", - "exists x. exists y. 
(love(x, y) & love(x, y))", - ] - - for fmla in formulas: - g2.purge() - if trace: - m2.evaluate(fmla, g2, trace) - else: - print(f"The value of '{fmla}' is: {m2.evaluate(fmla, g2)}") - - -# Demo 3: Satisfaction -############# - - -def satdemo(trace=None): - """Satisfiers of an open formula in a first order model.""" - - print() - print("*" * mult) - print("Satisfiers Demo") - print("*" * mult) - - folmodel(quiet=True) - - formulas = [ - "boy(x)", - "(x = x)", - "(boy(x) | girl(x))", - "(boy(x) & girl(x))", - "love(adam, x)", - "love(x, adam)", - "-(x = adam)", - "exists z22. love(x, z22)", - "exists y. love(y, x)", - "all y. (girl(y) -> love(x, y))", - "all y. (girl(y) -> love(y, x))", - "all y. (girl(y) -> (boy(x) & love(y, x)))", - "(boy(x) & all y. (girl(y) -> love(x, y)))", - "(boy(x) & all y. (girl(y) -> love(y, x)))", - "(boy(x) & exists y. (girl(y) & love(y, x)))", - "(girl(x) -> dog(x))", - "all y. (dog(y) -> (x = y))", - "exists y. love(y, x)", - "exists y. (love(adam, y) & love(y, x))", - ] - - if trace: - print(m2) - - for fmla in formulas: - print(fmla) - Expression.fromstring(fmla) - - parsed = [Expression.fromstring(fmla) for fmla in formulas] - - for p in parsed: - g2.purge() - print( - "The satisfiers of '{}' are: {}".format(p, m2.satisfiers(p, "x", g2, trace)) - ) - - -def demo(num=0, trace=None): - """ - Run exists demos. - - - num = 1: propositional logic demo - - num = 2: first order model demo (only if trace is set) - - num = 3: first order sentences demo - - num = 4: satisfaction of open formulas demo - - any other value: run all the demos - - :param trace: trace = 1, or trace = 2 for more verbose tracing - """ - demos = {1: propdemo, 2: folmodel, 3: foldemo, 4: satdemo} - - try: - demos[num](trace=trace) - except KeyError: - for num in demos: - demos[num](trace=trace) - - -if __name__ == "__main__": - demo(2, trace=0) diff --git a/pipeline/nltk/sem/glue.py b/pipeline/nltk/sem/glue.py deleted file mode 100644 index 1098c83bec71cee14b6c06e93ba3f15366c0ada2..0000000000000000000000000000000000000000 --- a/pipeline/nltk/sem/glue.py +++ /dev/null @@ -1,835 +0,0 @@ -# Natural Language Toolkit: Glue Semantics -# -# Author: Dan Garrette -# -# Copyright (C) 2001-2023 NLTK Project -# URL: -# For license information, see LICENSE.TXT - -import os -from itertools import chain - -import nltk -from nltk.internals import Counter -from nltk.sem import drt, linearlogic -from nltk.sem.logic import ( - AbstractVariableExpression, - Expression, - LambdaExpression, - Variable, - VariableExpression, -) -from nltk.tag import BigramTagger, RegexpTagger, TrigramTagger, UnigramTagger - -SPEC_SEMTYPES = { - "a": "ex_quant", - "an": "ex_quant", - "every": "univ_quant", - "the": "def_art", - "no": "no_quant", - "default": "ex_quant", -} - -OPTIONAL_RELATIONSHIPS = ["nmod", "vmod", "punct"] - - -class GlueFormula: - def __init__(self, meaning, glue, indices=None): - if not indices: - indices = set() - - if isinstance(meaning, str): - self.meaning = Expression.fromstring(meaning) - elif isinstance(meaning, Expression): - self.meaning = meaning - else: - raise RuntimeError( - "Meaning term neither string or expression: %s, %s" - % (meaning, meaning.__class__) - ) - - if isinstance(glue, str): - self.glue = linearlogic.LinearLogicParser().parse(glue) - elif isinstance(glue, linearlogic.Expression): - self.glue = glue - else: - raise RuntimeError( - "Glue term neither string or expression: %s, %s" - % (glue, glue.__class__) - ) - - self.indices = indices - - def applyto(self, arg): - """self = 
(\\x.(walk x), (subj -o f)) - arg = (john , subj) - returns ((walk john), f) - """ - if self.indices & arg.indices: # if the sets are NOT disjoint - raise linearlogic.LinearLogicApplicationException( - f"'{self}' applied to '{arg}'. Indices are not disjoint." - ) - else: # if the sets ARE disjoint - return_indices = self.indices | arg.indices - - try: - return_glue = linearlogic.ApplicationExpression( - self.glue, arg.glue, arg.indices - ) - except linearlogic.LinearLogicApplicationException as e: - raise linearlogic.LinearLogicApplicationException( - f"'{self.simplify()}' applied to '{arg.simplify()}'" - ) from e - - arg_meaning_abstracted = arg.meaning - if return_indices: - for dep in self.glue.simplify().antecedent.dependencies[ - ::-1 - ]: # if self.glue is (A -o B), dep is in A.dependencies - arg_meaning_abstracted = self.make_LambdaExpression( - Variable("v%s" % dep), arg_meaning_abstracted - ) - return_meaning = self.meaning.applyto(arg_meaning_abstracted) - - return self.__class__(return_meaning, return_glue, return_indices) - - def make_VariableExpression(self, name): - return VariableExpression(name) - - def make_LambdaExpression(self, variable, term): - return LambdaExpression(variable, term) - - def lambda_abstract(self, other): - assert isinstance(other, GlueFormula) - assert isinstance(other.meaning, AbstractVariableExpression) - return self.__class__( - self.make_LambdaExpression(other.meaning.variable, self.meaning), - linearlogic.ImpExpression(other.glue, self.glue), - ) - - def compile(self, counter=None): - """From Iddo Lev's PhD Dissertation p108-109""" - if not counter: - counter = Counter() - (compiled_glue, new_forms) = self.glue.simplify().compile_pos( - counter, self.__class__ - ) - return new_forms + [ - self.__class__(self.meaning, compiled_glue, {counter.get()}) - ] - - def simplify(self): - return self.__class__( - self.meaning.simplify(), self.glue.simplify(), self.indices - ) - - def __eq__(self, other): - return ( - self.__class__ == other.__class__ - and self.meaning == other.meaning - and self.glue == other.glue - ) - - def __ne__(self, other): - return not self == other - - # sorting for use in doctests which must be deterministic - def __lt__(self, other): - return str(self) < str(other) - - def __str__(self): - assert isinstance(self.indices, set) - accum = f"{self.meaning} : {self.glue}" - if self.indices: - accum += ( - " : {" + ", ".join(str(index) for index in sorted(self.indices)) + "}" - ) - return accum - - def __repr__(self): - return "%s" % self - - -class GlueDict(dict): - def __init__(self, filename, encoding=None): - self.filename = filename - self.file_encoding = encoding - self.read_file() - - def read_file(self, empty_first=True): - if empty_first: - self.clear() - - try: - contents = nltk.data.load( - self.filename, format="text", encoding=self.file_encoding - ) - # TODO: the above can't handle zip files, but this should anyway be fixed in nltk.data.load() - except LookupError as e: - try: - contents = nltk.data.load( - "file:" + self.filename, format="text", encoding=self.file_encoding - ) - except LookupError: - raise e - lines = contents.splitlines() - - for line in lines: # example: 'n : (\\x.( x), (v-or))' - # lambdacalc -^ linear logic -^ - line = line.strip() # remove trailing newline - if not len(line): - continue # skip empty lines - if line[0] == "#": - continue # skip commented out lines - - parts = line.split( - " : ", 2 - ) # ['verb', '(\\x.( x), ( subj -o f ))', '[subj]'] - - glue_formulas = [] - paren_count = 0 - 
tuple_start = 0 - tuple_comma = 0 - - relationships = None - - if len(parts) > 1: - for (i, c) in enumerate(parts[1]): - if c == "(": - if paren_count == 0: # if it's the first '(' of a tuple - tuple_start = i + 1 # then save the index - paren_count += 1 - elif c == ")": - paren_count -= 1 - if paren_count == 0: # if it's the last ')' of a tuple - meaning_term = parts[1][ - tuple_start:tuple_comma - ] # '\\x.( x)' - glue_term = parts[1][tuple_comma + 1 : i] # '(v-r)' - glue_formulas.append( - [meaning_term, glue_term] - ) # add the GlueFormula to the list - elif c == ",": - if ( - paren_count == 1 - ): # if it's a comma separating the parts of the tuple - tuple_comma = i # then save the index - elif c == "#": # skip comments at the ends of lines - if ( - paren_count != 0 - ): # if the line hasn't parsed correctly so far - raise RuntimeError( - "Formula syntax is incorrect for entry " + line - ) - break # break to the next line - - if len(parts) > 2: # if there is a relationship entry at the end - rel_start = parts[2].index("[") + 1 - rel_end = parts[2].index("]") - if rel_start == rel_end: - relationships = frozenset() - else: - relationships = frozenset( - r.strip() for r in parts[2][rel_start:rel_end].split(",") - ) - - try: - start_inheritance = parts[0].index("(") - end_inheritance = parts[0].index(")") - sem = parts[0][:start_inheritance].strip() - supertype = parts[0][start_inheritance + 1 : end_inheritance] - except: - sem = parts[0].strip() - supertype = None - - if sem not in self: - self[sem] = {} - - if ( - relationships is None - ): # if not specified for a specific relationship set - # add all relationship entries for parents - if supertype: - for rels in self[supertype]: - if rels not in self[sem]: - self[sem][rels] = [] - glue = self[supertype][rels] - self[sem][rels].extend(glue) - self[sem][rels].extend( - glue_formulas - ) # add the glue formulas to every rel entry - else: - if None not in self[sem]: - self[sem][None] = [] - self[sem][None].extend( - glue_formulas - ) # add the glue formulas to every rel entry - else: - if relationships not in self[sem]: - self[sem][relationships] = [] - if supertype: - self[sem][relationships].extend(self[supertype][relationships]) - self[sem][relationships].extend( - glue_formulas - ) # add the glue entry to the dictionary - - def __str__(self): - accum = "" - for pos in self: - str_pos = "%s" % pos - for relset in self[pos]: - i = 1 - for gf in self[pos][relset]: - if i == 1: - accum += str_pos + ": " - else: - accum += " " * (len(str_pos) + 2) - accum += "%s" % gf - if relset and i == len(self[pos][relset]): - accum += " : %s" % relset - accum += "\n" - i += 1 - return accum - - def to_glueformula_list(self, depgraph, node=None, counter=None, verbose=False): - if node is None: - # TODO: should it be depgraph.root? Is this code tested? 
- top = depgraph.nodes[0] - depList = list(chain.from_iterable(top["deps"].values())) - root = depgraph.nodes[depList[0]] - - return self.to_glueformula_list(depgraph, root, Counter(), verbose) - - glueformulas = self.lookup(node, depgraph, counter) - for dep_idx in chain.from_iterable(node["deps"].values()): - dep = depgraph.nodes[dep_idx] - glueformulas.extend( - self.to_glueformula_list(depgraph, dep, counter, verbose) - ) - return glueformulas - - def lookup(self, node, depgraph, counter): - semtype_names = self.get_semtypes(node) - - semtype = None - for name in semtype_names: - if name in self: - semtype = self[name] - break - if semtype is None: - # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word) - return [] - - self.add_missing_dependencies(node, depgraph) - - lookup = self._lookup_semtype_option(semtype, node, depgraph) - - if not len(lookup): - raise KeyError( - "There is no GlueDict entry for sem type of '%s' " - "with tag '%s', and rel '%s'" % (node["word"], node["tag"], node["rel"]) - ) - - return self.get_glueformulas_from_semtype_entry( - lookup, node["word"], node, depgraph, counter - ) - - def add_missing_dependencies(self, node, depgraph): - rel = node["rel"].lower() - - if rel == "main": - headnode = depgraph.nodes[node["head"]] - subj = self.lookup_unique("subj", headnode, depgraph) - relation = subj["rel"] - node["deps"].setdefault(relation, []) - node["deps"][relation].append(subj["address"]) - # node['deps'].append(subj['address']) - - def _lookup_semtype_option(self, semtype, node, depgraph): - relationships = frozenset( - depgraph.nodes[dep]["rel"].lower() - for dep in chain.from_iterable(node["deps"].values()) - if depgraph.nodes[dep]["rel"].lower() not in OPTIONAL_RELATIONSHIPS - ) - - try: - lookup = semtype[relationships] - except KeyError: - # An exact match is not found, so find the best match where - # 'best' is defined as the glue entry whose relationship set has the - # most relations of any possible relationship set that is a subset - # of the actual depgraph - best_match = frozenset() - for relset_option in set(semtype) - {None}: - if ( - len(relset_option) > len(best_match) - and relset_option < relationships - ): - best_match = relset_option - if not best_match: - if None in semtype: - best_match = None - else: - return None - lookup = semtype[best_match] - - return lookup - - def get_semtypes(self, node): - """ - Based on the node, return a list of plausible semtypes in order of - plausibility. 
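GlueFormula.applyto, documented earlier in this hunk, combines a meaning term with a linear-logic glue term. A hedged sketch (upstream nltk assumed; printed format approximate):

from nltk.sem.glue import GlueFormula

walks = GlueFormula(r"\x.walks(x)", "(subj -o f)")  # meaning : glue
john = GlueFormula("john", "subj")
print(walks.applyto(john).simplify())               # roughly: walks(john) : f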
- """ - rel = node["rel"].lower() - word = node["word"].lower() - - if rel == "spec": - if word in SPEC_SEMTYPES: - return [SPEC_SEMTYPES[word]] - else: - return [SPEC_SEMTYPES["default"]] - elif rel in ["nmod", "vmod"]: - return [node["tag"], rel] - else: - return [node["tag"]] - - def get_glueformulas_from_semtype_entry( - self, lookup, word, node, depgraph, counter - ): - glueformulas = [] - - glueFormulaFactory = self.get_GlueFormula_factory() - for meaning, glue in lookup: - gf = glueFormulaFactory(self.get_meaning_formula(meaning, word), glue) - if not len(glueformulas): - gf.word = word - else: - gf.word = f"{word}{len(glueformulas) + 1}" - - gf.glue = self.initialize_labels(gf.glue, node, depgraph, counter.get()) - - glueformulas.append(gf) - return glueformulas - - def get_meaning_formula(self, generic, word): - """ - :param generic: A meaning formula string containing the - parameter "" - :param word: The actual word to be replace "" - """ - word = word.replace(".", "") - return generic.replace("", word) - - def initialize_labels(self, expr, node, depgraph, unique_index): - if isinstance(expr, linearlogic.AtomicExpression): - name = self.find_label_name(expr.name.lower(), node, depgraph, unique_index) - if name[0].isupper(): - return linearlogic.VariableExpression(name) - else: - return linearlogic.ConstantExpression(name) - else: - return linearlogic.ImpExpression( - self.initialize_labels(expr.antecedent, node, depgraph, unique_index), - self.initialize_labels(expr.consequent, node, depgraph, unique_index), - ) - - def find_label_name(self, name, node, depgraph, unique_index): - try: - dot = name.index(".") - - before_dot = name[:dot] - after_dot = name[dot + 1 :] - if before_dot == "super": - return self.find_label_name( - after_dot, depgraph.nodes[node["head"]], depgraph, unique_index - ) - else: - return self.find_label_name( - after_dot, - self.lookup_unique(before_dot, node, depgraph), - depgraph, - unique_index, - ) - except ValueError: - lbl = self.get_label(node) - if name == "f": - return lbl - elif name == "v": - return "%sv" % lbl - elif name == "r": - return "%sr" % lbl - elif name == "super": - return self.get_label(depgraph.nodes[node["head"]]) - elif name == "var": - return f"{lbl.upper()}{unique_index}" - elif name == "a": - return self.get_label(self.lookup_unique("conja", node, depgraph)) - elif name == "b": - return self.get_label(self.lookup_unique("conjb", node, depgraph)) - else: - return self.get_label(self.lookup_unique(name, node, depgraph)) - - def get_label(self, node): - """ - Pick an alphabetic character as identifier for an entity in the model. - - :param value: where to index into the list of characters - :type value: int - """ - value = node["address"] - - letter = [ - "f", - "g", - "h", - "i", - "j", - "k", - "l", - "m", - "n", - "o", - "p", - "q", - "r", - "s", - "t", - "u", - "v", - "w", - "x", - "y", - "z", - "a", - "b", - "c", - "d", - "e", - ][value - 1] - num = int(value) // 26 - if num > 0: - return letter + str(num) - else: - return letter - - def lookup_unique(self, rel, node, depgraph): - """ - Lookup 'key'. There should be exactly one item in the associated relation. 
- """ - deps = [ - depgraph.nodes[dep] - for dep in chain.from_iterable(node["deps"].values()) - if depgraph.nodes[dep]["rel"].lower() == rel.lower() - ] - - if len(deps) == 0: - raise KeyError( - "'{}' doesn't contain a feature '{}'".format(node["word"], rel) - ) - elif len(deps) > 1: - raise KeyError( - "'{}' should only have one feature '{}'".format(node["word"], rel) - ) - else: - return deps[0] - - def get_GlueFormula_factory(self): - return GlueFormula - - -class Glue: - def __init__( - self, semtype_file=None, remove_duplicates=False, depparser=None, verbose=False - ): - self.verbose = verbose - self.remove_duplicates = remove_duplicates - self.depparser = depparser - - from nltk import Prover9 - - self.prover = Prover9() - - if semtype_file: - self.semtype_file = semtype_file - else: - self.semtype_file = os.path.join( - "grammars", "sample_grammars", "glue.semtype" - ) - - def train_depparser(self, depgraphs=None): - if depgraphs: - self.depparser.train(depgraphs) - else: - self.depparser.train_from_file( - nltk.data.find( - os.path.join("grammars", "sample_grammars", "glue_train.conll") - ) - ) - - def parse_to_meaning(self, sentence): - readings = [] - for agenda in self.parse_to_compiled(sentence): - readings.extend(self.get_readings(agenda)) - return readings - - def get_readings(self, agenda): - readings = [] - agenda_length = len(agenda) - atomics = dict() - nonatomics = dict() - while agenda: # is not empty - cur = agenda.pop() - glue_simp = cur.glue.simplify() - if isinstance( - glue_simp, linearlogic.ImpExpression - ): # if cur.glue is non-atomic - for key in atomics: - try: - if isinstance(cur.glue, linearlogic.ApplicationExpression): - bindings = cur.glue.bindings - else: - bindings = linearlogic.BindingDict() - glue_simp.antecedent.unify(key, bindings) - for atomic in atomics[key]: - if not ( - cur.indices & atomic.indices - ): # if the sets of indices are disjoint - try: - agenda.append(cur.applyto(atomic)) - except linearlogic.LinearLogicApplicationException: - pass - except linearlogic.UnificationException: - pass - try: - nonatomics[glue_simp.antecedent].append(cur) - except KeyError: - nonatomics[glue_simp.antecedent] = [cur] - - else: # else cur.glue is atomic - for key in nonatomics: - for nonatomic in nonatomics[key]: - try: - if isinstance( - nonatomic.glue, linearlogic.ApplicationExpression - ): - bindings = nonatomic.glue.bindings - else: - bindings = linearlogic.BindingDict() - glue_simp.unify(key, bindings) - if not ( - cur.indices & nonatomic.indices - ): # if the sets of indices are disjoint - try: - agenda.append(nonatomic.applyto(cur)) - except linearlogic.LinearLogicApplicationException: - pass - except linearlogic.UnificationException: - pass - try: - atomics[glue_simp].append(cur) - except KeyError: - atomics[glue_simp] = [cur] - - for entry in atomics: - for gf in atomics[entry]: - if len(gf.indices) == agenda_length: - self._add_to_reading_list(gf, readings) - for entry in nonatomics: - for gf in nonatomics[entry]: - if len(gf.indices) == agenda_length: - self._add_to_reading_list(gf, readings) - return readings - - def _add_to_reading_list(self, glueformula, reading_list): - add_reading = True - if self.remove_duplicates: - for reading in reading_list: - try: - if reading.equiv(glueformula.meaning, self.prover): - add_reading = False - break - except Exception as e: - # if there is an exception, the syntax of the formula - # may not be understandable by the prover, so don't - # throw out the reading. 
- print("Error when checking logical equality of statements", e) - - if add_reading: - reading_list.append(glueformula.meaning) - - def parse_to_compiled(self, sentence): - gfls = [self.depgraph_to_glue(dg) for dg in self.dep_parse(sentence)] - return [self.gfl_to_compiled(gfl) for gfl in gfls] - - def dep_parse(self, sentence): - """ - Return a dependency graph for the sentence. - - :param sentence: the sentence to be parsed - :type sentence: list(str) - :rtype: DependencyGraph - """ - - # Lazy-initialize the depparser - if self.depparser is None: - from nltk.parse import MaltParser - - self.depparser = MaltParser(tagger=self.get_pos_tagger()) - if not self.depparser._trained: - self.train_depparser() - return self.depparser.parse(sentence, verbose=self.verbose) - - def depgraph_to_glue(self, depgraph): - return self.get_glue_dict().to_glueformula_list(depgraph) - - def get_glue_dict(self): - return GlueDict(self.semtype_file) - - def gfl_to_compiled(self, gfl): - index_counter = Counter() - return_list = [] - for gf in gfl: - return_list.extend(gf.compile(index_counter)) - - if self.verbose: - print("Compiled Glue Premises:") - for cgf in return_list: - print(cgf) - - return return_list - - def get_pos_tagger(self): - from nltk.corpus import brown - - regexp_tagger = RegexpTagger( - [ - (r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers - (r"(The|the|A|a|An|an)$", "AT"), # articles - (r".*able$", "JJ"), # adjectives - (r".*ness$", "NN"), # nouns formed from adjectives - (r".*ly$", "RB"), # adverbs - (r".*s$", "NNS"), # plural nouns - (r".*ing$", "VBG"), # gerunds - (r".*ed$", "VBD"), # past tense verbs - (r".*", "NN"), # nouns (default) - ] - ) - brown_train = brown.tagged_sents(categories="news") - unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger) - bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger) - trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger) - - # Override particular words - main_tagger = RegexpTagger( - [(r"(A|a|An|an)$", "ex_quant"), (r"(Every|every|All|all)$", "univ_quant")], - backoff=trigram_tagger, - ) - - return main_tagger - - -class DrtGlueFormula(GlueFormula): - def __init__(self, meaning, glue, indices=None): - if not indices: - indices = set() - - if isinstance(meaning, str): - self.meaning = drt.DrtExpression.fromstring(meaning) - elif isinstance(meaning, drt.DrtExpression): - self.meaning = meaning - else: - raise RuntimeError( - "Meaning term neither string or expression: %s, %s" - % (meaning, meaning.__class__) - ) - - if isinstance(glue, str): - self.glue = linearlogic.LinearLogicParser().parse(glue) - elif isinstance(glue, linearlogic.Expression): - self.glue = glue - else: - raise RuntimeError( - "Glue term neither string or expression: %s, %s" - % (glue, glue.__class__) - ) - - self.indices = indices - - def make_VariableExpression(self, name): - return drt.DrtVariableExpression(name) - - def make_LambdaExpression(self, variable, term): - return drt.DrtLambdaExpression(variable, term) - - -class DrtGlueDict(GlueDict): - def get_GlueFormula_factory(self): - return DrtGlueFormula - - -class DrtGlue(Glue): - def __init__( - self, semtype_file=None, remove_duplicates=False, depparser=None, verbose=False - ): - if not semtype_file: - semtype_file = os.path.join( - "grammars", "sample_grammars", "drt_glue.semtype" - ) - Glue.__init__(self, semtype_file, remove_duplicates, depparser, verbose) - - def get_glue_dict(self): - return DrtGlueDict(self.semtype_file) - - -def demo(show_example=-1): - from nltk.parse 
import MaltParser - - examples = [ - "David sees Mary", - "David eats a sandwich", - "every man chases a dog", - "every man believes a dog sleeps", - "John gives David a sandwich", - "John chases himself", - ] - # 'John persuades David to order a pizza', - # 'John tries to go', - # 'John tries to find a unicorn', - # 'John seems to vanish', - # 'a unicorn seems to approach', - # 'every big cat leaves', - # 'every gray cat leaves', - # 'every big gray cat leaves', - # 'a former senator leaves', - - print("============== DEMO ==============") - - tagger = RegexpTagger( - [ - ("^(David|Mary|John)$", "NNP"), - ( - "^(sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$", - "VB", - ), - ("^(go|order|vanish|find|approach)$", "VB"), - ("^(a)$", "ex_quant"), - ("^(every)$", "univ_quant"), - ("^(sandwich|man|dog|pizza|unicorn|cat|senator)$", "NN"), - ("^(big|gray|former)$", "JJ"), - ("^(him|himself)$", "PRP"), - ] - ) - - depparser = MaltParser(tagger=tagger) - glue = Glue(depparser=depparser, verbose=False) - - for (i, sentence) in enumerate(examples): - if i == show_example or show_example == -1: - print(f"[[[Example {i}]]] {sentence}") - for reading in glue.parse_to_meaning(sentence.split()): - print(reading.simplify()) - print("") - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/sem/hole.py b/pipeline/nltk/sem/hole.py deleted file mode 100644 index 4570cb02a3bf183a73a1f9b5e78b8f0e1dac430f..0000000000000000000000000000000000000000 --- a/pipeline/nltk/sem/hole.py +++ /dev/null @@ -1,395 +0,0 @@ -# Natural Language Toolkit: Logic -# -# Author: Peter Wang -# Updated by: Dan Garrette -# -# Copyright (C) 2001-2023 NLTK Project -# URL: -# For license information, see LICENSE.TXT - -""" -An implementation of the Hole Semantics model, following Blackburn and Bos, -Representation and Inference for Natural Language (CSLI, 2005). - -The semantic representations are built by the grammar hole.fcfg. -This module contains driver code to read in sentences and parse them -according to a hole semantics grammar. - -After parsing, the semantic representation is in the form of an underspecified -representation that is not easy to read. We use a "plugging" algorithm to -convert that representation into first-order logic formulas. -""" - -from functools import reduce - -from nltk.parse import load_parser -from nltk.sem.logic import ( - AllExpression, - AndExpression, - ApplicationExpression, - ExistsExpression, - IffExpression, - ImpExpression, - LambdaExpression, - NegatedExpression, - OrExpression, -) -from nltk.sem.skolemize import skolemize - -# Note that in this code there may be multiple types of trees being referred to: -# -# 1. parse trees -# 2. the underspecified representation -# 3. first-order logic formula trees -# 4. the search space when plugging (search tree) -# - - -class Constants: - ALL = "ALL" - EXISTS = "EXISTS" - NOT = "NOT" - AND = "AND" - OR = "OR" - IMP = "IMP" - IFF = "IFF" - PRED = "PRED" - LEQ = "LEQ" - HOLE = "HOLE" - LABEL = "LABEL" - - MAP = { - ALL: lambda v, e: AllExpression(v.variable, e), - EXISTS: lambda v, e: ExistsExpression(v.variable, e), - NOT: NegatedExpression, - AND: AndExpression, - OR: OrExpression, - IMP: ImpExpression, - IFF: IffExpression, - PRED: ApplicationExpression, - } - - -class HoleSemantics: - """ - This class holds the broken-down components of a hole semantics, i.e. it - extracts the holes, labels, logic formula fragments and constraints out of - a big conjunction of such as produced by the hole semantics grammar. 
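The driver defined further down in this hunk, hole_readings, wraps the whole pipeline described in the module docstring above: parse with hole.fcfg, skolemize, split the USR into holes, labels and constraints, enumerate the pluggings, and rebuild first-order formulas. A hedged sketch, assuming upstream nltk plus the sample grammars shipped with nltk_data:

from nltk.sem.hole import hole_readings

# One first-order formula per legal plugging of the underspecified representation.
for reading in hole_readings("every girl chases a dog"):
    print(reading)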
It - then provides some operations on the semantics dealing with holes, labels - and finding legal ways to plug holes with labels. - """ - - def __init__(self, usr): - """ - Constructor. `usr' is a ``sem.Expression`` representing an - Underspecified Representation Structure (USR). A USR has the following - special predicates: - ALL(l,v,n), - EXISTS(l,v,n), - AND(l,n,n), - OR(l,n,n), - IMP(l,n,n), - IFF(l,n,n), - PRED(l,v,n,v[,v]*) where the brackets and star indicate zero or more repetitions, - LEQ(n,n), - HOLE(n), - LABEL(n) - where l is the label of the node described by the predicate, n is either - a label or a hole, and v is a variable. - """ - self.holes = set() - self.labels = set() - self.fragments = {} # mapping of label -> formula fragment - self.constraints = set() # set of Constraints - self._break_down(usr) - self.top_most_labels = self._find_top_most_labels() - self.top_hole = self._find_top_hole() - - def is_node(self, x): - """ - Return true if x is a node (label or hole) in this semantic - representation. - """ - return x in (self.labels | self.holes) - - def _break_down(self, usr): - """ - Extract holes, labels, formula fragments and constraints from the hole - semantics underspecified representation (USR). - """ - if isinstance(usr, AndExpression): - self._break_down(usr.first) - self._break_down(usr.second) - elif isinstance(usr, ApplicationExpression): - func, args = usr.uncurry() - if func.variable.name == Constants.LEQ: - self.constraints.add(Constraint(args[0], args[1])) - elif func.variable.name == Constants.HOLE: - self.holes.add(args[0]) - elif func.variable.name == Constants.LABEL: - self.labels.add(args[0]) - else: - label = args[0] - assert label not in self.fragments - self.fragments[label] = (func, args[1:]) - else: - raise ValueError(usr.label()) - - def _find_top_nodes(self, node_list): - top_nodes = node_list.copy() - for f in self.fragments.values(): - # the label is the first argument of the predicate - args = f[1] - for arg in args: - if arg in node_list: - top_nodes.discard(arg) - return top_nodes - - def _find_top_most_labels(self): - """ - Return the set of labels which are not referenced directly as part of - another formula fragment. These will be the top-most labels for the - subtree that they are part of. - """ - return self._find_top_nodes(self.labels) - - def _find_top_hole(self): - """ - Return the hole that will be the top of the formula tree. - """ - top_holes = self._find_top_nodes(self.holes) - assert len(top_holes) == 1 # it must be unique - return top_holes.pop() - - def pluggings(self): - """ - Calculate and return all the legal pluggings (mappings of labels to - holes) of this semantics given the constraints. - """ - record = [] - self._plug_nodes([(self.top_hole, [])], self.top_most_labels, {}, record) - return record - - def _plug_nodes(self, queue, potential_labels, plug_acc, record): - """ - Plug the nodes in `queue' with the labels in `potential_labels'. - - Each element of `queue' is a tuple of the node to plug and the list of - ancestor holes from the root of the graph to that node. - - `potential_labels' is a set of the labels which are still available for - plugging. - - `plug_acc' is the incomplete mapping of holes to labels made on the - current branch of the search tree so far. - - `record' is a list of all the complete pluggings that we have found in - total so far. It is the only parameter that is destructively updated. 
- """ - if queue != []: - (node, ancestors) = queue[0] - if node in self.holes: - # The node is a hole, try to plug it. - self._plug_hole( - node, ancestors, queue[1:], potential_labels, plug_acc, record - ) - else: - assert node in self.labels - # The node is a label. Replace it in the queue by the holes and - # labels in the formula fragment named by that label. - args = self.fragments[node][1] - head = [(a, ancestors) for a in args if self.is_node(a)] - self._plug_nodes(head + queue[1:], potential_labels, plug_acc, record) - else: - raise Exception("queue empty") - - def _plug_hole(self, hole, ancestors0, queue, potential_labels0, plug_acc0, record): - """ - Try all possible ways of plugging a single hole. - See _plug_nodes for the meanings of the parameters. - """ - # Add the current hole we're trying to plug into the list of ancestors. - assert hole not in ancestors0 - ancestors = [hole] + ancestors0 - - # Try each potential label in this hole in turn. - for l in potential_labels0: - # Is the label valid in this hole? - if self._violates_constraints(l, ancestors): - continue - - plug_acc = plug_acc0.copy() - plug_acc[hole] = l - potential_labels = potential_labels0.copy() - potential_labels.remove(l) - - if len(potential_labels) == 0: - # No more potential labels. That must mean all the holes have - # been filled so we have found a legal plugging so remember it. - # - # Note that the queue might not be empty because there might - # be labels on there that point to formula fragments with - # no holes in them. _sanity_check_plugging will make sure - # all holes are filled. - self._sanity_check_plugging(plug_acc, self.top_hole, []) - record.append(plug_acc) - else: - # Recursively try to fill in the rest of the holes in the - # queue. The label we just plugged into the hole could have - # holes of its own so at the end of the queue. Putting it on - # the end of the queue gives us a breadth-first search, so that - # all the holes at level i of the formula tree are filled - # before filling level i+1. - # A depth-first search would work as well since the trees must - # be finite but the bookkeeping would be harder. - self._plug_nodes( - queue + [(l, ancestors)], potential_labels, plug_acc, record - ) - - def _violates_constraints(self, label, ancestors): - """ - Return True if the `label' cannot be placed underneath the holes given - by the set `ancestors' because it would violate the constraints imposed - on it. - """ - for c in self.constraints: - if c.lhs == label: - if c.rhs not in ancestors: - return True - return False - - def _sanity_check_plugging(self, plugging, node, ancestors): - """ - Make sure that a given plugging is legal. We recursively go through - each node and make sure that no constraints are violated. - We also check that all holes have been filled. - """ - if node in self.holes: - ancestors = [node] + ancestors - label = plugging[node] - else: - label = node - assert label in self.labels - for c in self.constraints: - if c.lhs == label: - assert c.rhs in ancestors - args = self.fragments[label][1] - for arg in args: - if self.is_node(arg): - self._sanity_check_plugging(plugging, arg, [label] + ancestors) - - def formula_tree(self, plugging): - """ - Return the first-order logic formula tree for this underspecified - representation using the plugging given. 
- """ - return self._formula_tree(plugging, self.top_hole) - - def _formula_tree(self, plugging, node): - if node in plugging: - return self._formula_tree(plugging, plugging[node]) - elif node in self.fragments: - pred, args = self.fragments[node] - children = [self._formula_tree(plugging, arg) for arg in args] - return reduce(Constants.MAP[pred.variable.name], children) - else: - return node - - -class Constraint: - """ - This class represents a constraint of the form (L =< N), - where L is a label and N is a node (a label or a hole). - """ - - def __init__(self, lhs, rhs): - self.lhs = lhs - self.rhs = rhs - - def __eq__(self, other): - if self.__class__ == other.__class__: - return self.lhs == other.lhs and self.rhs == other.rhs - else: - return False - - def __ne__(self, other): - return not (self == other) - - def __hash__(self): - return hash(repr(self)) - - def __repr__(self): - return f"({self.lhs} < {self.rhs})" - - -def hole_readings(sentence, grammar_filename=None, verbose=False): - if not grammar_filename: - grammar_filename = "grammars/sample_grammars/hole.fcfg" - - if verbose: - print("Reading grammar file", grammar_filename) - - parser = load_parser(grammar_filename) - - # Parse the sentence. - tokens = sentence.split() - trees = list(parser.parse(tokens)) - if verbose: - print("Got %d different parses" % len(trees)) - - all_readings = [] - for tree in trees: - # Get the semantic feature from the top of the parse tree. - sem = tree.label()["SEM"].simplify() - - # Print the raw semantic representation. - if verbose: - print("Raw: ", sem) - - # Skolemize away all quantifiers. All variables become unique. - while isinstance(sem, LambdaExpression): - sem = sem.term - skolemized = skolemize(sem) - - if verbose: - print("Skolemized:", skolemized) - - # Break the hole semantics representation down into its components - # i.e. holes, labels, formula fragments and constraints. - hole_sem = HoleSemantics(skolemized) - - # Maybe show the details of the semantic representation. - if verbose: - print("Holes: ", hole_sem.holes) - print("Labels: ", hole_sem.labels) - print("Constraints: ", hole_sem.constraints) - print("Top hole: ", hole_sem.top_hole) - print("Top labels: ", hole_sem.top_most_labels) - print("Fragments:") - for l, f in hole_sem.fragments.items(): - print(f"\t{l}: {f}") - - # Find all the possible ways to plug the formulas together. - pluggings = hole_sem.pluggings() - - # Build FOL formula trees using the pluggings. - readings = list(map(hole_sem.formula_tree, pluggings)) - - # Print out the formulas in a textual format. - if verbose: - for i, r in enumerate(readings): - print() - print("%d. %s" % (i, r)) - print() - - all_readings.extend(readings) - - return all_readings - - -if __name__ == "__main__": - for r in hole_readings("a dog barks"): - print(r) - print() - for r in hole_readings("every girl chases a dog"): - print(r) diff --git a/pipeline/nltk/sem/lfg.py b/pipeline/nltk/sem/lfg.py deleted file mode 100644 index 13473b0087940c9b42cc4c36d5f442bb0f78eafe..0000000000000000000000000000000000000000 --- a/pipeline/nltk/sem/lfg.py +++ /dev/null @@ -1,261 +0,0 @@ -# Natural Language Toolkit: Lexical Functional Grammar -# -# Author: Dan Garrette -# -# Copyright (C) 2001-2023 NLTK Project -# URL: -# For license information, see LICENSE.TXT - -from itertools import chain - -from nltk.internals import Counter - - -class FStructure(dict): - def safeappend(self, key, item): - """ - Append 'item' to the list at 'key'. If no list exists for 'key', then - construct one. 
- """ - if key not in self: - self[key] = [] - self[key].append(item) - - def __setitem__(self, key, value): - dict.__setitem__(self, key.lower(), value) - - def __getitem__(self, key): - return dict.__getitem__(self, key.lower()) - - def __contains__(self, key): - return dict.__contains__(self, key.lower()) - - def to_glueformula_list(self, glue_dict): - depgraph = self.to_depgraph() - return glue_dict.to_glueformula_list(depgraph) - - def to_depgraph(self, rel=None): - from nltk.parse.dependencygraph import DependencyGraph - - depgraph = DependencyGraph() - nodes = depgraph.nodes - - self._to_depgraph(nodes, 0, "ROOT") - - # Add all the dependencies for all the nodes - for address, node in nodes.items(): - for n2 in (n for n in nodes.values() if n["rel"] != "TOP"): - if n2["head"] == address: - relation = n2["rel"] - node["deps"].setdefault(relation, []) - node["deps"][relation].append(n2["address"]) - - depgraph.root = nodes[1] - - return depgraph - - def _to_depgraph(self, nodes, head, rel): - index = len(nodes) - - nodes[index].update( - { - "address": index, - "word": self.pred[0], - "tag": self.pred[1], - "head": head, - "rel": rel, - } - ) - - for feature in sorted(self): - for item in sorted(self[feature]): - if isinstance(item, FStructure): - item._to_depgraph(nodes, index, feature) - elif isinstance(item, tuple): - new_index = len(nodes) - nodes[new_index].update( - { - "address": new_index, - "word": item[0], - "tag": item[1], - "head": index, - "rel": feature, - } - ) - elif isinstance(item, list): - for n in item: - n._to_depgraph(nodes, index, feature) - else: - raise Exception( - "feature %s is not an FStruct, a list, or a tuple" % feature - ) - - @staticmethod - def read_depgraph(depgraph): - return FStructure._read_depgraph(depgraph.root, depgraph) - - @staticmethod - def _read_depgraph(node, depgraph, label_counter=None, parent=None): - if not label_counter: - label_counter = Counter() - - if node["rel"].lower() in ["spec", "punct"]: - # the value of a 'spec' entry is a word, not an FStructure - return (node["word"], node["tag"]) - - else: - fstruct = FStructure() - fstruct.pred = None - fstruct.label = FStructure._make_label(label_counter.get()) - - fstruct.parent = parent - - word, tag = node["word"], node["tag"] - if tag[:2] == "VB": - if tag[2:3] == "D": - fstruct.safeappend("tense", ("PAST", "tense")) - fstruct.pred = (word, tag[:2]) - - if not fstruct.pred: - fstruct.pred = (word, tag) - - children = [ - depgraph.nodes[idx] - for idx in chain.from_iterable(node["deps"].values()) - ] - for child in children: - fstruct.safeappend( - child["rel"], - FStructure._read_depgraph(child, depgraph, label_counter, fstruct), - ) - - return fstruct - - @staticmethod - def _make_label(value): - """ - Pick an alphabetic character as identifier for an entity in the model. 
- - :param value: where to index into the list of characters - :type value: int - """ - letter = [ - "f", - "g", - "h", - "i", - "j", - "k", - "l", - "m", - "n", - "o", - "p", - "q", - "r", - "s", - "t", - "u", - "v", - "w", - "x", - "y", - "z", - "a", - "b", - "c", - "d", - "e", - ][value - 1] - num = int(value) // 26 - if num > 0: - return letter + str(num) - else: - return letter - - def __repr__(self): - return self.__str__().replace("\n", "") - - def __str__(self): - return self.pretty_format() - - def pretty_format(self, indent=3): - try: - accum = "%s:[" % self.label - except NameError: - accum = "[" - try: - accum += "pred '%s'" % (self.pred[0]) - except NameError: - pass - - for feature in sorted(self): - for item in self[feature]: - if isinstance(item, FStructure): - next_indent = indent + len(feature) + 3 + len(self.label) - accum += "\n{}{} {}".format( - " " * (indent), - feature, - item.pretty_format(next_indent), - ) - elif isinstance(item, tuple): - accum += "\n{}{} '{}'".format(" " * (indent), feature, item[0]) - elif isinstance(item, list): - accum += "\n{}{} {{{}}}".format( - " " * (indent), - feature, - ("\n%s" % (" " * (indent + len(feature) + 2))).join(item), - ) - else: # ERROR - raise Exception( - "feature %s is not an FStruct, a list, or a tuple" % feature - ) - return accum + "]" - - -def demo_read_depgraph(): - from nltk.parse.dependencygraph import DependencyGraph - - dg1 = DependencyGraph( - """\ -Esso NNP 2 SUB -said VBD 0 ROOT -the DT 5 NMOD -Whiting NNP 5 NMOD -field NN 6 SUB -started VBD 2 VMOD -production NN 6 OBJ -Tuesday NNP 6 VMOD -""" - ) - dg2 = DependencyGraph( - """\ -John NNP 2 SUB -sees VBP 0 ROOT -Mary NNP 2 OBJ -""" - ) - dg3 = DependencyGraph( - """\ -a DT 2 SPEC -man NN 3 SUBJ -walks VB 0 ROOT -""" - ) - dg4 = DependencyGraph( - """\ -every DT 2 SPEC -girl NN 3 SUBJ -chases VB 0 ROOT -a DT 5 SPEC -dog NN 3 OBJ -""" - ) - - depgraphs = [dg1, dg2, dg3, dg4] - for dg in depgraphs: - print(FStructure.read_depgraph(dg)) - - -if __name__ == "__main__": - demo_read_depgraph() diff --git a/pipeline/nltk/sem/linearlogic.py b/pipeline/nltk/sem/linearlogic.py deleted file mode 100644 index 474f835e2f1bbe19fe2486e259bea2d08fa473b1..0000000000000000000000000000000000000000 --- a/pipeline/nltk/sem/linearlogic.py +++ /dev/null @@ -1,482 +0,0 @@ -# Natural Language Toolkit: Linear Logic -# -# Author: Dan Garrette -# -# Copyright (C) 2001-2023 NLTK Project -# URL: -# For license information, see LICENSE.TXT - -from nltk.internals import Counter -from nltk.sem.logic import APP, LogicParser - -_counter = Counter() - - -class Tokens: - # Punctuation - OPEN = "(" - CLOSE = ")" - - # Operations - IMP = "-o" - - PUNCT = [OPEN, CLOSE] - TOKENS = PUNCT + [IMP] - - -class LinearLogicParser(LogicParser): - """A linear logic expression parser.""" - - def __init__(self): - LogicParser.__init__(self) - - self.operator_precedence = {APP: 1, Tokens.IMP: 2, None: 3} - self.right_associated_operations += [Tokens.IMP] - - def get_all_symbols(self): - return Tokens.TOKENS - - def handle(self, tok, context): - if tok not in Tokens.TOKENS: - return self.handle_variable(tok, context) - elif tok == Tokens.OPEN: - return self.handle_open(tok, context) - - def get_BooleanExpression_factory(self, tok): - if tok == Tokens.IMP: - return ImpExpression - else: - return None - - def make_BooleanExpression(self, factory, first, second): - return factory(first, second) - - def attempt_ApplicationExpression(self, expression, context): - """Attempt to make an application expression. 
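The LinearLogicParser in this hunk handles atomic terms plus the single -o connective; per make_VariableExpression below, upper-case names parse as variables and lower-case names as constants. A hedged sketch (upstream nltk assumed):

from nltk.sem.linearlogic import LinearLogicParser

llp = LinearLogicParser()
impl = llp.parse("(subj -o f)")            # an ImpExpression over two constants
print(impl.antecedent, impl.consequent)    # subj f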
If the next tokens - are an argument in parens, then the argument expression is a - function being applied to the arguments. Otherwise, return the - argument expression.""" - if self.has_priority(APP, context): - if self.inRange(0) and self.token(0) == Tokens.OPEN: - self.token() # swallow then open paren - argument = self.process_next_expression(APP) - self.assertNextToken(Tokens.CLOSE) - expression = ApplicationExpression(expression, argument, None) - return expression - - def make_VariableExpression(self, name): - if name[0].isupper(): - return VariableExpression(name) - else: - return ConstantExpression(name) - - -class Expression: - - _linear_logic_parser = LinearLogicParser() - - @classmethod - def fromstring(cls, s): - return cls._linear_logic_parser.parse(s) - - def applyto(self, other, other_indices=None): - return ApplicationExpression(self, other, other_indices) - - def __call__(self, other): - return self.applyto(other) - - def __repr__(self): - return f"<{self.__class__.__name__} {self}>" - - -class AtomicExpression(Expression): - def __init__(self, name, dependencies=None): - """ - :param name: str for the constant name - :param dependencies: list of int for the indices on which this atom is dependent - """ - assert isinstance(name, str) - self.name = name - - if not dependencies: - dependencies = [] - self.dependencies = dependencies - - def simplify(self, bindings=None): - """ - If 'self' is bound by 'bindings', return the atomic to which it is bound. - Otherwise, return self. - - :param bindings: ``BindingDict`` A dictionary of bindings used to simplify - :return: ``AtomicExpression`` - """ - if bindings and self in bindings: - return bindings[self] - else: - return self - - def compile_pos(self, index_counter, glueFormulaFactory): - """ - From Iddo Lev's PhD Dissertation p108-109 - - :param index_counter: ``Counter`` for unique indices - :param glueFormulaFactory: ``GlueFormula`` for creating new glue formulas - :return: (``Expression``,set) for the compiled linear logic and any newly created glue formulas - """ - self.dependencies = [] - return (self, []) - - def compile_neg(self, index_counter, glueFormulaFactory): - """ - From Iddo Lev's PhD Dissertation p108-109 - - :param index_counter: ``Counter`` for unique indices - :param glueFormulaFactory: ``GlueFormula`` for creating new glue formulas - :return: (``Expression``,set) for the compiled linear logic and any newly created glue formulas - """ - self.dependencies = [] - return (self, []) - - def initialize_labels(self, fstruct): - self.name = fstruct.initialize_label(self.name.lower()) - - def __eq__(self, other): - return self.__class__ == other.__class__ and self.name == other.name - - def __ne__(self, other): - return not self == other - - def __str__(self): - accum = self.name - if self.dependencies: - accum += "%s" % self.dependencies - return accum - - def __hash__(self): - return hash(self.name) - - -class ConstantExpression(AtomicExpression): - def unify(self, other, bindings): - """ - If 'other' is a constant, then it must be equal to 'self'. If 'other' is a variable, - then it must not be bound to anything other than 'self'. 
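Constant and variable atoms unify against a BindingDict as the docstring above describes: a constant unifies only with itself or with a compatible variable. A hedged sketch (upstream nltk assumed):

from nltk.sem.linearlogic import BindingDict, ConstantExpression, UnificationException

g = ConstantExpression("g")
f = ConstantExpression("f")
g.unify(g, BindingDict())          # identical constants unify; bindings are unchanged
try:
    g.unify(f, BindingDict())      # distinct constants never unify
except UnificationException:
    print("g and f do not unify")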
- - :param other: ``Expression`` - :param bindings: ``BindingDict`` A dictionary of all current bindings - :return: ``BindingDict`` A new combined dictionary of of 'bindings' and any new binding - :raise UnificationException: If 'self' and 'other' cannot be unified in the context of 'bindings' - """ - assert isinstance(other, Expression) - if isinstance(other, VariableExpression): - try: - return bindings + BindingDict([(other, self)]) - except VariableBindingException: - pass - elif self == other: - return bindings - raise UnificationException(self, other, bindings) - - -class VariableExpression(AtomicExpression): - def unify(self, other, bindings): - """ - 'self' must not be bound to anything other than 'other'. - - :param other: ``Expression`` - :param bindings: ``BindingDict`` A dictionary of all current bindings - :return: ``BindingDict`` A new combined dictionary of of 'bindings' and the new binding - :raise UnificationException: If 'self' and 'other' cannot be unified in the context of 'bindings' - """ - assert isinstance(other, Expression) - try: - if self == other: - return bindings - else: - return bindings + BindingDict([(self, other)]) - except VariableBindingException as e: - raise UnificationException(self, other, bindings) from e - - -class ImpExpression(Expression): - def __init__(self, antecedent, consequent): - """ - :param antecedent: ``Expression`` for the antecedent - :param consequent: ``Expression`` for the consequent - """ - assert isinstance(antecedent, Expression) - assert isinstance(consequent, Expression) - self.antecedent = antecedent - self.consequent = consequent - - def simplify(self, bindings=None): - return self.__class__( - self.antecedent.simplify(bindings), self.consequent.simplify(bindings) - ) - - def unify(self, other, bindings): - """ - Both the antecedent and consequent of 'self' and 'other' must unify. 
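A short sketch of how unify() behaves on the atomic classes above, assuming nltk.sem.linearlogic is importable: a VariableExpression binds to whatever it is unified with, while a ConstantExpression only unifies with itself or with an unbound variable.

from nltk.sem.linearlogic import Expression, BindingDict, UnificationException

g = Expression.fromstring("g")
G = Expression.fromstring("G")
print(G.unify(g, BindingDict()))        # {G: g}
try:
    Expression.fromstring("h").unify(g, BindingDict())
except UnificationException as e:
    print(e)                            # Cannot unify h with g given {}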
- - :param other: ``ImpExpression`` - :param bindings: ``BindingDict`` A dictionary of all current bindings - :return: ``BindingDict`` A new combined dictionary of of 'bindings' and any new bindings - :raise UnificationException: If 'self' and 'other' cannot be unified in the context of 'bindings' - """ - assert isinstance(other, ImpExpression) - try: - return ( - bindings - + self.antecedent.unify(other.antecedent, bindings) - + self.consequent.unify(other.consequent, bindings) - ) - except VariableBindingException as e: - raise UnificationException(self, other, bindings) from e - - def compile_pos(self, index_counter, glueFormulaFactory): - """ - From Iddo Lev's PhD Dissertation p108-109 - - :param index_counter: ``Counter`` for unique indices - :param glueFormulaFactory: ``GlueFormula`` for creating new glue formulas - :return: (``Expression``,set) for the compiled linear logic and any newly created glue formulas - """ - (a, a_new) = self.antecedent.compile_neg(index_counter, glueFormulaFactory) - (c, c_new) = self.consequent.compile_pos(index_counter, glueFormulaFactory) - return (ImpExpression(a, c), a_new + c_new) - - def compile_neg(self, index_counter, glueFormulaFactory): - """ - From Iddo Lev's PhD Dissertation p108-109 - - :param index_counter: ``Counter`` for unique indices - :param glueFormulaFactory: ``GlueFormula`` for creating new glue formulas - :return: (``Expression``,list of ``GlueFormula``) for the compiled linear logic and any newly created glue formulas - """ - (a, a_new) = self.antecedent.compile_pos(index_counter, glueFormulaFactory) - (c, c_new) = self.consequent.compile_neg(index_counter, glueFormulaFactory) - fresh_index = index_counter.get() - c.dependencies.append(fresh_index) - new_v = glueFormulaFactory("v%s" % fresh_index, a, {fresh_index}) - return (c, a_new + c_new + [new_v]) - - def initialize_labels(self, fstruct): - self.antecedent.initialize_labels(fstruct) - self.consequent.initialize_labels(fstruct) - - def __eq__(self, other): - return ( - self.__class__ == other.__class__ - and self.antecedent == other.antecedent - and self.consequent == other.consequent - ) - - def __ne__(self, other): - return not self == other - - def __str__(self): - return "{}{} {} {}{}".format( - Tokens.OPEN, - self.antecedent, - Tokens.IMP, - self.consequent, - Tokens.CLOSE, - ) - - def __hash__(self): - return hash(f"{hash(self.antecedent)}{Tokens.IMP}{hash(self.consequent)}") - - -class ApplicationExpression(Expression): - def __init__(self, function, argument, argument_indices=None): - """ - :param function: ``Expression`` for the function - :param argument: ``Expression`` for the argument - :param argument_indices: set for the indices of the glue formula from which the argument came - :raise LinearLogicApplicationException: If 'function' cannot be applied to 'argument' given 'argument_indices'. - """ - function_simp = function.simplify() - argument_simp = argument.simplify() - - assert isinstance(function_simp, ImpExpression) - assert isinstance(argument_simp, Expression) - - bindings = BindingDict() - - try: - if isinstance(function, ApplicationExpression): - bindings += function.bindings - if isinstance(argument, ApplicationExpression): - bindings += argument.bindings - bindings += function_simp.antecedent.unify(argument_simp, bindings) - except UnificationException as e: - raise LinearLogicApplicationException( - f"Cannot apply {function_simp} to {argument_simp}. 
{e}" - ) from e - - # If you are running it on complied premises, more conditions apply - if argument_indices: - # A.dependencies of (A -o (B -o C)) must be a proper subset of argument_indices - if not set(function_simp.antecedent.dependencies) < argument_indices: - raise LinearLogicApplicationException( - "Dependencies unfulfilled when attempting to apply Linear Logic formula %s to %s" - % (function_simp, argument_simp) - ) - if set(function_simp.antecedent.dependencies) == argument_indices: - raise LinearLogicApplicationException( - "Dependencies not a proper subset of indices when attempting to apply Linear Logic formula %s to %s" - % (function_simp, argument_simp) - ) - - self.function = function - self.argument = argument - self.bindings = bindings - - def simplify(self, bindings=None): - """ - Since function is an implication, return its consequent. There should be - no need to check that the application is valid since the checking is done - by the constructor. - - :param bindings: ``BindingDict`` A dictionary of bindings used to simplify - :return: ``Expression`` - """ - if not bindings: - bindings = self.bindings - - return self.function.simplify(bindings).consequent - - def __eq__(self, other): - return ( - self.__class__ == other.__class__ - and self.function == other.function - and self.argument == other.argument - ) - - def __ne__(self, other): - return not self == other - - def __str__(self): - return "%s" % self.function + Tokens.OPEN + "%s" % self.argument + Tokens.CLOSE - - def __hash__(self): - return hash(f"{hash(self.antecedent)}{Tokens.OPEN}{hash(self.consequent)}") - - -class BindingDict: - def __init__(self, bindings=None): - """ - :param bindings: - list [(``VariableExpression``, ``AtomicExpression``)] to initialize the dictionary - dict {``VariableExpression``: ``AtomicExpression``} to initialize the dictionary - """ - self.d = {} - - if isinstance(bindings, dict): - bindings = bindings.items() - - if bindings: - for (v, b) in bindings: - self[v] = b - - def __setitem__(self, variable, binding): - """ - A binding is consistent with the dict if its variable is not already bound, OR if its - variable is already bound to its argument. 
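A sketch of the consistency rule documented above for BindingDict.__setitem__, assuming nltk.sem.linearlogic is importable: rebinding a variable to the same value is allowed, rebinding it to a different value raises.

from nltk.sem.linearlogic import (BindingDict, ConstantExpression,
                                  VariableExpression, VariableBindingException)

d = BindingDict()
d[VariableExpression("G")] = ConstantExpression("g")
d[VariableExpression("G")] = ConstantExpression("g")       # consistent: allowed
try:
    d[VariableExpression("G")] = ConstantExpression("h")   # contradicts the first binding
except VariableBindingException as e:
    print(e)                            # Variable G already bound to another value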
- - :param variable: ``VariableExpression`` The variable bind - :param binding: ``Expression`` The expression to which 'variable' should be bound - :raise VariableBindingException: If the variable cannot be bound in this dictionary - """ - assert isinstance(variable, VariableExpression) - assert isinstance(binding, Expression) - - assert variable != binding - - existing = self.d.get(variable, None) - - if not existing or binding == existing: - self.d[variable] = binding - else: - raise VariableBindingException( - "Variable %s already bound to another value" % (variable) - ) - - def __getitem__(self, variable): - """ - Return the expression to which 'variable' is bound - """ - assert isinstance(variable, VariableExpression) - - intermediate = self.d[variable] - while intermediate: - try: - intermediate = self.d[intermediate] - except KeyError: - return intermediate - - def __contains__(self, item): - return item in self.d - - def __add__(self, other): - """ - :param other: ``BindingDict`` The dict with which to combine self - :return: ``BindingDict`` A new dict containing all the elements of both parameters - :raise VariableBindingException: If the parameter dictionaries are not consistent with each other - """ - try: - combined = BindingDict() - for v in self.d: - combined[v] = self.d[v] - for v in other.d: - combined[v] = other.d[v] - return combined - except VariableBindingException as e: - raise VariableBindingException( - "Attempting to add two contradicting" - " VariableBindingsLists: %s, %s" % (self, other) - ) from e - - def __ne__(self, other): - return not self == other - - def __eq__(self, other): - if not isinstance(other, BindingDict): - raise TypeError - return self.d == other.d - - def __str__(self): - return "{" + ", ".join(f"{v}: {self.d[v]}" for v in sorted(self.d.keys())) + "}" - - def __repr__(self): - return "BindingDict: %s" % self - - -class VariableBindingException(Exception): - pass - - -class UnificationException(Exception): - def __init__(self, a, b, bindings): - Exception.__init__(self, f"Cannot unify {a} with {b} given {bindings}") - - -class LinearLogicApplicationException(Exception): - pass - - -def demo(): - lexpr = Expression.fromstring - - print(lexpr(r"f")) - print(lexpr(r"(g -o f)")) - print(lexpr(r"((g -o G) -o G)")) - print(lexpr(r"g -o h -o f")) - print(lexpr(r"(g -o f)(g)").simplify()) - print(lexpr(r"(H -o f)(g)").simplify()) - print(lexpr(r"((g -o G) -o G)((g -o f))").simplify()) - print(lexpr(r"(H -o H)((g -o f))").simplify()) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/sem/logic.py b/pipeline/nltk/sem/logic.py deleted file mode 100644 index aed3a118760b0a9111fc0445df870231f943e1e3..0000000000000000000000000000000000000000 --- a/pipeline/nltk/sem/logic.py +++ /dev/null @@ -1,2065 +0,0 @@ -# Natural Language Toolkit: Logic -# -# Author: Dan Garrette -# -# Copyright (C) 2001-2023 NLTK Project -# URL: -# For license information, see LICENSE.TXT - -""" -A version of first order predicate logic, built on -top of the typed lambda calculus. -""" - -import operator -import re -from collections import defaultdict -from functools import reduce, total_ordering - -from nltk.internals import Counter -from nltk.util import Trie - -APP = "APP" - -_counter = Counter() - - -class Tokens: - LAMBDA = "\\" - LAMBDA_LIST = ["\\"] - - # Quantifiers - EXISTS = "exists" - EXISTS_LIST = ["some", "exists", "exist"] - ALL = "all" - ALL_LIST = ["all", "forall"] - IOTA = "iota" - IOTA_LIST = ["iota"] - - # Punctuation - DOT = "." 
- OPEN = "(" - CLOSE = ")" - COMMA = "," - - # Operations - NOT = "-" - NOT_LIST = ["not", "-", "!"] - AND = "&" - AND_LIST = ["and", "&", "^"] - OR = "|" - OR_LIST = ["or", "|"] - IMP = "->" - IMP_LIST = ["implies", "->", "=>"] - IFF = "<->" - IFF_LIST = ["iff", "<->", "<=>"] - EQ = "=" - EQ_LIST = ["=", "=="] - NEQ = "!=" - NEQ_LIST = ["!="] - - # Collections of tokens - BINOPS = AND_LIST + OR_LIST + IMP_LIST + IFF_LIST - QUANTS = EXISTS_LIST + ALL_LIST + IOTA_LIST - PUNCT = [DOT, OPEN, CLOSE, COMMA] - - TOKENS = BINOPS + EQ_LIST + NEQ_LIST + QUANTS + LAMBDA_LIST + PUNCT + NOT_LIST - - # Special - SYMBOLS = [x for x in TOKENS if re.match(r"^[-\\.(),!&^|>=<]*$", x)] - - -def boolean_ops(): - """ - Boolean operators - """ - names = ["negation", "conjunction", "disjunction", "implication", "equivalence"] - for pair in zip(names, [Tokens.NOT, Tokens.AND, Tokens.OR, Tokens.IMP, Tokens.IFF]): - print("%-15s\t%s" % pair) - - -def equality_preds(): - """ - Equality predicates - """ - names = ["equality", "inequality"] - for pair in zip(names, [Tokens.EQ, Tokens.NEQ]): - print("%-15s\t%s" % pair) - - -def binding_ops(): - """ - Binding operators - """ - names = ["existential", "universal", "lambda"] - for pair in zip(names, [Tokens.EXISTS, Tokens.ALL, Tokens.LAMBDA, Tokens.IOTA]): - print("%-15s\t%s" % pair) - - -class LogicParser: - """A lambda calculus expression parser.""" - - def __init__(self, type_check=False): - """ - :param type_check: should type checking be performed - to their types? - :type type_check: bool - """ - assert isinstance(type_check, bool) - - self._currentIndex = 0 - self._buffer = [] - self.type_check = type_check - - """A list of tuples of quote characters. The 4-tuple is comprised - of the start character, the end character, the escape character, and - a boolean indicating whether the quotes should be included in the - result. Quotes are used to signify that a token should be treated as - atomic, ignoring any special characters within the token. The escape - character allows the quote end character to be used within the quote. - If True, the boolean indicates that the final token should contain the - quote and escape characters. - This method exists to be overridden""" - self.quote_chars = [] - - self.operator_precedence = dict( - [(x, 1) for x in Tokens.LAMBDA_LIST] - + [(x, 2) for x in Tokens.NOT_LIST] - + [(APP, 3)] - + [(x, 4) for x in Tokens.EQ_LIST + Tokens.NEQ_LIST] - + [(x, 5) for x in Tokens.QUANTS] - + [(x, 6) for x in Tokens.AND_LIST] - + [(x, 7) for x in Tokens.OR_LIST] - + [(x, 8) for x in Tokens.IMP_LIST] - + [(x, 9) for x in Tokens.IFF_LIST] - + [(None, 10)] - ) - self.right_associated_operations = [APP] - - def parse(self, data, signature=None): - """ - Parse the expression. 
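A sketch of the operator_precedence table above in action, assuming nltk.sem.logic is importable: negation (2) binds tighter than '&' (6), which binds tighter than '->' (8).

from nltk.sem.logic import Expression

print(Expression.fromstring("P(x) & Q(x) -> R(x)"))
# expected: ((P(x) & Q(x)) -> R(x))
print(Expression.fromstring("-P(x) & Q(x)"))
# expected: (-P(x) & Q(x))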
- - :param data: str for the input to be parsed - :param signature: ``dict`` that maps variable names to type - strings - :returns: a parsed Expression - """ - data = data.rstrip() - - self._currentIndex = 0 - self._buffer, mapping = self.process(data) - - try: - result = self.process_next_expression(None) - if self.inRange(0): - raise UnexpectedTokenException(self._currentIndex + 1, self.token(0)) - except LogicalExpressionException as e: - msg = "{}\n{}\n{}^".format(e, data, " " * mapping[e.index - 1]) - raise LogicalExpressionException(None, msg) from e - - if self.type_check: - result.typecheck(signature) - - return result - - def process(self, data): - """Split the data into tokens""" - out = [] - mapping = {} - tokenTrie = Trie(self.get_all_symbols()) - token = "" - data_idx = 0 - token_start_idx = data_idx - while data_idx < len(data): - cur_data_idx = data_idx - quoted_token, data_idx = self.process_quoted_token(data_idx, data) - if quoted_token: - if not token: - token_start_idx = cur_data_idx - token += quoted_token - continue - - st = tokenTrie - c = data[data_idx] - symbol = "" - while c in st: - symbol += c - st = st[c] - if len(data) - data_idx > len(symbol): - c = data[data_idx + len(symbol)] - else: - break - if Trie.LEAF in st: - # token is a complete symbol - if token: - mapping[len(out)] = token_start_idx - out.append(token) - token = "" - mapping[len(out)] = data_idx - out.append(symbol) - data_idx += len(symbol) - else: - if data[data_idx] in " \t\n": # any whitespace - if token: - mapping[len(out)] = token_start_idx - out.append(token) - token = "" - else: - if not token: - token_start_idx = data_idx - token += data[data_idx] - data_idx += 1 - if token: - mapping[len(out)] = token_start_idx - out.append(token) - mapping[len(out)] = len(data) - mapping[len(out) + 1] = len(data) + 1 - return out, mapping - - def process_quoted_token(self, data_idx, data): - token = "" - c = data[data_idx] - i = data_idx - for start, end, escape, incl_quotes in self.quote_chars: - if c == start: - if incl_quotes: - token += c - i += 1 - while data[i] != end: - if data[i] == escape: - if incl_quotes: - token += data[i] - i += 1 - if len(data) == i: # if there are no more chars - raise LogicalExpressionException( - None, - "End of input reached. " - "Escape character [%s] found at end." % escape, - ) - token += data[i] - else: - token += data[i] - i += 1 - if len(data) == i: - raise LogicalExpressionException( - None, "End of input reached. " "Expected: [%s]" % end - ) - if incl_quotes: - token += data[i] - i += 1 - if not token: - raise LogicalExpressionException(None, "Empty quoted token found") - break - return token, i - - def get_all_symbols(self): - """This method exists to be overridden""" - return Tokens.SYMBOLS - - def inRange(self, location): - """Return TRUE if the given location is within the buffer""" - return self._currentIndex + location < len(self._buffer) - - def token(self, location=None): - """Get the next waiting token. 
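A sketch of the tokenizer above: process() splits the input on the symbol trie and whitespace, and also returns a token-index to character-offset mapping that parse() uses to point error messages at the offending column. Assumes nltk.sem.logic is importable.

from nltk.sem.logic import LogicParser

tokens, mapping = LogicParser().process("(P(x) & Q(x))")
print(tokens)   # expected: ['(', 'P', '(', 'x', ')', '&', 'Q', '(', 'x', ')', ')']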
If a location is given, then - return the token at currentIndex+location without advancing - currentIndex; setting it gives lookahead/lookback capability.""" - try: - if location is None: - tok = self._buffer[self._currentIndex] - self._currentIndex += 1 - else: - tok = self._buffer[self._currentIndex + location] - return tok - except IndexError as e: - raise ExpectedMoreTokensException(self._currentIndex + 1) from e - - def isvariable(self, tok): - return tok not in Tokens.TOKENS - - def process_next_expression(self, context): - """Parse the next complete expression from the stream and return it.""" - try: - tok = self.token() - except ExpectedMoreTokensException as e: - raise ExpectedMoreTokensException( - self._currentIndex + 1, message="Expression expected." - ) from e - - accum = self.handle(tok, context) - - if not accum: - raise UnexpectedTokenException( - self._currentIndex, tok, message="Expression expected." - ) - - return self.attempt_adjuncts(accum, context) - - def handle(self, tok, context): - """This method is intended to be overridden for logics that - use different operators or expressions""" - if self.isvariable(tok): - return self.handle_variable(tok, context) - - elif tok in Tokens.NOT_LIST: - return self.handle_negation(tok, context) - - elif tok in Tokens.LAMBDA_LIST: - return self.handle_lambda(tok, context) - - elif tok in Tokens.QUANTS: - return self.handle_quant(tok, context) - - elif tok == Tokens.OPEN: - return self.handle_open(tok, context) - - def attempt_adjuncts(self, expression, context): - cur_idx = None - while cur_idx != self._currentIndex: # while adjuncts are added - cur_idx = self._currentIndex - expression = self.attempt_EqualityExpression(expression, context) - expression = self.attempt_ApplicationExpression(expression, context) - expression = self.attempt_BooleanExpression(expression, context) - return expression - - def handle_negation(self, tok, context): - return self.make_NegatedExpression(self.process_next_expression(Tokens.NOT)) - - def make_NegatedExpression(self, expression): - return NegatedExpression(expression) - - def handle_variable(self, tok, context): - # It's either: 1) a predicate expression: sees(x,y) - # 2) an application expression: P(x) - # 3) a solo variable: john OR x - accum = self.make_VariableExpression(tok) - if self.inRange(0) and self.token(0) == Tokens.OPEN: - # The predicate has arguments - if not isinstance(accum, FunctionVariableExpression) and not isinstance( - accum, ConstantExpression - ): - raise LogicalExpressionException( - self._currentIndex, - "'%s' is an illegal predicate name. " - "Individual variables may not be used as " - "predicates." % tok, - ) - self.token() # swallow the Open Paren - - # curry the arguments - accum = self.make_ApplicationExpression( - accum, self.process_next_expression(APP) - ) - while self.inRange(0) and self.token(0) == Tokens.COMMA: - self.token() # swallow the comma - accum = self.make_ApplicationExpression( - accum, self.process_next_expression(APP) - ) - self.assertNextToken(Tokens.CLOSE) - return accum - - def get_next_token_variable(self, description): - try: - tok = self.token() - except ExpectedMoreTokensException as e: - raise ExpectedMoreTokensException(e.index, "Variable expected.") from e - if isinstance(self.make_VariableExpression(tok), ConstantExpression): - raise LogicalExpressionException( - self._currentIndex, - "'%s' is an illegal variable name. " - "Constants may not be %s." 
% (tok, description), - ) - return Variable(tok) - - def handle_lambda(self, tok, context): - # Expression is a lambda expression - if not self.inRange(0): - raise ExpectedMoreTokensException( - self._currentIndex + 2, - message="Variable and Expression expected following lambda operator.", - ) - vars = [self.get_next_token_variable("abstracted")] - while True: - if not self.inRange(0) or ( - self.token(0) == Tokens.DOT and not self.inRange(1) - ): - raise ExpectedMoreTokensException( - self._currentIndex + 2, message="Expression expected." - ) - if not self.isvariable(self.token(0)): - break - # Support expressions like: \x y.M == \x.\y.M - vars.append(self.get_next_token_variable("abstracted")) - if self.inRange(0) and self.token(0) == Tokens.DOT: - self.token() # swallow the dot - - accum = self.process_next_expression(tok) - while vars: - accum = self.make_LambdaExpression(vars.pop(), accum) - return accum - - def handle_quant(self, tok, context): - # Expression is a quantified expression: some x.M - factory = self.get_QuantifiedExpression_factory(tok) - - if not self.inRange(0): - raise ExpectedMoreTokensException( - self._currentIndex + 2, - message="Variable and Expression expected following quantifier '%s'." - % tok, - ) - vars = [self.get_next_token_variable("quantified")] - while True: - if not self.inRange(0) or ( - self.token(0) == Tokens.DOT and not self.inRange(1) - ): - raise ExpectedMoreTokensException( - self._currentIndex + 2, message="Expression expected." - ) - if not self.isvariable(self.token(0)): - break - # Support expressions like: some x y.M == some x.some y.M - vars.append(self.get_next_token_variable("quantified")) - if self.inRange(0) and self.token(0) == Tokens.DOT: - self.token() # swallow the dot - - accum = self.process_next_expression(tok) - while vars: - accum = self.make_QuanifiedExpression(factory, vars.pop(), accum) - return accum - - def get_QuantifiedExpression_factory(self, tok): - """This method serves as a hook for other logic parsers that - have different quantifiers""" - if tok in Tokens.EXISTS_LIST: - return ExistsExpression - elif tok in Tokens.ALL_LIST: - return AllExpression - elif tok in Tokens.IOTA_LIST: - return IotaExpression - else: - self.assertToken(tok, Tokens.QUANTS) - - def make_QuanifiedExpression(self, factory, variable, term): - return factory(variable, term) - - def handle_open(self, tok, context): - # Expression is in parens - accum = self.process_next_expression(None) - self.assertNextToken(Tokens.CLOSE) - return accum - - def attempt_EqualityExpression(self, expression, context): - """Attempt to make an equality expression. If the next token is an - equality operator, then an EqualityExpression will be returned. - Otherwise, the parameter will be returned.""" - if self.inRange(0): - tok = self.token(0) - if tok in Tokens.EQ_LIST + Tokens.NEQ_LIST and self.has_priority( - tok, context - ): - self.token() # swallow the "=" or "!=" - expression = self.make_EqualityExpression( - expression, self.process_next_expression(tok) - ) - if tok in Tokens.NEQ_LIST: - expression = self.make_NegatedExpression(expression) - return expression - - def make_EqualityExpression(self, first, second): - """This method serves as a hook for other logic parsers that - have different equality expression classes""" - return EqualityExpression(first, second) - - def attempt_BooleanExpression(self, expression, context): - """Attempt to make a boolean expression. If the next token is a boolean - operator, then a BooleanExpression will be returned. 
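A sketch of the variable-list sugar handled above by handle_lambda and handle_quant, assuming nltk.sem.logic is importable: \x y.M is read as \x.\y.M, and "exists x y.M" as nested existentials.

from nltk.sem.logic import Expression

a = Expression.fromstring(r"\x y.see(x,y)")
b = Expression.fromstring(r"\x.\y.see(x,y)")
print(a == b)                                           # True: both parse to the same nested lambdas
print(Expression.fromstring("exists x y.see(x,y)"))     # exists x y.see(x,y)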
Otherwise, the - parameter will be returned.""" - while self.inRange(0): - tok = self.token(0) - factory = self.get_BooleanExpression_factory(tok) - if factory and self.has_priority(tok, context): - self.token() # swallow the operator - expression = self.make_BooleanExpression( - factory, expression, self.process_next_expression(tok) - ) - else: - break - return expression - - def get_BooleanExpression_factory(self, tok): - """This method serves as a hook for other logic parsers that - have different boolean operators""" - if tok in Tokens.AND_LIST: - return AndExpression - elif tok in Tokens.OR_LIST: - return OrExpression - elif tok in Tokens.IMP_LIST: - return ImpExpression - elif tok in Tokens.IFF_LIST: - return IffExpression - else: - return None - - def make_BooleanExpression(self, factory, first, second): - return factory(first, second) - - def attempt_ApplicationExpression(self, expression, context): - """Attempt to make an application expression. The next tokens are - a list of arguments in parens, then the argument expression is a - function being applied to the arguments. Otherwise, return the - argument expression.""" - if self.has_priority(APP, context): - if self.inRange(0) and self.token(0) == Tokens.OPEN: - if ( - not isinstance(expression, LambdaExpression) - and not isinstance(expression, ApplicationExpression) - and not isinstance(expression, FunctionVariableExpression) - and not isinstance(expression, ConstantExpression) - ): - raise LogicalExpressionException( - self._currentIndex, - ("The function '%s" % expression) - + "' is not a Lambda Expression, an " - "Application Expression, or a " - "functional predicate, so it may " - "not take arguments.", - ) - self.token() # swallow then open paren - # curry the arguments - accum = self.make_ApplicationExpression( - expression, self.process_next_expression(APP) - ) - while self.inRange(0) and self.token(0) == Tokens.COMMA: - self.token() # swallow the comma - accum = self.make_ApplicationExpression( - accum, self.process_next_expression(APP) - ) - self.assertNextToken(Tokens.CLOSE) - return accum - return expression - - def make_ApplicationExpression(self, function, argument): - return ApplicationExpression(function, argument) - - def make_VariableExpression(self, name): - return VariableExpression(Variable(name)) - - def make_LambdaExpression(self, variable, term): - return LambdaExpression(variable, term) - - def has_priority(self, operation, context): - return self.operator_precedence[operation] < self.operator_precedence[ - context - ] or ( - operation in self.right_associated_operations - and self.operator_precedence[operation] == self.operator_precedence[context] - ) - - def assertNextToken(self, expected): - try: - tok = self.token() - except ExpectedMoreTokensException as e: - raise ExpectedMoreTokensException( - e.index, message="Expected token '%s'." 
% expected - ) from e - - if isinstance(expected, list): - if tok not in expected: - raise UnexpectedTokenException(self._currentIndex, tok, expected) - else: - if tok != expected: - raise UnexpectedTokenException(self._currentIndex, tok, expected) - - def assertToken(self, tok, expected): - if isinstance(expected, list): - if tok not in expected: - raise UnexpectedTokenException(self._currentIndex, tok, expected) - else: - if tok != expected: - raise UnexpectedTokenException(self._currentIndex, tok, expected) - - def __repr__(self): - if self.inRange(0): - msg = "Next token: " + self.token(0) - else: - msg = "No more tokens" - return "<" + self.__class__.__name__ + ": " + msg + ">" - - -def read_logic(s, logic_parser=None, encoding=None): - """ - Convert a file of First Order Formulas into a list of {Expression}s. - - :param s: the contents of the file - :type s: str - :param logic_parser: The parser to be used to parse the logical expression - :type logic_parser: LogicParser - :param encoding: the encoding of the input string, if it is binary - :type encoding: str - :return: a list of parsed formulas. - :rtype: list(Expression) - """ - if encoding is not None: - s = s.decode(encoding) - if logic_parser is None: - logic_parser = LogicParser() - - statements = [] - for linenum, line in enumerate(s.splitlines()): - line = line.strip() - if line.startswith("#") or line == "": - continue - try: - statements.append(logic_parser.parse(line)) - except LogicalExpressionException as e: - raise ValueError(f"Unable to parse line {linenum}: {line}") from e - return statements - - -@total_ordering -class Variable: - def __init__(self, name): - """ - :param name: the name of the variable - """ - assert isinstance(name, str), "%s is not a string" % name - self.name = name - - def __eq__(self, other): - return isinstance(other, Variable) and self.name == other.name - - def __ne__(self, other): - return not self == other - - def __lt__(self, other): - if not isinstance(other, Variable): - raise TypeError - return self.name < other.name - - def substitute_bindings(self, bindings): - return bindings.get(self, self) - - def __hash__(self): - return hash(self.name) - - def __str__(self): - return self.name - - def __repr__(self): - return "Variable('%s')" % self.name - - -def unique_variable(pattern=None, ignore=None): - """ - Return a new, unique variable. - - :param pattern: ``Variable`` that is being replaced. The new variable must - be the same type. - :param term: a set of ``Variable`` objects that should not be returned from - this function. 
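A sketch of read_logic() above, assuming nltk.sem.logic is importable: blank lines and '#' comment lines are skipped, every other line is parsed into an Expression.

from nltk.sem.logic import read_logic

formulas = read_logic("""
# a tiny theory
all x.(man(x) -> mortal(x))
man(socrates)
""")
print(formulas)
# expected: [<AllExpression all x.(man(x) -> mortal(x))>, <ApplicationExpression man(socrates)>]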
- :rtype: Variable - """ - if pattern is not None: - if is_indvar(pattern.name): - prefix = "z" - elif is_funcvar(pattern.name): - prefix = "F" - elif is_eventvar(pattern.name): - prefix = "e0" - else: - assert False, "Cannot generate a unique constant" - else: - prefix = "z" - - v = Variable(f"{prefix}{_counter.get()}") - while ignore is not None and v in ignore: - v = Variable(f"{prefix}{_counter.get()}") - return v - - -def skolem_function(univ_scope=None): - """ - Return a skolem function over the variables in univ_scope - param univ_scope - """ - skolem = VariableExpression(Variable("F%s" % _counter.get())) - if univ_scope: - for v in list(univ_scope): - skolem = skolem(VariableExpression(v)) - return skolem - - -class Type: - def __repr__(self): - return "%s" % self - - def __hash__(self): - return hash("%s" % self) - - @classmethod - def fromstring(cls, s): - return read_type(s) - - -class ComplexType(Type): - def __init__(self, first, second): - assert isinstance(first, Type), "%s is not a Type" % first - assert isinstance(second, Type), "%s is not a Type" % second - self.first = first - self.second = second - - def __eq__(self, other): - return ( - isinstance(other, ComplexType) - and self.first == other.first - and self.second == other.second - ) - - def __ne__(self, other): - return not self == other - - __hash__ = Type.__hash__ - - def matches(self, other): - if isinstance(other, ComplexType): - return self.first.matches(other.first) and self.second.matches(other.second) - else: - return self == ANY_TYPE - - def resolve(self, other): - if other == ANY_TYPE: - return self - elif isinstance(other, ComplexType): - f = self.first.resolve(other.first) - s = self.second.resolve(other.second) - if f and s: - return ComplexType(f, s) - else: - return None - elif self == ANY_TYPE: - return other - else: - return None - - def __str__(self): - if self == ANY_TYPE: - return "%s" % ANY_TYPE - else: - return f"<{self.first},{self.second}>" - - def str(self): - if self == ANY_TYPE: - return ANY_TYPE.str() - else: - return f"({self.first.str()} -> {self.second.str()})" - - -class BasicType(Type): - def __eq__(self, other): - return isinstance(other, BasicType) and ("%s" % self) == ("%s" % other) - - def __ne__(self, other): - return not self == other - - __hash__ = Type.__hash__ - - def matches(self, other): - return other == ANY_TYPE or self == other - - def resolve(self, other): - if self.matches(other): - return self - else: - return None - - -class EntityType(BasicType): - def __str__(self): - return "e" - - def str(self): - return "IND" - - -class TruthValueType(BasicType): - def __str__(self): - return "t" - - def str(self): - return "BOOL" - - -class EventType(BasicType): - def __str__(self): - return "v" - - def str(self): - return "EVENT" - - -class AnyType(BasicType, ComplexType): - def __init__(self): - pass - - @property - def first(self): - return self - - @property - def second(self): - return self - - def __eq__(self, other): - return isinstance(other, AnyType) or other.__eq__(self) - - def __ne__(self, other): - return not self == other - - __hash__ = Type.__hash__ - - def matches(self, other): - return True - - def resolve(self, other): - return other - - def __str__(self): - return "?" 
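A sketch of unique_variable() above, assuming nltk.sem.logic is importable: the prefix is chosen from the pattern's kind (individual, function or event variable) and the module counter guarantees a fresh name, so the exact numbering depends on how many variables were created before.

from nltk.sem.logic import Variable, unique_variable

fresh = unique_variable(pattern=Variable("x"))
print(fresh)                                                  # e.g. z1: a new individual variable
print(unique_variable(pattern=Variable("x"), ignore={fresh})) # a different z<n>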
- - def str(self): - return "ANY" - - -TRUTH_TYPE = TruthValueType() -ENTITY_TYPE = EntityType() -EVENT_TYPE = EventType() -ANY_TYPE = AnyType() - - -def read_type(type_string): - assert isinstance(type_string, str) - type_string = type_string.replace(" ", "") # remove spaces - - if type_string[0] == "<": - assert type_string[-1] == ">" - paren_count = 0 - for i, char in enumerate(type_string): - if char == "<": - paren_count += 1 - elif char == ">": - paren_count -= 1 - assert paren_count > 0 - elif char == ",": - if paren_count == 1: - break - return ComplexType( - read_type(type_string[1:i]), read_type(type_string[i + 1 : -1]) - ) - elif type_string[0] == "%s" % ENTITY_TYPE: - return ENTITY_TYPE - elif type_string[0] == "%s" % TRUTH_TYPE: - return TRUTH_TYPE - elif type_string[0] == "%s" % ANY_TYPE: - return ANY_TYPE - else: - raise LogicalExpressionException( - None, "Unexpected character: '%s'." % type_string[0] - ) - - -class TypeException(Exception): - def __init__(self, msg): - super().__init__(msg) - - -class InconsistentTypeHierarchyException(TypeException): - def __init__(self, variable, expression=None): - if expression: - msg = ( - "The variable '%s' was found in multiple places with different" - " types in '%s'." % (variable, expression) - ) - else: - msg = ( - "The variable '%s' was found in multiple places with different" - " types." % (variable) - ) - super().__init__(msg) - - -class TypeResolutionException(TypeException): - def __init__(self, expression, other_type): - super().__init__( - "The type of '%s', '%s', cannot be resolved with type '%s'" - % (expression, expression.type, other_type) - ) - - -class IllegalTypeException(TypeException): - def __init__(self, expression, other_type, allowed_type): - super().__init__( - "Cannot set type of %s '%s' to '%s'; must match type '%s'." - % (expression.__class__.__name__, expression, other_type, allowed_type) - ) - - -def typecheck(expressions, signature=None): - """ - Ensure correct typing across a collection of ``Expression`` objects. - :param expressions: a collection of expressions - :param signature: dict that maps variable names to types (or string - representations of types) - """ - # typecheck and create master signature - for expression in expressions: - signature = expression.typecheck(signature) - # apply master signature to all expressions - for expression in expressions[:-1]: - expression.typecheck(signature) - return signature - - -class SubstituteBindingsI: - """ - An interface for classes that can perform substitutions for - variables. - """ - - def substitute_bindings(self, bindings): - """ - :return: The object that is obtained by replacing - each variable bound by ``bindings`` with its values. - Aliases are already resolved. (maybe?) - :rtype: (any) - """ - raise NotImplementedError() - - def variables(self): - """ - :return: A list of all variables in this object. 
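A sketch of the type machinery above, assuming nltk.sem.logic is importable: read_type() turns the angle-bracket notation into ComplexType/BasicType objects, and ANY_TYPE matches and resolves with anything.

from nltk.sem.logic import read_type, ANY_TYPE, ENTITY_TYPE

t = read_type("<e,<e,t>>")              # type of a curried binary relation
print(t)                                # <e,<e,t>>
print(t.first == ENTITY_TYPE)           # True
print(t.matches(ANY_TYPE))              # True
print(t.resolve(read_type("<e,?>")))    # <e,<e,t>>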
- """ - raise NotImplementedError() - - -class Expression(SubstituteBindingsI): - """This is the base abstract object for all logical expressions""" - - _logic_parser = LogicParser() - _type_checking_logic_parser = LogicParser(type_check=True) - - @classmethod - def fromstring(cls, s, type_check=False, signature=None): - if type_check: - return cls._type_checking_logic_parser.parse(s, signature) - else: - return cls._logic_parser.parse(s, signature) - - def __call__(self, other, *additional): - accum = self.applyto(other) - for a in additional: - accum = accum(a) - return accum - - def applyto(self, other): - assert isinstance(other, Expression), "%s is not an Expression" % other - return ApplicationExpression(self, other) - - def __neg__(self): - return NegatedExpression(self) - - def negate(self): - """If this is a negated expression, remove the negation. - Otherwise add a negation.""" - return -self - - def __and__(self, other): - if not isinstance(other, Expression): - raise TypeError("%s is not an Expression" % other) - return AndExpression(self, other) - - def __or__(self, other): - if not isinstance(other, Expression): - raise TypeError("%s is not an Expression" % other) - return OrExpression(self, other) - - def __gt__(self, other): - if not isinstance(other, Expression): - raise TypeError("%s is not an Expression" % other) - return ImpExpression(self, other) - - def __lt__(self, other): - if not isinstance(other, Expression): - raise TypeError("%s is not an Expression" % other) - return IffExpression(self, other) - - def __eq__(self, other): - return NotImplemented - - def __ne__(self, other): - return not self == other - - def equiv(self, other, prover=None): - """ - Check for logical equivalence. - Pass the expression (self <-> other) to the theorem prover. - If the prover says it is valid, then the self and other are equal. - - :param other: an ``Expression`` to check equality against - :param prover: a ``nltk.inference.api.Prover`` - """ - assert isinstance(other, Expression), "%s is not an Expression" % other - - if prover is None: - from nltk.inference import Prover9 - - prover = Prover9() - bicond = IffExpression(self.simplify(), other.simplify()) - return prover.prove(bicond) - - def __hash__(self): - return hash(repr(self)) - - def substitute_bindings(self, bindings): - expr = self - for var in expr.variables(): - if var in bindings: - val = bindings[var] - if isinstance(val, Variable): - val = self.make_VariableExpression(val) - elif not isinstance(val, Expression): - raise ValueError( - "Can not substitute a non-expression " - "value into an expression: %r" % (val,) - ) - # Substitute bindings in the target value. - val = val.substitute_bindings(bindings) - # Replace var w/ the target value. - expr = expr.replace(var, val) - return expr.simplify() - - def typecheck(self, signature=None): - """ - Infer and check types. Raise exceptions if necessary. - - :param signature: dict that maps variable names to types (or string - representations of types) - :return: the signature, plus any additional type mappings - """ - sig = defaultdict(list) - if signature: - for key in signature: - val = signature[key] - varEx = VariableExpression(Variable(key)) - if isinstance(val, Type): - varEx.type = val - else: - varEx.type = read_type(val) - sig[key].append(varEx) - - self._set_type(signature=sig) - - return {key: sig[key][0].type for key in sig} - - def findtype(self, variable): - """ - Find the type of the given variable as it is used in this expression. 
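A sketch of the operator overloads defined above, assuming nltk.sem.logic is importable: '&', '|', '>' and unary '-' build And/Or/Imp/Negated expressions directly, without going back through the parser.

from nltk.sem.logic import Expression

p = Expression.fromstring("man(socrates)")
q = Expression.fromstring("mortal(socrates)")
print(p & q)   # (man(socrates) & mortal(socrates))
print(p > q)   # (man(socrates) -> mortal(socrates))
print(-p)      # -man(socrates)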
- For example, finding the type of "P" in "P(x) & Q(x,y)" yields "" - - :param variable: Variable - """ - raise NotImplementedError() - - def _set_type(self, other_type=ANY_TYPE, signature=None): - """ - Set the type of this expression to be the given type. Raise type - exceptions where applicable. - - :param other_type: Type - :param signature: dict(str -> list(AbstractVariableExpression)) - """ - raise NotImplementedError() - - def replace(self, variable, expression, replace_bound=False, alpha_convert=True): - """ - Replace every instance of 'variable' with 'expression' - :param variable: ``Variable`` The variable to replace - :param expression: ``Expression`` The expression with which to replace it - :param replace_bound: bool Should bound variables be replaced? - :param alpha_convert: bool Alpha convert automatically to avoid name clashes? - """ - assert isinstance(variable, Variable), "%s is not a Variable" % variable - assert isinstance(expression, Expression), ( - "%s is not an Expression" % expression - ) - - return self.visit_structured( - lambda e: e.replace(variable, expression, replace_bound, alpha_convert), - self.__class__, - ) - - def normalize(self, newvars=None): - """Rename auto-generated unique variables""" - - def get_indiv_vars(e): - if isinstance(e, IndividualVariableExpression): - return {e} - elif isinstance(e, AbstractVariableExpression): - return set() - else: - return e.visit( - get_indiv_vars, lambda parts: reduce(operator.or_, parts, set()) - ) - - result = self - for i, e in enumerate(sorted(get_indiv_vars(self), key=lambda e: e.variable)): - if isinstance(e, EventVariableExpression): - newVar = e.__class__(Variable("e0%s" % (i + 1))) - elif isinstance(e, IndividualVariableExpression): - newVar = e.__class__(Variable("z%s" % (i + 1))) - else: - newVar = e - result = result.replace(e.variable, newVar, True) - return result - - def visit(self, function, combinator): - """ - Recursively visit subexpressions. Apply 'function' to each - subexpression and pass the result of each function application - to the 'combinator' for aggregation: - - return combinator(map(function, self.subexpressions)) - - Bound variables are neither applied upon by the function nor given to - the combinator. - :param function: ``Function`` to call on each subexpression - :param combinator: ``Function,R>`` to combine the results of the - function calls - :return: result of combination ``R`` - """ - raise NotImplementedError() - - def visit_structured(self, function, combinator): - """ - Recursively visit subexpressions. Apply 'function' to each - subexpression and pass the result of each function application - to the 'combinator' for aggregation. The combinator must have - the same signature as the constructor. The function is not - applied to bound variables, but they are passed to the - combinator. - :param function: ``Function`` to call on each subexpression - :param combinator: ``Function`` with the same signature as the - constructor, to combine the results of the function calls - :return: result of combination - """ - return self.visit(function, lambda parts: combinator(*parts)) - - def __repr__(self): - return f"<{self.__class__.__name__} {self}>" - - def __str__(self): - return self.str() - - def variables(self): - """ - Return a set of all the variables for binding substitution. - The variables returned include all free (non-bound) individual - variables and any variable starting with '?' or '@'. 
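A sketch of replace() above, assuming nltk.sem.logic is importable: free occurrences of the variable are substituted, and the bound variable is alpha-converted automatically when a clash would otherwise occur.

from nltk.sem.logic import Expression, Variable

e = Expression.fromstring(r"\x.see(x,y)")
print(e.replace(Variable("y"), Expression.fromstring("john")))  # \x.see(x,john)
print(e.replace(Variable("y"), Expression.fromstring("x")))     # \z<n>.see(z<n>,x): bound x renamed to avoid capture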
- :return: set of ``Variable`` objects - """ - return self.free() | { - p for p in self.predicates() | self.constants() if re.match("^[?@]", p.name) - } - - def free(self): - """ - Return a set of all the free (non-bound) variables. This includes - both individual and predicate variables, but not constants. - :return: set of ``Variable`` objects - """ - return self.visit( - lambda e: e.free(), lambda parts: reduce(operator.or_, parts, set()) - ) - - def constants(self): - """ - Return a set of individual constants (non-predicates). - :return: set of ``Variable`` objects - """ - return self.visit( - lambda e: e.constants(), lambda parts: reduce(operator.or_, parts, set()) - ) - - def predicates(self): - """ - Return a set of predicates (constants, not variables). - :return: set of ``Variable`` objects - """ - return self.visit( - lambda e: e.predicates(), lambda parts: reduce(operator.or_, parts, set()) - ) - - def simplify(self): - """ - :return: beta-converted version of this expression - """ - return self.visit_structured(lambda e: e.simplify(), self.__class__) - - def make_VariableExpression(self, variable): - return VariableExpression(variable) - - -class ApplicationExpression(Expression): - r""" - This class is used to represent two related types of logical expressions. - - The first is a Predicate Expression, such as "P(x,y)". A predicate - expression is comprised of a ``FunctionVariableExpression`` or - ``ConstantExpression`` as the predicate and a list of Expressions as the - arguments. - - The second is a an application of one expression to another, such as - "(\x.dog(x))(fido)". - - The reason Predicate Expressions are treated as Application Expressions is - that the Variable Expression predicate of the expression may be replaced - with another Expression, such as a LambdaExpression, which would mean that - the Predicate should be thought of as being applied to the arguments. - - The logical expression reader will always curry arguments in a application expression. - So, "\x y.see(x,y)(john,mary)" will be represented internally as - "((\x y.(see(x))(y))(john))(mary)". This simplifies the internals since - there will always be exactly one argument in an application. - - The str() method will usually print the curried forms of application - expressions. The one exception is when the the application expression is - really a predicate expression (ie, underlying function is an - ``AbstractVariableExpression``). This means that the example from above - will be returned as "(\x y.see(x,y)(john))(mary)". 
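A sketch of the currying behaviour described above, assuming nltk.sem.logic is importable: the reader curries the application, uncurry()/pred/args recover the flat form, and simplify() performs the beta reduction.

from nltk.sem.logic import Expression

e = Expression.fromstring(r"(\x y.see(x,y))(john, mary)")
print(e.pred)        # \x y.see(x,y): the uncurried base function
print(e.args)        # [<ConstantExpression john>, <ConstantExpression mary>]
print(e.simplify())  # see(john,mary)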
- """ - - def __init__(self, function, argument): - """ - :param function: ``Expression``, for the function expression - :param argument: ``Expression``, for the argument - """ - assert isinstance(function, Expression), "%s is not an Expression" % function - assert isinstance(argument, Expression), "%s is not an Expression" % argument - self.function = function - self.argument = argument - - def simplify(self): - function = self.function.simplify() - argument = self.argument.simplify() - if isinstance(function, LambdaExpression): - return function.term.replace(function.variable, argument).simplify() - else: - return self.__class__(function, argument) - - @property - def type(self): - if isinstance(self.function.type, ComplexType): - return self.function.type.second - else: - return ANY_TYPE - - def _set_type(self, other_type=ANY_TYPE, signature=None): - """:see Expression._set_type()""" - assert isinstance(other_type, Type) - - if signature is None: - signature = defaultdict(list) - - self.argument._set_type(ANY_TYPE, signature) - try: - self.function._set_type( - ComplexType(self.argument.type, other_type), signature - ) - except TypeResolutionException as e: - raise TypeException( - "The function '%s' is of type '%s' and cannot be applied " - "to '%s' of type '%s'. Its argument must match type '%s'." - % ( - self.function, - self.function.type, - self.argument, - self.argument.type, - self.function.type.first, - ) - ) from e - - def findtype(self, variable): - """:see Expression.findtype()""" - assert isinstance(variable, Variable), "%s is not a Variable" % variable - if self.is_atom(): - function, args = self.uncurry() - else: - # It's not a predicate expression ("P(x,y)"), so leave args curried - function = self.function - args = [self.argument] - - found = [arg.findtype(variable) for arg in [function] + args] - - unique = [] - for f in found: - if f != ANY_TYPE: - if unique: - for u in unique: - if f.matches(u): - break - else: - unique.append(f) - - if len(unique) == 1: - return list(unique)[0] - else: - return ANY_TYPE - - def constants(self): - """:see: Expression.constants()""" - if isinstance(self.function, AbstractVariableExpression): - function_constants = set() - else: - function_constants = self.function.constants() - return function_constants | self.argument.constants() - - def predicates(self): - """:see: Expression.predicates()""" - if isinstance(self.function, ConstantExpression): - function_preds = {self.function.variable} - else: - function_preds = self.function.predicates() - return function_preds | self.argument.predicates() - - def visit(self, function, combinator): - """:see: Expression.visit()""" - return combinator([function(self.function), function(self.argument)]) - - def __eq__(self, other): - return ( - isinstance(other, ApplicationExpression) - and self.function == other.function - and self.argument == other.argument - ) - - def __ne__(self, other): - return not self == other - - __hash__ = Expression.__hash__ - - def __str__(self): - # uncurry the arguments and find the base function - if self.is_atom(): - function, args = self.uncurry() - arg_str = ",".join("%s" % arg for arg in args) - else: - # Leave arguments curried - function = self.function - arg_str = "%s" % self.argument - - function_str = "%s" % function - parenthesize_function = False - if isinstance(function, LambdaExpression): - if isinstance(function.term, ApplicationExpression): - if not isinstance(function.term.function, AbstractVariableExpression): - parenthesize_function = True - elif not 
isinstance(function.term, BooleanExpression): - parenthesize_function = True - elif isinstance(function, ApplicationExpression): - parenthesize_function = True - - if parenthesize_function: - function_str = Tokens.OPEN + function_str + Tokens.CLOSE - - return function_str + Tokens.OPEN + arg_str + Tokens.CLOSE - - def uncurry(self): - """ - Uncurry this application expression - - return: A tuple (base-function, arg-list) - """ - function = self.function - args = [self.argument] - while isinstance(function, ApplicationExpression): - # (\x.\y.sees(x,y)(john))(mary) - args.insert(0, function.argument) - function = function.function - return (function, args) - - @property - def pred(self): - """ - Return uncurried base-function. - If this is an atom, then the result will be a variable expression. - Otherwise, it will be a lambda expression. - """ - return self.uncurry()[0] - - @property - def args(self): - """ - Return uncurried arg-list - """ - return self.uncurry()[1] - - def is_atom(self): - """ - Is this expression an atom (as opposed to a lambda expression applied - to a term)? - """ - return isinstance(self.pred, AbstractVariableExpression) - - -@total_ordering -class AbstractVariableExpression(Expression): - """This class represents a variable to be used as a predicate or entity""" - - def __init__(self, variable): - """ - :param variable: ``Variable``, for the variable - """ - assert isinstance(variable, Variable), "%s is not a Variable" % variable - self.variable = variable - - def simplify(self): - return self - - def replace(self, variable, expression, replace_bound=False, alpha_convert=True): - """:see: Expression.replace()""" - assert isinstance(variable, Variable), "%s is not an Variable" % variable - assert isinstance(expression, Expression), ( - "%s is not an Expression" % expression - ) - if self.variable == variable: - return expression - else: - return self - - def _set_type(self, other_type=ANY_TYPE, signature=None): - """:see Expression._set_type()""" - assert isinstance(other_type, Type) - - if signature is None: - signature = defaultdict(list) - - resolution = other_type - for varEx in signature[self.variable.name]: - resolution = varEx.type.resolve(resolution) - if not resolution: - raise InconsistentTypeHierarchyException(self) - - signature[self.variable.name].append(self) - for varEx in signature[self.variable.name]: - varEx.type = resolution - - def findtype(self, variable): - """:see Expression.findtype()""" - assert isinstance(variable, Variable), "%s is not a Variable" % variable - if self.variable == variable: - return self.type - else: - return ANY_TYPE - - def predicates(self): - """:see: Expression.predicates()""" - return set() - - def __eq__(self, other): - """Allow equality between instances of ``AbstractVariableExpression`` - subtypes.""" - return ( - isinstance(other, AbstractVariableExpression) - and self.variable == other.variable - ) - - def __ne__(self, other): - return not self == other - - def __lt__(self, other): - if not isinstance(other, AbstractVariableExpression): - raise TypeError - return self.variable < other.variable - - __hash__ = Expression.__hash__ - - def __str__(self): - return "%s" % self.variable - - -class IndividualVariableExpression(AbstractVariableExpression): - """This class represents variables that take the form of a single lowercase - character (other than 'e') followed by zero or more digits.""" - - def _set_type(self, other_type=ANY_TYPE, signature=None): - """:see Expression._set_type()""" - assert isinstance(other_type, 
Type) - - if signature is None: - signature = defaultdict(list) - - if not other_type.matches(ENTITY_TYPE): - raise IllegalTypeException(self, other_type, ENTITY_TYPE) - - signature[self.variable.name].append(self) - - def _get_type(self): - return ENTITY_TYPE - - type = property(_get_type, _set_type) - - def free(self): - """:see: Expression.free()""" - return {self.variable} - - def constants(self): - """:see: Expression.constants()""" - return set() - - -class FunctionVariableExpression(AbstractVariableExpression): - """This class represents variables that take the form of a single uppercase - character followed by zero or more digits.""" - - type = ANY_TYPE - - def free(self): - """:see: Expression.free()""" - return {self.variable} - - def constants(self): - """:see: Expression.constants()""" - return set() - - -class EventVariableExpression(IndividualVariableExpression): - """This class represents variables that take the form of a single lowercase - 'e' character followed by zero or more digits.""" - - type = EVENT_TYPE - - -class ConstantExpression(AbstractVariableExpression): - """This class represents variables that do not take the form of a single - character followed by zero or more digits.""" - - type = ENTITY_TYPE - - def _set_type(self, other_type=ANY_TYPE, signature=None): - """:see Expression._set_type()""" - assert isinstance(other_type, Type) - - if signature is None: - signature = defaultdict(list) - - if other_type == ANY_TYPE: - # entity type by default, for individuals - resolution = ENTITY_TYPE - else: - resolution = other_type - if self.type != ENTITY_TYPE: - resolution = resolution.resolve(self.type) - - for varEx in signature[self.variable.name]: - resolution = varEx.type.resolve(resolution) - if not resolution: - raise InconsistentTypeHierarchyException(self) - - signature[self.variable.name].append(self) - for varEx in signature[self.variable.name]: - varEx.type = resolution - - def free(self): - """:see: Expression.free()""" - return set() - - def constants(self): - """:see: Expression.constants()""" - return {self.variable} - - -def VariableExpression(variable): - """ - This is a factory method that instantiates and returns a subtype of - ``AbstractVariableExpression`` appropriate for the given variable. - """ - assert isinstance(variable, Variable), "%s is not a Variable" % variable - if is_indvar(variable.name): - return IndividualVariableExpression(variable) - elif is_funcvar(variable.name): - return FunctionVariableExpression(variable) - elif is_eventvar(variable.name): - return EventVariableExpression(variable) - else: - return ConstantExpression(variable) - - -class VariableBinderExpression(Expression): - """This an abstract class for any Expression that binds a variable in an - Expression. 
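A sketch of the VariableExpression factory above, assuming nltk.sem.logic is importable: the subclass is chosen purely from the shape of the variable name (single lowercase letter plus digits, 'e' plus digits, single uppercase letter plus digits, anything else).

from nltk.sem.logic import Variable, VariableExpression

for name in ["x2", "e1", "P", "john"]:
    print(name, type(VariableExpression(Variable(name))).__name__)
# expected:
# x2 IndividualVariableExpression
# e1 EventVariableExpression
# P FunctionVariableExpression
# john ConstantExpression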
This includes LambdaExpressions and Quantified Expressions""" - - def __init__(self, variable, term): - """ - :param variable: ``Variable``, for the variable - :param term: ``Expression``, for the term - """ - assert isinstance(variable, Variable), "%s is not a Variable" % variable - assert isinstance(term, Expression), "%s is not an Expression" % term - self.variable = variable - self.term = term - - def replace(self, variable, expression, replace_bound=False, alpha_convert=True): - """:see: Expression.replace()""" - assert isinstance(variable, Variable), "%s is not a Variable" % variable - assert isinstance(expression, Expression), ( - "%s is not an Expression" % expression - ) - # if the bound variable is the thing being replaced - if self.variable == variable: - if replace_bound: - assert isinstance(expression, AbstractVariableExpression), ( - "%s is not a AbstractVariableExpression" % expression - ) - return self.__class__( - expression.variable, - self.term.replace(variable, expression, True, alpha_convert), - ) - else: - return self - else: - # if the bound variable appears in the expression, then it must - # be alpha converted to avoid a conflict - if alpha_convert and self.variable in expression.free(): - self = self.alpha_convert(unique_variable(pattern=self.variable)) - - # replace in the term - return self.__class__( - self.variable, - self.term.replace(variable, expression, replace_bound, alpha_convert), - ) - - def alpha_convert(self, newvar): - """Rename all occurrences of the variable introduced by this variable - binder in the expression to ``newvar``. - :param newvar: ``Variable``, for the new variable - """ - assert isinstance(newvar, Variable), "%s is not a Variable" % newvar - return self.__class__( - newvar, self.term.replace(self.variable, VariableExpression(newvar), True) - ) - - def free(self): - """:see: Expression.free()""" - return self.term.free() - {self.variable} - - def findtype(self, variable): - """:see Expression.findtype()""" - assert isinstance(variable, Variable), "%s is not a Variable" % variable - if variable == self.variable: - return ANY_TYPE - else: - return self.term.findtype(variable) - - def visit(self, function, combinator): - """:see: Expression.visit()""" - return combinator([function(self.term)]) - - def visit_structured(self, function, combinator): - """:see: Expression.visit_structured()""" - return combinator(self.variable, function(self.term)) - - def __eq__(self, other): - r"""Defines equality modulo alphabetic variance. If we are comparing - \x.M and \y.N, then check equality of M and N[x/y].""" - if isinstance(self, other.__class__) or isinstance(other, self.__class__): - if self.variable == other.variable: - return self.term == other.term - else: - # Comparing \x.M and \y.N. Relabel y in N with x and continue. 
- varex = VariableExpression(self.variable) - return self.term == other.term.replace(other.variable, varex) - else: - return False - - def __ne__(self, other): - return not self == other - - __hash__ = Expression.__hash__ - - -class LambdaExpression(VariableBinderExpression): - @property - def type(self): - return ComplexType(self.term.findtype(self.variable), self.term.type) - - def _set_type(self, other_type=ANY_TYPE, signature=None): - """:see Expression._set_type()""" - assert isinstance(other_type, Type) - - if signature is None: - signature = defaultdict(list) - - self.term._set_type(other_type.second, signature) - if not self.type.resolve(other_type): - raise TypeResolutionException(self, other_type) - - def __str__(self): - variables = [self.variable] - term = self.term - while term.__class__ == self.__class__: - variables.append(term.variable) - term = term.term - return ( - Tokens.LAMBDA - + " ".join("%s" % v for v in variables) - + Tokens.DOT - + "%s" % term - ) - - -class QuantifiedExpression(VariableBinderExpression): - @property - def type(self): - return TRUTH_TYPE - - def _set_type(self, other_type=ANY_TYPE, signature=None): - """:see Expression._set_type()""" - assert isinstance(other_type, Type) - - if signature is None: - signature = defaultdict(list) - - if not other_type.matches(TRUTH_TYPE): - raise IllegalTypeException(self, other_type, TRUTH_TYPE) - self.term._set_type(TRUTH_TYPE, signature) - - def __str__(self): - variables = [self.variable] - term = self.term - while term.__class__ == self.__class__: - variables.append(term.variable) - term = term.term - return ( - self.getQuantifier() - + " " - + " ".join("%s" % v for v in variables) - + Tokens.DOT - + "%s" % term - ) - - -class ExistsExpression(QuantifiedExpression): - def getQuantifier(self): - return Tokens.EXISTS - - -class AllExpression(QuantifiedExpression): - def getQuantifier(self): - return Tokens.ALL - - -class IotaExpression(QuantifiedExpression): - def getQuantifier(self): - return Tokens.IOTA - - -class NegatedExpression(Expression): - def __init__(self, term): - assert isinstance(term, Expression), "%s is not an Expression" % term - self.term = term - - @property - def type(self): - return TRUTH_TYPE - - def _set_type(self, other_type=ANY_TYPE, signature=None): - """:see Expression._set_type()""" - assert isinstance(other_type, Type) - - if signature is None: - signature = defaultdict(list) - - if not other_type.matches(TRUTH_TYPE): - raise IllegalTypeException(self, other_type, TRUTH_TYPE) - self.term._set_type(TRUTH_TYPE, signature) - - def findtype(self, variable): - assert isinstance(variable, Variable), "%s is not a Variable" % variable - return self.term.findtype(variable) - - def visit(self, function, combinator): - """:see: Expression.visit()""" - return combinator([function(self.term)]) - - def negate(self): - """:see: Expression.negate()""" - return self.term - - def __eq__(self, other): - return isinstance(other, NegatedExpression) and self.term == other.term - - def __ne__(self, other): - return not self == other - - __hash__ = Expression.__hash__ - - def __str__(self): - return Tokens.NOT + "%s" % self.term - - -class BinaryExpression(Expression): - def __init__(self, first, second): - assert isinstance(first, Expression), "%s is not an Expression" % first - assert isinstance(second, Expression), "%s is not an Expression" % second - self.first = first - self.second = second - - @property - def type(self): - return TRUTH_TYPE - - def findtype(self, variable): - """:see 
Expression.findtype()""" - assert isinstance(variable, Variable), "%s is not a Variable" % variable - f = self.first.findtype(variable) - s = self.second.findtype(variable) - if f == s or s == ANY_TYPE: - return f - elif f == ANY_TYPE: - return s - else: - return ANY_TYPE - - def visit(self, function, combinator): - """:see: Expression.visit()""" - return combinator([function(self.first), function(self.second)]) - - def __eq__(self, other): - return ( - (isinstance(self, other.__class__) or isinstance(other, self.__class__)) - and self.first == other.first - and self.second == other.second - ) - - def __ne__(self, other): - return not self == other - - __hash__ = Expression.__hash__ - - def __str__(self): - first = self._str_subex(self.first) - second = self._str_subex(self.second) - return Tokens.OPEN + first + " " + self.getOp() + " " + second + Tokens.CLOSE - - def _str_subex(self, subex): - return "%s" % subex - - -class BooleanExpression(BinaryExpression): - def _set_type(self, other_type=ANY_TYPE, signature=None): - """:see Expression._set_type()""" - assert isinstance(other_type, Type) - - if signature is None: - signature = defaultdict(list) - - if not other_type.matches(TRUTH_TYPE): - raise IllegalTypeException(self, other_type, TRUTH_TYPE) - self.first._set_type(TRUTH_TYPE, signature) - self.second._set_type(TRUTH_TYPE, signature) - - -class AndExpression(BooleanExpression): - """This class represents conjunctions""" - - def getOp(self): - return Tokens.AND - - def _str_subex(self, subex): - s = "%s" % subex - if isinstance(subex, AndExpression): - return s[1:-1] - return s - - -class OrExpression(BooleanExpression): - """This class represents disjunctions""" - - def getOp(self): - return Tokens.OR - - def _str_subex(self, subex): - s = "%s" % subex - if isinstance(subex, OrExpression): - return s[1:-1] - return s - - -class ImpExpression(BooleanExpression): - """This class represents implications""" - - def getOp(self): - return Tokens.IMP - - -class IffExpression(BooleanExpression): - """This class represents biconditionals""" - - def getOp(self): - return Tokens.IFF - - -class EqualityExpression(BinaryExpression): - """This class represents equality expressions like "(x = y)".""" - - def _set_type(self, other_type=ANY_TYPE, signature=None): - """:see Expression._set_type()""" - assert isinstance(other_type, Type) - - if signature is None: - signature = defaultdict(list) - - if not other_type.matches(TRUTH_TYPE): - raise IllegalTypeException(self, other_type, TRUTH_TYPE) - self.first._set_type(ENTITY_TYPE, signature) - self.second._set_type(ENTITY_TYPE, signature) - - def getOp(self): - return Tokens.EQ - - -### Utilities - - -class LogicalExpressionException(Exception): - def __init__(self, index, message): - self.index = index - Exception.__init__(self, message) - - -class UnexpectedTokenException(LogicalExpressionException): - def __init__(self, index, unexpected=None, expected=None, message=None): - if unexpected and expected: - msg = "Unexpected token: '%s'. " "Expected token '%s'." % ( - unexpected, - expected, - ) - elif unexpected: - msg = "Unexpected token: '%s'." % unexpected - if message: - msg += " " + message - else: - msg = "Expected token '%s'." % expected - LogicalExpressionException.__init__(self, index, msg) - - -class ExpectedMoreTokensException(LogicalExpressionException): - def __init__(self, index, message=None): - if not message: - message = "More tokens expected." - LogicalExpressionException.__init__( - self, index, "End of input found. 
" + message - ) - - -def is_indvar(expr): - """ - An individual variable must be a single lowercase character other than 'e', - followed by zero or more digits. - - :param expr: str - :return: bool True if expr is of the correct form - """ - assert isinstance(expr, str), "%s is not a string" % expr - return re.match(r"^[a-df-z]\d*$", expr) is not None - - -def is_funcvar(expr): - """ - A function variable must be a single uppercase character followed by - zero or more digits. - - :param expr: str - :return: bool True if expr is of the correct form - """ - assert isinstance(expr, str), "%s is not a string" % expr - return re.match(r"^[A-Z]\d*$", expr) is not None - - -def is_eventvar(expr): - """ - An event variable must be a single lowercase 'e' character followed by - zero or more digits. - - :param expr: str - :return: bool True if expr is of the correct form - """ - assert isinstance(expr, str), "%s is not a string" % expr - return re.match(r"^e\d*$", expr) is not None - - -def demo(): - lexpr = Expression.fromstring - print("=" * 20 + "Test reader" + "=" * 20) - print(lexpr(r"john")) - print(lexpr(r"man(x)")) - print(lexpr(r"-man(x)")) - print(lexpr(r"(man(x) & tall(x) & walks(x))")) - print(lexpr(r"exists x.(man(x) & tall(x) & walks(x))")) - print(lexpr(r"\x.man(x)")) - print(lexpr(r"\x.man(x)(john)")) - print(lexpr(r"\x y.sees(x,y)")) - print(lexpr(r"\x y.sees(x,y)(a,b)")) - print(lexpr(r"(\x.exists y.walks(x,y))(x)")) - print(lexpr(r"exists x.x = y")) - print(lexpr(r"exists x.(x = y)")) - print(lexpr("P(x) & x=y & P(y)")) - print(lexpr(r"\P Q.exists x.(P(x) & Q(x))")) - print(lexpr(r"man(x) <-> tall(x)")) - - print("=" * 20 + "Test simplify" + "=" * 20) - print(lexpr(r"\x.\y.sees(x,y)(john)(mary)").simplify()) - print(lexpr(r"\x.\y.sees(x,y)(john, mary)").simplify()) - print(lexpr(r"all x.(man(x) & (\x.exists y.walks(x,y))(x))").simplify()) - print(lexpr(r"(\P.\Q.exists x.(P(x) & Q(x)))(\x.dog(x))(\x.bark(x))").simplify()) - - print("=" * 20 + "Test alpha conversion and binder expression equality" + "=" * 20) - e1 = lexpr("exists x.P(x)") - print(e1) - e2 = e1.alpha_convert(Variable("z")) - print(e2) - print(e1 == e2) - - -def demo_errors(): - print("=" * 20 + "Test reader errors" + "=" * 20) - demoException("(P(x) & Q(x)") - demoException("((P(x) &) & Q(x))") - demoException("P(x) -> ") - demoException("P(x") - demoException("P(x,") - demoException("P(x,)") - demoException("exists") - demoException("exists x.") - demoException("\\") - demoException("\\ x y.") - demoException("P(x)Q(x)") - demoException("(P(x)Q(x)") - demoException("exists x -> y") - - -def demoException(s): - try: - Expression.fromstring(s) - except LogicalExpressionException as e: - print(f"{e.__class__.__name__}: {e}") - - -def printtype(ex): - print(f"{ex.str()} : {ex.type}") - - -if __name__ == "__main__": - demo() -# demo_errors() diff --git a/pipeline/nltk/sem/relextract.py b/pipeline/nltk/sem/relextract.py deleted file mode 100644 index fcf755a3c4ab91678ae2965b96e79235a7c59120..0000000000000000000000000000000000000000 --- a/pipeline/nltk/sem/relextract.py +++ /dev/null @@ -1,539 +0,0 @@ -# Natural Language Toolkit: Relation Extraction -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Ewan Klein -# URL: -# For license information, see LICENSE.TXT - -""" -Code for extracting relational triples from the ieer and conll2002 corpora. - -Relations are stored internally as dictionaries ('reldicts'). - -The two serialization outputs are "rtuple" and "clause". 
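For reference, the binder classes and the ``VariableExpression`` factory deleted above are normally reached through the public ``nltk.sem.logic`` API rather than this vendored ``pipeline.nltk`` copy. A minimal sketch, assuming a standard installed ``nltk``; the formula and names are invented for illustration:

from nltk.sem.logic import Expression, Variable

lexpr = Expression.fromstring
e1 = lexpr(r"exists x.(man(x) & walks(x))")   # parsed as an ExistsExpression
e2 = e1.alpha_convert(Variable("z"))          # rename the bound variable x to z
print(e2)                                      # exists z.(man(z) & walks(z))
print(e1 == e2)                                # True: equality holds modulo alpha-variance

# The VariableExpression factory picks a subclass from the variable's surface form.
print(type(lexpr("x")).__name__)               # IndividualVariableExpression
print(type(lexpr("P")).__name__)               # FunctionVariableExpression
print(type(lexpr("e1")).__name__)              # EventVariableExpression
print(type(lexpr("john")).__name__)            # ConstantExpression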
- -- An rtuple is a tuple of the form ``(subj, filler, obj)``, - where ``subj`` and ``obj`` are pairs of Named Entity mentions, and ``filler`` is the string of words - occurring between ``sub`` and ``obj`` (with no intervening NEs). Strings are printed via ``repr()`` to - circumvent locale variations in rendering utf-8 encoded strings. -- A clause is an atom of the form ``relsym(subjsym, objsym)``, - where the relation, subject and object have been canonicalized to single strings. -""" - -# todo: get a more general solution to canonicalized symbols for clauses -- maybe use xmlcharrefs? - -import html -import re -from collections import defaultdict - -# Dictionary that associates corpora with NE classes -NE_CLASSES = { - "ieer": [ - "LOCATION", - "ORGANIZATION", - "PERSON", - "DURATION", - "DATE", - "CARDINAL", - "PERCENT", - "MONEY", - "MEASURE", - ], - "conll2002": ["LOC", "PER", "ORG"], - "ace": [ - "LOCATION", - "ORGANIZATION", - "PERSON", - "DURATION", - "DATE", - "CARDINAL", - "PERCENT", - "MONEY", - "MEASURE", - "FACILITY", - "GPE", - ], -} - -# Allow abbreviated class labels -short2long = dict(LOC="LOCATION", ORG="ORGANIZATION", PER="PERSON") -long2short = dict(LOCATION="LOC", ORGANIZATION="ORG", PERSON="PER") - - -def _expand(type): - """ - Expand an NE class name. - :type type: str - :rtype: str - """ - try: - return short2long[type] - except KeyError: - return type - - -def class_abbrev(type): - """ - Abbreviate an NE class name. - :type type: str - :rtype: str - """ - try: - return long2short[type] - except KeyError: - return type - - -def _join(lst, sep=" ", untag=False): - """ - Join a list into a string, turning tags tuples into tag strings or just words. - :param untag: if ``True``, omit the tag from tagged input strings. - :type lst: list - :rtype: str - """ - try: - return sep.join(lst) - except TypeError: - if untag: - return sep.join(tup[0] for tup in lst) - from nltk.tag import tuple2str - - return sep.join(tuple2str(tup) for tup in lst) - - -def descape_entity(m, defs=html.entities.entitydefs): - """ - Translate one entity to its ISO Latin value. - Inspired by example from effbot.org - - - """ - try: - return defs[m.group(1)] - - except KeyError: - return m.group(0) # use as is - - -def list2sym(lst): - """ - Convert a list of strings into a canonical symbol. - :type lst: list - :return: a Unicode string without whitespace - :rtype: unicode - """ - sym = _join(lst, "_", untag=True) - sym = sym.lower() - ENT = re.compile(r"&(\w+?);") - sym = ENT.sub(descape_entity, sym) - sym = sym.replace(".", "") - return sym - - -def tree2semi_rel(tree): - """ - Group a chunk structure into a list of 'semi-relations' of the form (list(str), ``Tree``). - - In order to facilitate the construction of (``Tree``, string, ``Tree``) triples, this - identifies pairs whose first member is a list (possibly empty) of terminal - strings, and whose second member is a ``Tree`` of the form (NE_label, terminals). 
- - :param tree: a chunk tree - :return: a list of pairs (list(str), ``Tree``) - :rtype: list of tuple - """ - - from nltk.tree import Tree - - semi_rels = [] - semi_rel = [[], None] - - for dtr in tree: - if not isinstance(dtr, Tree): - semi_rel[0].append(dtr) - else: - # dtr is a Tree - semi_rel[1] = dtr - semi_rels.append(semi_rel) - semi_rel = [[], None] - return semi_rels - - -def semi_rel2reldict(pairs, window=5, trace=False): - """ - Converts the pairs generated by ``tree2semi_rel`` into a 'reldict': a dictionary which - stores information about the subject and object NEs plus the filler between them. - Additionally, a left and right context of length <= window are captured (within - a given input sentence). - - :param pairs: a pair of list(str) and ``Tree``, as generated by ``tree2semi_rel()`` - :param window: a threshold for the number of items to include in the left and right context - :type window: int - :return: 'relation' dictionaries whose keys are 'lcon', 'subjclass', 'subjtext', 'subjsym', 'filler', 'objclass', 'objtext', 'objsym' and 'rcon' - :rtype: list(defaultdict) - """ - result = [] - while len(pairs) > 2: - reldict = defaultdict(str) - reldict["lcon"] = _join(pairs[0][0][-window:]) - reldict["subjclass"] = pairs[0][1].label() - reldict["subjtext"] = _join(pairs[0][1].leaves()) - reldict["subjsym"] = list2sym(pairs[0][1].leaves()) - reldict["filler"] = _join(pairs[1][0]) - reldict["untagged_filler"] = _join(pairs[1][0], untag=True) - reldict["objclass"] = pairs[1][1].label() - reldict["objtext"] = _join(pairs[1][1].leaves()) - reldict["objsym"] = list2sym(pairs[1][1].leaves()) - reldict["rcon"] = _join(pairs[2][0][:window]) - if trace: - print( - "(%s(%s, %s)" - % ( - reldict["untagged_filler"], - reldict["subjclass"], - reldict["objclass"], - ) - ) - result.append(reldict) - pairs = pairs[1:] - return result - - -def extract_rels(subjclass, objclass, doc, corpus="ace", pattern=None, window=10): - """ - Filter the output of ``semi_rel2reldict`` according to specified NE classes and a filler pattern. - - The parameters ``subjclass`` and ``objclass`` can be used to restrict the - Named Entities to particular types (any of 'LOCATION', 'ORGANIZATION', - 'PERSON', 'DURATION', 'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE'). - - :param subjclass: the class of the subject Named Entity. - :type subjclass: str - :param objclass: the class of the object Named Entity. - :type objclass: str - :param doc: input document - :type doc: ieer document or a list of chunk trees - :param corpus: name of the corpus to take as input; possible values are - 'ieer' and 'conll2002' - :type corpus: str - :param pattern: a regular expression for filtering the fillers of - retrieved triples. 
- :type pattern: SRE_Pattern - :param window: filters out fillers which exceed this threshold - :type window: int - :return: see ``mk_reldicts`` - :rtype: list(defaultdict) - """ - - if subjclass and subjclass not in NE_CLASSES[corpus]: - if _expand(subjclass) in NE_CLASSES[corpus]: - subjclass = _expand(subjclass) - else: - raise ValueError( - "your value for the subject type has not been recognized: %s" - % subjclass - ) - if objclass and objclass not in NE_CLASSES[corpus]: - if _expand(objclass) in NE_CLASSES[corpus]: - objclass = _expand(objclass) - else: - raise ValueError( - "your value for the object type has not been recognized: %s" % objclass - ) - - if corpus == "ace" or corpus == "conll2002": - pairs = tree2semi_rel(doc) - elif corpus == "ieer": - pairs = tree2semi_rel(doc.text) + tree2semi_rel(doc.headline) - else: - raise ValueError("corpus type not recognized") - - reldicts = semi_rel2reldict(pairs) - - relfilter = lambda x: ( - x["subjclass"] == subjclass - and len(x["filler"].split()) <= window - and pattern.match(x["filler"]) - and x["objclass"] == objclass - ) - - return list(filter(relfilter, reldicts)) - - -def rtuple(reldict, lcon=False, rcon=False): - """ - Pretty print the reldict as an rtuple. - :param reldict: a relation dictionary - :type reldict: defaultdict - """ - items = [ - class_abbrev(reldict["subjclass"]), - reldict["subjtext"], - reldict["filler"], - class_abbrev(reldict["objclass"]), - reldict["objtext"], - ] - format = "[%s: %r] %r [%s: %r]" - if lcon: - items = [reldict["lcon"]] + items - format = "...%r)" + format - if rcon: - items.append(reldict["rcon"]) - format = format + "(%r..." - printargs = tuple(items) - return format % printargs - - -def clause(reldict, relsym): - """ - Print the relation in clausal form. - :param reldict: a relation dictionary - :type reldict: defaultdict - :param relsym: a label for the relation - :type relsym: str - """ - items = (relsym, reldict["subjsym"], reldict["objsym"]) - return "%s(%r, %r)" % items - - -####################################################### -# Demos of relation extraction with regular expressions -####################################################### - -############################################ -# Example of in(ORG, LOC) -############################################ -def in_demo(trace=0, sql=True): - """ - Select pairs of organizations and locations whose mentions occur with an - intervening occurrence of the preposition "in". - - If the sql parameter is set to True, then the entity pairs are loaded into - an in-memory database, and subsequently pulled out using an SQL "SELECT" - query. 
- """ - from nltk.corpus import ieer - - if sql: - try: - import sqlite3 - - connection = sqlite3.connect(":memory:") - cur = connection.cursor() - cur.execute( - """create table Locations - (OrgName text, LocationName text, DocID text)""" - ) - except ImportError: - import warnings - - warnings.warn("Cannot import sqlite; sql flag will be ignored.") - - IN = re.compile(r".*\bin\b(?!\b.+ing)") - - print() - print("IEER: in(ORG, LOC) -- just the clauses:") - print("=" * 45) - - for file in ieer.fileids(): - for doc in ieer.parsed_docs(file): - if trace: - print(doc.docno) - print("=" * 15) - for rel in extract_rels("ORG", "LOC", doc, corpus="ieer", pattern=IN): - print(clause(rel, relsym="IN")) - if sql: - try: - rtuple = (rel["subjtext"], rel["objtext"], doc.docno) - cur.execute( - """insert into Locations - values (?, ?, ?)""", - rtuple, - ) - connection.commit() - except NameError: - pass - - if sql: - try: - cur.execute( - """select OrgName from Locations - where LocationName = 'Atlanta'""" - ) - print() - print("Extract data from SQL table: ORGs in Atlanta") - print("-" * 15) - for row in cur: - print(row) - except NameError: - pass - - -############################################ -# Example of has_role(PER, LOC) -############################################ - - -def roles_demo(trace=0): - from nltk.corpus import ieer - - roles = r""" - (.*( # assorted roles - analyst| - chair(wo)?man| - commissioner| - counsel| - director| - economist| - editor| - executive| - foreman| - governor| - head| - lawyer| - leader| - librarian).*)| - manager| - partner| - president| - producer| - professor| - researcher| - spokes(wo)?man| - writer| - ,\sof\sthe?\s* # "X, of (the) Y" - """ - ROLES = re.compile(roles, re.VERBOSE) - - print() - print("IEER: has_role(PER, ORG) -- raw rtuples:") - print("=" * 45) - - for file in ieer.fileids(): - for doc in ieer.parsed_docs(file): - lcon = rcon = False - if trace: - print(doc.docno) - print("=" * 15) - lcon = rcon = True - for rel in extract_rels("PER", "ORG", doc, corpus="ieer", pattern=ROLES): - print(rtuple(rel, lcon=lcon, rcon=rcon)) - - -############################################## -### Show what's in the IEER Headlines -############################################## - - -def ieer_headlines(): - - from nltk.corpus import ieer - from nltk.tree import Tree - - print("IEER: First 20 Headlines") - print("=" * 45) - - trees = [ - (doc.docno, doc.headline) - for file in ieer.fileids() - for doc in ieer.parsed_docs(file) - ] - for tree in trees[:20]: - print() - print("%s:\n%s" % tree) - - -############################################# -## Dutch CONLL2002: take_on_role(PER, ORG -############################################# - - -def conllned(trace=1): - """ - Find the copula+'van' relation ('of') in the Dutch tagged training corpus - from CoNLL 2002. 
- """ - - from nltk.corpus import conll2002 - - vnv = """ - ( - is/V| # 3rd sing present and - was/V| # past forms of the verb zijn ('be') - werd/V| # and also present - wordt/V # past of worden ('become) - ) - .* # followed by anything - van/Prep # followed by van ('of') - """ - VAN = re.compile(vnv, re.VERBOSE) - - print() - print("Dutch CoNLL2002: van(PER, ORG) -- raw rtuples with context:") - print("=" * 45) - - for doc in conll2002.chunked_sents("ned.train"): - lcon = rcon = False - if trace: - lcon = rcon = True - for rel in extract_rels( - "PER", "ORG", doc, corpus="conll2002", pattern=VAN, window=10 - ): - print(rtuple(rel, lcon=lcon, rcon=rcon)) - - -############################################# -## Spanish CONLL2002: (PER, ORG) -############################################# - - -def conllesp(): - from nltk.corpus import conll2002 - - de = """ - .* - ( - de/SP| - del/SP - ) - """ - DE = re.compile(de, re.VERBOSE) - - print() - print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:") - print("=" * 45) - rels = [ - rel - for doc in conll2002.chunked_sents("esp.train") - for rel in extract_rels("ORG", "LOC", doc, corpus="conll2002", pattern=DE) - ] - for r in rels[:10]: - print(clause(r, relsym="DE")) - print() - - -def ne_chunked(): - print() - print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker") - print("=" * 45) - ROLE = re.compile( - r".*(chairman|president|trader|scientist|economist|analyst|partner).*" - ) - rels = [] - for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]): - sent = nltk.ne_chunk(sent) - rels = extract_rels("PER", "ORG", sent, corpus="ace", pattern=ROLE, window=7) - for rel in rels: - print(f"{i:<5}{rtuple(rel)}") - - -if __name__ == "__main__": - import nltk - from nltk.sem import relextract - - in_demo(trace=0) - roles_demo(trace=0) - conllned() - conllesp() - ieer_headlines() - ne_chunked() diff --git a/pipeline/nltk/sem/skolemize.py b/pipeline/nltk/sem/skolemize.py deleted file mode 100644 index 6f98437cee85ecf4a023a71a3f4518e25893ef8d..0000000000000000000000000000000000000000 --- a/pipeline/nltk/sem/skolemize.py +++ /dev/null @@ -1,148 +0,0 @@ -# Natural Language Toolkit: Semantic Interpretation -# -# Author: Ewan Klein -# -# Copyright (C) 2001-2023 NLTK Project -# URL: -# For license information, see LICENSE.TXT - -from nltk.sem.logic import ( - AllExpression, - AndExpression, - ApplicationExpression, - EqualityExpression, - ExistsExpression, - IffExpression, - ImpExpression, - NegatedExpression, - OrExpression, - VariableExpression, - skolem_function, - unique_variable, -) - - -def skolemize(expression, univ_scope=None, used_variables=None): - """ - Skolemize the expression and convert to conjunctive normal form (CNF) - """ - if univ_scope is None: - univ_scope = set() - if used_variables is None: - used_variables = set() - - if isinstance(expression, AllExpression): - term = skolemize( - expression.term, - univ_scope | {expression.variable}, - used_variables | {expression.variable}, - ) - return term.replace( - expression.variable, - VariableExpression(unique_variable(ignore=used_variables)), - ) - elif isinstance(expression, AndExpression): - return skolemize(expression.first, univ_scope, used_variables) & skolemize( - expression.second, univ_scope, used_variables - ) - elif isinstance(expression, OrExpression): - return to_cnf( - skolemize(expression.first, univ_scope, used_variables), - skolemize(expression.second, univ_scope, used_variables), - ) - elif isinstance(expression, ImpExpression): - 
return to_cnf( - skolemize(-expression.first, univ_scope, used_variables), - skolemize(expression.second, univ_scope, used_variables), - ) - elif isinstance(expression, IffExpression): - return to_cnf( - skolemize(-expression.first, univ_scope, used_variables), - skolemize(expression.second, univ_scope, used_variables), - ) & to_cnf( - skolemize(expression.first, univ_scope, used_variables), - skolemize(-expression.second, univ_scope, used_variables), - ) - elif isinstance(expression, EqualityExpression): - return expression - elif isinstance(expression, NegatedExpression): - negated = expression.term - if isinstance(negated, AllExpression): - term = skolemize( - -negated.term, univ_scope, used_variables | {negated.variable} - ) - if univ_scope: - return term.replace(negated.variable, skolem_function(univ_scope)) - else: - skolem_constant = VariableExpression( - unique_variable(ignore=used_variables) - ) - return term.replace(negated.variable, skolem_constant) - elif isinstance(negated, AndExpression): - return to_cnf( - skolemize(-negated.first, univ_scope, used_variables), - skolemize(-negated.second, univ_scope, used_variables), - ) - elif isinstance(negated, OrExpression): - return skolemize(-negated.first, univ_scope, used_variables) & skolemize( - -negated.second, univ_scope, used_variables - ) - elif isinstance(negated, ImpExpression): - return skolemize(negated.first, univ_scope, used_variables) & skolemize( - -negated.second, univ_scope, used_variables - ) - elif isinstance(negated, IffExpression): - return to_cnf( - skolemize(-negated.first, univ_scope, used_variables), - skolemize(-negated.second, univ_scope, used_variables), - ) & to_cnf( - skolemize(negated.first, univ_scope, used_variables), - skolemize(negated.second, univ_scope, used_variables), - ) - elif isinstance(negated, EqualityExpression): - return expression - elif isinstance(negated, NegatedExpression): - return skolemize(negated.term, univ_scope, used_variables) - elif isinstance(negated, ExistsExpression): - term = skolemize( - -negated.term, - univ_scope | {negated.variable}, - used_variables | {negated.variable}, - ) - return term.replace( - negated.variable, - VariableExpression(unique_variable(ignore=used_variables)), - ) - elif isinstance(negated, ApplicationExpression): - return expression - else: - raise Exception("'%s' cannot be skolemized" % expression) - elif isinstance(expression, ExistsExpression): - term = skolemize( - expression.term, univ_scope, used_variables | {expression.variable} - ) - if univ_scope: - return term.replace(expression.variable, skolem_function(univ_scope)) - else: - skolem_constant = VariableExpression(unique_variable(ignore=used_variables)) - return term.replace(expression.variable, skolem_constant) - elif isinstance(expression, ApplicationExpression): - return expression - else: - raise Exception("'%s' cannot be skolemized" % expression) - - -def to_cnf(first, second): - """ - Convert this split disjunction to conjunctive normal form (CNF) - """ - if isinstance(first, AndExpression): - r_first = to_cnf(first.first, second) - r_second = to_cnf(first.second, second) - return r_first & r_second - elif isinstance(second, AndExpression): - r_first = to_cnf(first, second.first) - r_second = to_cnf(first, second.second) - return r_first & r_second - else: - return first | second diff --git a/pipeline/nltk/sem/util.py b/pipeline/nltk/sem/util.py deleted file mode 100644 index 8d119db424331b9b9873733a0acc6e9b3754a5cb..0000000000000000000000000000000000000000 --- 
a/pipeline/nltk/sem/util.py +++ /dev/null @@ -1,309 +0,0 @@ -# Natural Language Toolkit: Semantic Interpretation -# -# Author: Ewan Klein -# -# Copyright (C) 2001-2023 NLTK Project -# URL: -# For license information, see LICENSE.TXT - -""" -Utility functions for batch-processing sentences: parsing and -extraction of the semantic representation of the root node of the -syntax tree, followed by evaluation of the semantic representation in -a first-order model. -""" - -import codecs - -from nltk.sem import evaluate - -############################################################## -## Utility functions for connecting parse output to semantics -############################################################## - - -def parse_sents(inputs, grammar, trace=0): - """ - Convert input sentences into syntactic trees. - - :param inputs: sentences to be parsed - :type inputs: list(str) - :param grammar: ``FeatureGrammar`` or name of feature-based grammar - :type grammar: nltk.grammar.FeatureGrammar - :rtype: list(nltk.tree.Tree) or dict(list(str)): list(Tree) - :return: a mapping from input sentences to a list of ``Tree`` instances. - """ - # put imports here to avoid circular dependencies - from nltk.grammar import FeatureGrammar - from nltk.parse import FeatureChartParser, load_parser - - if isinstance(grammar, FeatureGrammar): - cp = FeatureChartParser(grammar) - else: - cp = load_parser(grammar, trace=trace) - parses = [] - for sent in inputs: - tokens = sent.split() # use a tokenizer? - syntrees = list(cp.parse(tokens)) - parses.append(syntrees) - return parses - - -def root_semrep(syntree, semkey="SEM"): - """ - Find the semantic representation at the root of a tree. - - :param syntree: a parse ``Tree`` - :param semkey: the feature label to use for the root semantics in the tree - :return: the semantic representation at the root of a ``Tree`` - :rtype: sem.Expression - """ - from nltk.grammar import FeatStructNonterminal - - node = syntree.label() - assert isinstance(node, FeatStructNonterminal) - try: - return node[semkey] - except KeyError: - print(node, end=" ") - print("has no specification for the feature %s" % semkey) - raise - - -def interpret_sents(inputs, grammar, semkey="SEM", trace=0): - """ - Add the semantic representation to each syntactic parse tree - of each input sentence. - - :param inputs: a list of sentences - :type inputs: list(str) - :param grammar: ``FeatureGrammar`` or name of feature-based grammar - :type grammar: nltk.grammar.FeatureGrammar - :return: a mapping from sentences to lists of pairs (parse-tree, semantic-representations) - :rtype: list(list(tuple(nltk.tree.Tree, nltk.sem.logic.ConstantExpression))) - """ - return [ - [(syn, root_semrep(syn, semkey)) for syn in syntrees] - for syntrees in parse_sents(inputs, grammar, trace=trace) - ] - - -def evaluate_sents(inputs, grammar, model, assignment, trace=0): - """ - Add the truth-in-a-model value to each semantic representation - for each syntactic parse of each input sentence. 
- - :param inputs: a list of sentences - :type inputs: list(str) - :param grammar: ``FeatureGrammar`` or name of feature-based grammar - :type grammar: nltk.grammar.FeatureGrammar - :return: a mapping from sentences to lists of triples (parse-tree, semantic-representations, evaluation-in-model) - :rtype: list(list(tuple(nltk.tree.Tree, nltk.sem.logic.ConstantExpression, bool or dict(str): bool))) - """ - return [ - [ - (syn, sem, model.evaluate("%s" % sem, assignment, trace=trace)) - for (syn, sem) in interpretations - ] - for interpretations in interpret_sents(inputs, grammar) - ] - - -def demo_model0(): - global m0, g0 - # Initialize a valuation of non-logical constants.""" - v = [ - ("john", "b1"), - ("mary", "g1"), - ("suzie", "g2"), - ("fido", "d1"), - ("tess", "d2"), - ("noosa", "n"), - ("girl", {"g1", "g2"}), - ("boy", {"b1", "b2"}), - ("dog", {"d1", "d2"}), - ("bark", {"d1", "d2"}), - ("walk", {"b1", "g2", "d1"}), - ("chase", {("b1", "g1"), ("b2", "g1"), ("g1", "d1"), ("g2", "d2")}), - ( - "see", - {("b1", "g1"), ("b2", "d2"), ("g1", "b1"), ("d2", "b1"), ("g2", "n")}, - ), - ("in", {("b1", "n"), ("b2", "n"), ("d2", "n")}), - ("with", {("b1", "g1"), ("g1", "b1"), ("d1", "b1"), ("b1", "d1")}), - ] - # Read in the data from ``v`` - val = evaluate.Valuation(v) - # Bind ``dom`` to the ``domain`` property of ``val`` - dom = val.domain - # Initialize a model with parameters ``dom`` and ``val``. - m0 = evaluate.Model(dom, val) - # Initialize a variable assignment with parameter ``dom`` - g0 = evaluate.Assignment(dom) - - -def read_sents(filename, encoding="utf8"): - with codecs.open(filename, "r", encoding) as fp: - sents = [l.rstrip() for l in fp] - - # get rid of blank lines - sents = [l for l in sents if len(l) > 0] - sents = [l for l in sents if not l[0] == "#"] - return sents - - -def demo_legacy_grammar(): - """ - Check that interpret_sents() is compatible with legacy grammars that use - a lowercase 'sem' feature. - - Define 'test.fcfg' to be the following - - """ - from nltk.grammar import FeatureGrammar - - g = FeatureGrammar.fromstring( - """ - % start S - S[sem=] -> 'hello' - """ - ) - print("Reading grammar: %s" % g) - print("*" * 20) - for reading in interpret_sents(["hello"], g, semkey="sem"): - syn, sem = reading[0] - print() - print("output: ", sem) - - -def demo(): - import sys - from optparse import OptionParser - - description = """ - Parse and evaluate some sentences. 
- """ - - opts = OptionParser(description=description) - - opts.set_defaults( - evaluate=True, - beta=True, - syntrace=0, - semtrace=0, - demo="default", - grammar="", - sentences="", - ) - - opts.add_option( - "-d", - "--demo", - dest="demo", - help="choose demo D; omit this for the default demo, or specify 'chat80'", - metavar="D", - ) - opts.add_option( - "-g", "--gram", dest="grammar", help="read in grammar G", metavar="G" - ) - opts.add_option( - "-m", - "--model", - dest="model", - help="import model M (omit '.py' suffix)", - metavar="M", - ) - opts.add_option( - "-s", - "--sentences", - dest="sentences", - help="read in a file of test sentences S", - metavar="S", - ) - opts.add_option( - "-e", - "--no-eval", - action="store_false", - dest="evaluate", - help="just do a syntactic analysis", - ) - opts.add_option( - "-b", - "--no-beta-reduction", - action="store_false", - dest="beta", - help="don't carry out beta-reduction", - ) - opts.add_option( - "-t", - "--syntrace", - action="count", - dest="syntrace", - help="set syntactic tracing on; requires '-e' option", - ) - opts.add_option( - "-T", - "--semtrace", - action="count", - dest="semtrace", - help="set semantic tracing on", - ) - - (options, args) = opts.parse_args() - - SPACER = "-" * 30 - - demo_model0() - - sents = [ - "Fido sees a boy with Mary", - "John sees Mary", - "every girl chases a dog", - "every boy chases a girl", - "John walks with a girl in Noosa", - "who walks", - ] - - gramfile = "grammars/sample_grammars/sem2.fcfg" - - if options.sentences: - sentsfile = options.sentences - if options.grammar: - gramfile = options.grammar - if options.model: - exec("import %s as model" % options.model) - - if sents is None: - sents = read_sents(sentsfile) - - # Set model and assignment - model = m0 - g = g0 - - if options.evaluate: - evaluations = evaluate_sents(sents, gramfile, model, g, trace=options.semtrace) - else: - semreps = interpret_sents(sents, gramfile, trace=options.syntrace) - - for i, sent in enumerate(sents): - n = 1 - print("\nSentence: %s" % sent) - print(SPACER) - if options.evaluate: - - for (syntree, semrep, value) in evaluations[i]: - if isinstance(value, dict): - value = set(value.keys()) - print("%d: %s" % (n, semrep)) - print(value) - n += 1 - else: - - for (syntree, semrep) in semreps[i]: - print("%d: %s" % (n, semrep)) - n += 1 - - -if __name__ == "__main__": - demo() - demo_legacy_grammar() diff --git a/pipeline/nltk/sentiment/__init__.py b/pipeline/nltk/sentiment/__init__.py deleted file mode 100644 index 37c21108d41d8daafbcee02c34646291db597e88..0000000000000000000000000000000000000000 --- a/pipeline/nltk/sentiment/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Natural Language Toolkit: Sentiment Analysis -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Ewan Klein -# URL: -# For license information, see LICENSE.TXT - -""" -NLTK Sentiment Analysis Package - -""" -from nltk.sentiment.sentiment_analyzer import SentimentAnalyzer -from nltk.sentiment.vader import SentimentIntensityAnalyzer diff --git a/pipeline/nltk/sentiment/__pycache__/__init__.cpython-39.pyc b/pipeline/nltk/sentiment/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index 9776170bcd07c9fd92985a782fff01338ef96168..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/sentiment/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/sentiment/__pycache__/sentiment_analyzer.cpython-39.pyc b/pipeline/nltk/sentiment/__pycache__/sentiment_analyzer.cpython-39.pyc deleted file mode 
100644 index d73b2535fab9e87c749c2f8f74c1952082d000fa..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/sentiment/__pycache__/sentiment_analyzer.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/sentiment/__pycache__/util.cpython-39.pyc b/pipeline/nltk/sentiment/__pycache__/util.cpython-39.pyc deleted file mode 100644 index 40f1844bffa5972bfefbf2028ee08e5f74d353ae..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/sentiment/__pycache__/util.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/sentiment/__pycache__/vader.cpython-39.pyc b/pipeline/nltk/sentiment/__pycache__/vader.cpython-39.pyc deleted file mode 100644 index b1eeb72b00cb06aaf64fc69155fc6195daec02b7..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/sentiment/__pycache__/vader.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/sentiment/sentiment_analyzer.py b/pipeline/nltk/sentiment/sentiment_analyzer.py deleted file mode 100644 index 6654de34d8fbe801bcdde6ef37e1951e207f5ff9..0000000000000000000000000000000000000000 --- a/pipeline/nltk/sentiment/sentiment_analyzer.py +++ /dev/null @@ -1,255 +0,0 @@ -# -# Natural Language Toolkit: Sentiment Analyzer -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Pierpaolo Pantone <24alsecondo@gmail.com> -# URL: -# For license information, see LICENSE.TXT - -""" -A SentimentAnalyzer is a tool to implement and facilitate Sentiment Analysis tasks -using NLTK features and classifiers, especially for teaching and demonstrative -purposes. -""" - -import sys -from collections import defaultdict - -from nltk.classify.util import accuracy as eval_accuracy -from nltk.classify.util import apply_features -from nltk.collocations import BigramCollocationFinder -from nltk.metrics import BigramAssocMeasures -from nltk.metrics import f_measure as eval_f_measure -from nltk.metrics import precision as eval_precision -from nltk.metrics import recall as eval_recall -from nltk.probability import FreqDist - - -class SentimentAnalyzer: - """ - A Sentiment Analysis tool based on machine learning approaches. - """ - - def __init__(self, classifier=None): - self.feat_extractors = defaultdict(list) - self.classifier = classifier - - def all_words(self, documents, labeled=None): - """ - Return all words/tokens from the documents (with duplicates). - - :param documents: a list of (words, label) tuples. - :param labeled: if `True`, assume that each document is represented by a - (words, label) tuple: (list(str), str). If `False`, each document is - considered as being a simple list of strings: list(str). - :rtype: list(str) - :return: A list of all words/tokens in `documents`. - """ - all_words = [] - if labeled is None: - labeled = documents and isinstance(documents[0], tuple) - if labeled: - for words, _sentiment in documents: - all_words.extend(words) - elif not labeled: - for words in documents: - all_words.extend(words) - return all_words - - def apply_features(self, documents, labeled=None): - """ - Apply all feature extractor functions to the documents. This is a wrapper - around `nltk.classify.util.apply_features`. - - If `labeled=False`, return featuresets as: - [feature_func(doc) for doc in documents] - If `labeled=True`, return featuresets as: - [(feature_func(tok), label) for (tok, label) in toks] - - :param documents: a list of documents. `If labeled=True`, the method expects - a list of (words, label) tuples. 
- :rtype: LazyMap - """ - return apply_features(self.extract_features, documents, labeled) - - def unigram_word_feats(self, words, top_n=None, min_freq=0): - """ - Return most common top_n word features. - - :param words: a list of words/tokens. - :param top_n: number of best words/tokens to use, sorted by frequency. - :rtype: list(str) - :return: A list of `top_n` words/tokens (with no duplicates) sorted by - frequency. - """ - # Stopwords are not removed - unigram_feats_freqs = FreqDist(word for word in words) - return [ - w - for w, f in unigram_feats_freqs.most_common(top_n) - if unigram_feats_freqs[w] > min_freq - ] - - def bigram_collocation_feats( - self, documents, top_n=None, min_freq=3, assoc_measure=BigramAssocMeasures.pmi - ): - """ - Return `top_n` bigram features (using `assoc_measure`). - Note that this method is based on bigram collocations measures, and not - on simple bigram frequency. - - :param documents: a list (or iterable) of tokens. - :param top_n: number of best words/tokens to use, sorted by association - measure. - :param assoc_measure: bigram association measure to use as score function. - :param min_freq: the minimum number of occurrencies of bigrams to take - into consideration. - - :return: `top_n` ngrams scored by the given association measure. - """ - finder = BigramCollocationFinder.from_documents(documents) - finder.apply_freq_filter(min_freq) - return finder.nbest(assoc_measure, top_n) - - def classify(self, instance): - """ - Classify a single instance applying the features that have already been - stored in the SentimentAnalyzer. - - :param instance: a list (or iterable) of tokens. - :return: the classification result given by applying the classifier. - """ - instance_feats = self.apply_features([instance], labeled=False) - return self.classifier.classify(instance_feats[0]) - - def add_feat_extractor(self, function, **kwargs): - """ - Add a new function to extract features from a document. This function will - be used in extract_features(). - Important: in this step our kwargs are only representing additional parameters, - and NOT the document we have to parse. The document will always be the first - parameter in the parameter list, and it will be added in the extract_features() - function. - - :param function: the extractor function to add to the list of feature extractors. - :param kwargs: additional parameters required by the `function` function. - """ - self.feat_extractors[function].append(kwargs) - - def extract_features(self, document): - """ - Apply extractor functions (and their parameters) to the present document. - We pass `document` as the first parameter of the extractor functions. - If we want to use the same extractor function multiple times, we have to - add it to the extractors with `add_feat_extractor` using multiple sets of - parameters (one for each call of the extractor function). - - :param document: the document that will be passed as argument to the - feature extractor functions. - :return: A dictionary of populated features extracted from the document. - :rtype: dict - """ - all_features = {} - for extractor in self.feat_extractors: - for param_set in self.feat_extractors[extractor]: - feats = extractor(document, **param_set) - all_features.update(feats) - return all_features - - def train(self, trainer, training_set, save_classifier=None, **kwargs): - """ - Train classifier on the training set, optionally saving the output in the - file specified by `save_classifier`. - Additional arguments depend on the specific trainer used. 
For example, - a MaxentClassifier can use `max_iter` parameter to specify the number - of iterations, while a NaiveBayesClassifier cannot. - - :param trainer: `train` method of a classifier. - E.g.: NaiveBayesClassifier.train - :param training_set: the training set to be passed as argument to the - classifier `train` method. - :param save_classifier: the filename of the file where the classifier - will be stored (optional). - :param kwargs: additional parameters that will be passed as arguments to - the classifier `train` function. - :return: A classifier instance trained on the training set. - :rtype: - """ - print("Training classifier") - self.classifier = trainer(training_set, **kwargs) - if save_classifier: - self.save_file(self.classifier, save_classifier) - - return self.classifier - - def save_file(self, content, filename): - """ - Store `content` in `filename`. Can be used to store a SentimentAnalyzer. - """ - print("Saving", filename, file=sys.stderr) - with open(filename, "wb") as storage_file: - import pickle - - # The protocol=2 parameter is for python2 compatibility - pickle.dump(content, storage_file, protocol=2) - - def evaluate( - self, - test_set, - classifier=None, - accuracy=True, - f_measure=True, - precision=True, - recall=True, - verbose=False, - ): - """ - Evaluate and print classifier performance on the test set. - - :param test_set: A list of (tokens, label) tuples to use as gold set. - :param classifier: a classifier instance (previously trained). - :param accuracy: if `True`, evaluate classifier accuracy. - :param f_measure: if `True`, evaluate classifier f_measure. - :param precision: if `True`, evaluate classifier precision. - :param recall: if `True`, evaluate classifier recall. - :return: evaluation results. - :rtype: dict(str): float - """ - if classifier is None: - classifier = self.classifier - print(f"Evaluating {type(classifier).__name__} results...") - metrics_results = {} - if accuracy: - accuracy_score = eval_accuracy(classifier, test_set) - metrics_results["Accuracy"] = accuracy_score - - gold_results = defaultdict(set) - test_results = defaultdict(set) - labels = set() - for i, (feats, label) in enumerate(test_set): - labels.add(label) - gold_results[label].add(i) - observed = classifier.classify(feats) - test_results[observed].add(i) - - for label in labels: - if precision: - precision_score = eval_precision( - gold_results[label], test_results[label] - ) - metrics_results[f"Precision [{label}]"] = precision_score - if recall: - recall_score = eval_recall(gold_results[label], test_results[label]) - metrics_results[f"Recall [{label}]"] = recall_score - if f_measure: - f_measure_score = eval_f_measure( - gold_results[label], test_results[label] - ) - metrics_results[f"F-measure [{label}]"] = f_measure_score - - # Print evaluation results (in alphabetical order) - if verbose: - for result in sorted(metrics_results): - print(f"{result}: {metrics_results[result]}") - - return metrics_results diff --git a/pipeline/nltk/sentiment/util.py b/pipeline/nltk/sentiment/util.py deleted file mode 100644 index 0a698981e1d2be99e97e5e474f016781921a2595..0000000000000000000000000000000000000000 --- a/pipeline/nltk/sentiment/util.py +++ /dev/null @@ -1,887 +0,0 @@ -# -# Natural Language Toolkit: Sentiment Analyzer -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Pierpaolo Pantone <24alsecondo@gmail.com> -# URL: -# For license information, see LICENSE.TXT - -""" -Utility methods for Sentiment Analysis. 
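For reference, the ``SentimentAnalyzer`` workflow deleted above (feature extractors, ``apply_features``, ``train``, ``evaluate``) runs end to end on a toy dataset without any corpus download. A minimal sketch, assuming a standard installed ``nltk``; the documents and labels are invented for illustration:

from nltk.classify import NaiveBayesClassifier
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import extract_unigram_feats, mark_negation

train_docs = [
    (["a", "great", "movie"], "pos"),
    (["simply", "great"], "pos"),
    (["a", "terrible", "movie"], "neg"),
    (["just", "terrible"], "neg"),
]
test_docs = [(["great", "film"], "pos"), (["terrible", "film"], "neg")]

analyzer = SentimentAnalyzer()
# Collect the vocabulary (mark_negation is a no-op here; shown for completeness).
all_words = analyzer.all_words([mark_negation(d) for d in train_docs])
unigram_feats = analyzer.unigram_word_feats(all_words, min_freq=1)
analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

training_set = analyzer.apply_features(train_docs)
test_set = analyzer.apply_features(test_docs)
classifier = analyzer.train(NaiveBayesClassifier.train, training_set)
print(analyzer.evaluate(test_set))        # accuracy plus per-label precision/recall/F-measure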
-""" - -import codecs -import csv -import json -import pickle -import random -import re -import sys -import time -from copy import deepcopy - -import nltk -from nltk.corpus import CategorizedPlaintextCorpusReader -from nltk.data import load -from nltk.tokenize.casual import EMOTICON_RE - -# //////////////////////////////////////////////////////////// -# { Regular expressions -# //////////////////////////////////////////////////////////// - -# Regular expression for negation by Christopher Potts -NEGATION = r""" - (?: - ^(?:never|no|nothing|nowhere|noone|none|not| - havent|hasnt|hadnt|cant|couldnt|shouldnt| - wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint - )$ - ) - | - n't""" - -NEGATION_RE = re.compile(NEGATION, re.VERBOSE) - -CLAUSE_PUNCT = r"^[.:;!?]$" -CLAUSE_PUNCT_RE = re.compile(CLAUSE_PUNCT) - -# Happy and sad emoticons - -HAPPY = { - ":-)", - ":)", - ";)", - ":o)", - ":]", - ":3", - ":c)", - ":>", - "=]", - "8)", - "=)", - ":}", - ":^)", - ":-D", - ":D", - "8-D", - "8D", - "x-D", - "xD", - "X-D", - "XD", - "=-D", - "=D", - "=-3", - "=3", - ":-))", - ":'-)", - ":')", - ":*", - ":^*", - ">:P", - ":-P", - ":P", - "X-P", - "x-p", - "xp", - "XP", - ":-p", - ":p", - "=p", - ":-b", - ":b", - ">:)", - ">;)", - ">:-)", - "<3", -} - -SAD = { - ":L", - ":-/", - ">:/", - ":S", - ">:[", - ":@", - ":-(", - ":[", - ":-||", - "=L", - ":<", - ":-[", - ":-<", - "=\\", - "=/", - ">:(", - ":(", - ">.<", - ":'-(", - ":'(", - ":\\", - ":-c", - ":c", - ":{", - ">:\\", - ";(", -} - - -def timer(method): - """ - A timer decorator to measure execution performance of methods. - """ - - def timed(*args, **kw): - start = time.time() - result = method(*args, **kw) - end = time.time() - tot_time = end - start - hours = tot_time // 3600 - mins = tot_time // 60 % 60 - # in Python 2.x round() will return a float, so we convert it to int - secs = int(round(tot_time % 60)) - if hours == 0 and mins == 0 and secs < 10: - print(f"[TIMER] {method.__name__}(): {method.__name__:.3f} seconds") - else: - print(f"[TIMER] {method.__name__}(): {hours}h {mins}m {secs}s") - return result - - return timed - - -# //////////////////////////////////////////////////////////// -# { Feature extractor functions -# //////////////////////////////////////////////////////////// -""" -Feature extractor functions are declared outside the SentimentAnalyzer class. -Users should have the possibility to create their own feature extractors -without modifying SentimentAnalyzer. -""" - - -def extract_unigram_feats(document, unigrams, handle_negation=False): - """ - Populate a dictionary of unigram features, reflecting the presence/absence in - the document of each of the tokens in `unigrams`. - - :param document: a list of words/tokens. - :param unigrams: a list of words/tokens whose presence/absence has to be - checked in `document`. - :param handle_negation: if `handle_negation == True` apply `mark_negation` - method to `document` before checking for unigram presence/absence. - :return: a dictionary of unigram features {unigram : boolean}. 
- - >>> words = ['ice', 'police', 'riot'] - >>> document = 'ice is melting due to global warming'.split() - >>> sorted(extract_unigram_feats(document, words).items()) - [('contains(ice)', True), ('contains(police)', False), ('contains(riot)', False)] - """ - features = {} - if handle_negation: - document = mark_negation(document) - for word in unigrams: - features[f"contains({word})"] = word in set(document) - return features - - -def extract_bigram_feats(document, bigrams): - """ - Populate a dictionary of bigram features, reflecting the presence/absence in - the document of each of the tokens in `bigrams`. This extractor function only - considers contiguous bigrams obtained by `nltk.bigrams`. - - :param document: a list of words/tokens. - :param unigrams: a list of bigrams whose presence/absence has to be - checked in `document`. - :return: a dictionary of bigram features {bigram : boolean}. - - >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')] - >>> document = 'ice is melting due to global warming'.split() - >>> sorted(extract_bigram_feats(document, bigrams).items()) # doctest: +NORMALIZE_WHITESPACE - [('contains(global - warming)', True), ('contains(love - you)', False), - ('contains(police - prevented)', False)] - """ - features = {} - for bigr in bigrams: - features[f"contains({bigr[0]} - {bigr[1]})"] = bigr in nltk.bigrams(document) - return features - - -# //////////////////////////////////////////////////////////// -# { Helper Functions -# //////////////////////////////////////////////////////////// - - -def mark_negation(document, double_neg_flip=False, shallow=False): - """ - Append _NEG suffix to words that appear in the scope between a negation - and a punctuation mark. - - :param document: a list of words/tokens, or a tuple (words, label). - :param shallow: if True, the method will modify the original document in place. - :param double_neg_flip: if True, double negation is considered affirmation - (we activate/deactivate negation scope every time we find a negation). - :return: if `shallow == True` the method will modify the original document - and return it. If `shallow == False` the method will return a modified - document, leaving the original unmodified. - - >>> sent = "I didn't like this movie . It was bad .".split() - >>> mark_negation(sent) - ['I', "didn't", 'like_NEG', 'this_NEG', 'movie_NEG', '.', 'It', 'was', 'bad', '.'] - """ - if not shallow: - document = deepcopy(document) - # check if the document is labeled. If so, do not consider the label. - labeled = document and isinstance(document[0], (tuple, list)) - if labeled: - doc = document[0] - else: - doc = document - neg_scope = False - for i, word in enumerate(doc): - if NEGATION_RE.search(word): - if not neg_scope or (neg_scope and double_neg_flip): - neg_scope = not neg_scope - continue - else: - doc[i] += "_NEG" - elif neg_scope and CLAUSE_PUNCT_RE.search(word): - neg_scope = not neg_scope - elif neg_scope and not CLAUSE_PUNCT_RE.search(word): - doc[i] += "_NEG" - - return document - - -def output_markdown(filename, **kwargs): - """ - Write the output of an analysis to a file. 
- """ - with codecs.open(filename, "at") as outfile: - text = "\n*** \n\n" - text += "{} \n\n".format(time.strftime("%d/%m/%Y, %H:%M")) - for k in sorted(kwargs): - if isinstance(kwargs[k], dict): - dictionary = kwargs[k] - text += f" - **{k}:**\n" - for entry in sorted(dictionary): - text += f" - {entry}: {dictionary[entry]} \n" - elif isinstance(kwargs[k], list): - text += f" - **{k}:**\n" - for entry in kwargs[k]: - text += f" - {entry}\n" - else: - text += f" - **{k}:** {kwargs[k]} \n" - outfile.write(text) - - -def split_train_test(all_instances, n=None): - """ - Randomly split `n` instances of the dataset into train and test sets. - - :param all_instances: a list of instances (e.g. documents) that will be split. - :param n: the number of instances to consider (in case we want to use only a - subset). - :return: two lists of instances. Train set is 8/10 of the total and test set - is 2/10 of the total. - """ - random.seed(12345) - random.shuffle(all_instances) - if not n or n > len(all_instances): - n = len(all_instances) - train_set = all_instances[: int(0.8 * n)] - test_set = all_instances[int(0.8 * n) : n] - - return train_set, test_set - - -def _show_plot(x_values, y_values, x_labels=None, y_labels=None): - try: - import matplotlib.pyplot as plt - except ImportError as e: - raise ImportError( - "The plot function requires matplotlib to be installed." - "See https://matplotlib.org/" - ) from e - - plt.locator_params(axis="y", nbins=3) - axes = plt.axes() - axes.yaxis.grid() - plt.plot(x_values, y_values, "ro", color="red") - plt.ylim(ymin=-1.2, ymax=1.2) - plt.tight_layout(pad=5) - if x_labels: - plt.xticks(x_values, x_labels, rotation="vertical") - if y_labels: - plt.yticks([-1, 0, 1], y_labels, rotation="horizontal") - # Pad margins so that markers are not clipped by the axes - plt.margins(0.2) - plt.show() - - -# //////////////////////////////////////////////////////////// -# { Parsing and conversion functions -# //////////////////////////////////////////////////////////// - - -def json2csv_preprocess( - json_file, - outfile, - fields, - encoding="utf8", - errors="replace", - gzip_compress=False, - skip_retweets=True, - skip_tongue_tweets=True, - skip_ambiguous_tweets=True, - strip_off_emoticons=True, - remove_duplicates=True, - limit=None, -): - """ - Convert json file to csv file, preprocessing each row to obtain a suitable - dataset for tweets Semantic Analysis. - - :param json_file: the original json file containing tweets. - :param outfile: the output csv filename. - :param fields: a list of fields that will be extracted from the json file and - kept in the output csv file. - :param encoding: the encoding of the files. - :param errors: the error handling strategy for the output writer. - :param gzip_compress: if True, create a compressed GZIP file. - - :param skip_retweets: if True, remove retweets. - :param skip_tongue_tweets: if True, remove tweets containing ":P" and ":-P" - emoticons. - :param skip_ambiguous_tweets: if True, remove tweets containing both happy - and sad emoticons. - :param strip_off_emoticons: if True, strip off emoticons from all tweets. - :param remove_duplicates: if True, remove tweets appearing more than once. - :param limit: an integer to set the number of tweets to convert. After the - limit is reached the conversion will stop. It can be useful to create - subsets of the original tweets json data. 
- """ - with codecs.open(json_file, encoding=encoding) as fp: - (writer, outf) = _outf_writer(outfile, encoding, errors, gzip_compress) - # write the list of fields as header - writer.writerow(fields) - - if remove_duplicates == True: - tweets_cache = [] - i = 0 - for line in fp: - tweet = json.loads(line) - row = extract_fields(tweet, fields) - try: - text = row[fields.index("text")] - # Remove retweets - if skip_retweets == True: - if re.search(r"\bRT\b", text): - continue - # Remove tweets containing ":P" and ":-P" emoticons - if skip_tongue_tweets == True: - if re.search(r"\:\-?P\b", text): - continue - # Remove tweets containing both happy and sad emoticons - if skip_ambiguous_tweets == True: - all_emoticons = EMOTICON_RE.findall(text) - if all_emoticons: - if (set(all_emoticons) & HAPPY) and (set(all_emoticons) & SAD): - continue - # Strip off emoticons from all tweets - if strip_off_emoticons == True: - row[fields.index("text")] = re.sub( - r"(?!\n)\s+", " ", EMOTICON_RE.sub("", text) - ) - # Remove duplicate tweets - if remove_duplicates == True: - if row[fields.index("text")] in tweets_cache: - continue - else: - tweets_cache.append(row[fields.index("text")]) - except ValueError: - pass - writer.writerow(row) - i += 1 - if limit and i >= limit: - break - outf.close() - - -def parse_tweets_set( - filename, label, word_tokenizer=None, sent_tokenizer=None, skip_header=True -): - """ - Parse csv file containing tweets and output data a list of (text, label) tuples. - - :param filename: the input csv filename. - :param label: the label to be appended to each tweet contained in the csv file. - :param word_tokenizer: the tokenizer instance that will be used to tokenize - each sentence into tokens (e.g. WordPunctTokenizer() or BlanklineTokenizer()). - If no word_tokenizer is specified, tweets will not be tokenized. - :param sent_tokenizer: the tokenizer that will be used to split each tweet into - sentences. - :param skip_header: if True, skip the first line of the csv file (which usually - contains headers). - - :return: a list of (text, label) tuples. - """ - tweets = [] - if not sent_tokenizer: - sent_tokenizer = load("tokenizers/punkt/english.pickle") - - with codecs.open(filename, "rt") as csvfile: - reader = csv.reader(csvfile) - if skip_header == True: - next(reader, None) # skip the header - i = 0 - for tweet_id, text in reader: - # text = text[1] - i += 1 - sys.stdout.write(f"Loaded {i} tweets\r") - # Apply sentence and word tokenizer to text - if word_tokenizer: - tweet = [ - w - for sent in sent_tokenizer.tokenize(text) - for w in word_tokenizer.tokenize(sent) - ] - else: - tweet = text - tweets.append((tweet, label)) - - print(f"Loaded {i} tweets") - return tweets - - -# //////////////////////////////////////////////////////////// -# { Demos -# //////////////////////////////////////////////////////////// - - -def demo_tweets(trainer, n_instances=None, output=None): - """ - Train and test Naive Bayes classifier on 10000 tweets, tokenized using - TweetTokenizer. - Features are composed of: - - - 1000 most frequent unigrams - - 100 top bigrams (using BigramAssocMeasures.pmi) - - :param trainer: `train` method of a classifier. - :param n_instances: the number of total tweets that have to be used for - training and testing. Tweets will be equally split between positive and - negative. - :param output: the output file where results have to be reported. 
- """ - from nltk.corpus import stopwords, twitter_samples - from nltk.sentiment import SentimentAnalyzer - from nltk.tokenize import TweetTokenizer - - # Different customizations for the TweetTokenizer - tokenizer = TweetTokenizer(preserve_case=False) - # tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True) - # tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True) - - if n_instances is not None: - n_instances = int(n_instances / 2) - - fields = ["id", "text"] - positive_json = twitter_samples.abspath("positive_tweets.json") - positive_csv = "positive_tweets.csv" - json2csv_preprocess(positive_json, positive_csv, fields, limit=n_instances) - - negative_json = twitter_samples.abspath("negative_tweets.json") - negative_csv = "negative_tweets.csv" - json2csv_preprocess(negative_json, negative_csv, fields, limit=n_instances) - - neg_docs = parse_tweets_set(negative_csv, label="neg", word_tokenizer=tokenizer) - pos_docs = parse_tweets_set(positive_csv, label="pos", word_tokenizer=tokenizer) - - # We separately split subjective and objective instances to keep a balanced - # uniform class distribution in both train and test sets. - train_pos_docs, test_pos_docs = split_train_test(pos_docs) - train_neg_docs, test_neg_docs = split_train_test(neg_docs) - - training_tweets = train_pos_docs + train_neg_docs - testing_tweets = test_pos_docs + test_neg_docs - - sentim_analyzer = SentimentAnalyzer() - # stopwords = stopwords.words('english') - # all_words = [word for word in sentim_analyzer.all_words(training_tweets) if word.lower() not in stopwords] - all_words = [word for word in sentim_analyzer.all_words(training_tweets)] - - # Add simple unigram word features - unigram_feats = sentim_analyzer.unigram_word_feats(all_words, top_n=1000) - sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats) - - # Add bigram collocation features - bigram_collocs_feats = sentim_analyzer.bigram_collocation_feats( - [tweet[0] for tweet in training_tweets], top_n=100, min_freq=12 - ) - sentim_analyzer.add_feat_extractor( - extract_bigram_feats, bigrams=bigram_collocs_feats - ) - - training_set = sentim_analyzer.apply_features(training_tweets) - test_set = sentim_analyzer.apply_features(testing_tweets) - - classifier = sentim_analyzer.train(trainer, training_set) - # classifier = sentim_analyzer.train(trainer, training_set, max_iter=4) - try: - classifier.show_most_informative_features() - except AttributeError: - print( - "Your classifier does not provide a show_most_informative_features() method." - ) - results = sentim_analyzer.evaluate(test_set) - - if output: - extr = [f.__name__ for f in sentim_analyzer.feat_extractors] - output_markdown( - output, - Dataset="labeled_tweets", - Classifier=type(classifier).__name__, - Tokenizer=tokenizer.__class__.__name__, - Feats=extr, - Results=results, - Instances=n_instances, - ) - - -def demo_movie_reviews(trainer, n_instances=None, output=None): - """ - Train classifier on all instances of the Movie Reviews dataset. - The corpus has been preprocessed using the default sentence tokenizer and - WordPunctTokenizer. - Features are composed of: - - - most frequent unigrams - - :param trainer: `train` method of a classifier. - :param n_instances: the number of total reviews that have to be used for - training and testing. Reviews will be equally split between positive and - negative. - :param output: the output file where results have to be reported. 
- """ - from nltk.corpus import movie_reviews - from nltk.sentiment import SentimentAnalyzer - - if n_instances is not None: - n_instances = int(n_instances / 2) - - pos_docs = [ - (list(movie_reviews.words(pos_id)), "pos") - for pos_id in movie_reviews.fileids("pos")[:n_instances] - ] - neg_docs = [ - (list(movie_reviews.words(neg_id)), "neg") - for neg_id in movie_reviews.fileids("neg")[:n_instances] - ] - # We separately split positive and negative instances to keep a balanced - # uniform class distribution in both train and test sets. - train_pos_docs, test_pos_docs = split_train_test(pos_docs) - train_neg_docs, test_neg_docs = split_train_test(neg_docs) - - training_docs = train_pos_docs + train_neg_docs - testing_docs = test_pos_docs + test_neg_docs - - sentim_analyzer = SentimentAnalyzer() - all_words = sentim_analyzer.all_words(training_docs) - - # Add simple unigram word features - unigram_feats = sentim_analyzer.unigram_word_feats(all_words, min_freq=4) - sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats) - # Apply features to obtain a feature-value representation of our datasets - training_set = sentim_analyzer.apply_features(training_docs) - test_set = sentim_analyzer.apply_features(testing_docs) - - classifier = sentim_analyzer.train(trainer, training_set) - try: - classifier.show_most_informative_features() - except AttributeError: - print( - "Your classifier does not provide a show_most_informative_features() method." - ) - results = sentim_analyzer.evaluate(test_set) - - if output: - extr = [f.__name__ for f in sentim_analyzer.feat_extractors] - output_markdown( - output, - Dataset="Movie_reviews", - Classifier=type(classifier).__name__, - Tokenizer="WordPunctTokenizer", - Feats=extr, - Results=results, - Instances=n_instances, - ) - - -def demo_subjectivity(trainer, save_analyzer=False, n_instances=None, output=None): - """ - Train and test a classifier on instances of the Subjective Dataset by Pang and - Lee. The dataset is made of 5000 subjective and 5000 objective sentences. - All tokens (words and punctuation marks) are separated by a whitespace, so - we use the basic WhitespaceTokenizer to parse the data. - - :param trainer: `train` method of a classifier. - :param save_analyzer: if `True`, store the SentimentAnalyzer in a pickle file. - :param n_instances: the number of total sentences that have to be used for - training and testing. Sentences will be equally split between positive - and negative. - :param output: the output file where results have to be reported. - """ - from nltk.corpus import subjectivity - from nltk.sentiment import SentimentAnalyzer - - if n_instances is not None: - n_instances = int(n_instances / 2) - - subj_docs = [ - (sent, "subj") for sent in subjectivity.sents(categories="subj")[:n_instances] - ] - obj_docs = [ - (sent, "obj") for sent in subjectivity.sents(categories="obj")[:n_instances] - ] - - # We separately split subjective and objective instances to keep a balanced - # uniform class distribution in both train and test sets. 
- train_subj_docs, test_subj_docs = split_train_test(subj_docs) - train_obj_docs, test_obj_docs = split_train_test(obj_docs) - - training_docs = train_subj_docs + train_obj_docs - testing_docs = test_subj_docs + test_obj_docs - - sentim_analyzer = SentimentAnalyzer() - all_words_neg = sentim_analyzer.all_words( - [mark_negation(doc) for doc in training_docs] - ) - - # Add simple unigram word features handling negation - unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4) - sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats) - - # Apply features to obtain a feature-value representation of our datasets - training_set = sentim_analyzer.apply_features(training_docs) - test_set = sentim_analyzer.apply_features(testing_docs) - - classifier = sentim_analyzer.train(trainer, training_set) - try: - classifier.show_most_informative_features() - except AttributeError: - print( - "Your classifier does not provide a show_most_informative_features() method." - ) - results = sentim_analyzer.evaluate(test_set) - - if save_analyzer == True: - sentim_analyzer.save_file(sentim_analyzer, "sa_subjectivity.pickle") - - if output: - extr = [f.__name__ for f in sentim_analyzer.feat_extractors] - output_markdown( - output, - Dataset="subjectivity", - Classifier=type(classifier).__name__, - Tokenizer="WhitespaceTokenizer", - Feats=extr, - Instances=n_instances, - Results=results, - ) - - return sentim_analyzer - - -def demo_sent_subjectivity(text): - """ - Classify a single sentence as subjective or objective using a stored - SentimentAnalyzer. - - :param text: a sentence whose subjectivity has to be classified. - """ - from nltk.classify import NaiveBayesClassifier - from nltk.tokenize import regexp - - word_tokenizer = regexp.WhitespaceTokenizer() - try: - sentim_analyzer = load("sa_subjectivity.pickle") - except LookupError: - print("Cannot find the sentiment analyzer you want to load.") - print("Training a new one using NaiveBayesClassifier.") - sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, True) - - # Tokenize and convert to lower case - tokenized_text = [word.lower() for word in word_tokenizer.tokenize(text)] - print(sentim_analyzer.classify(tokenized_text)) - - -def demo_liu_hu_lexicon(sentence, plot=False): - """ - Basic example of sentiment classification using Liu and Hu opinion lexicon. - This function simply counts the number of positive, negative and neutral words - in the sentence and classifies it depending on which polarity is more represented. - Words that do not appear in the lexicon are considered as neutral. - - :param sentence: a sentence whose polarity has to be classified. - :param plot: if True, plot a visual representation of the sentence polarity. 
- """ - from nltk.corpus import opinion_lexicon - from nltk.tokenize import treebank - - tokenizer = treebank.TreebankWordTokenizer() - pos_words = 0 - neg_words = 0 - tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)] - - x = list(range(len(tokenized_sent))) # x axis for the plot - y = [] - - for word in tokenized_sent: - if word in opinion_lexicon.positive(): - pos_words += 1 - y.append(1) # positive - elif word in opinion_lexicon.negative(): - neg_words += 1 - y.append(-1) # negative - else: - y.append(0) # neutral - - if pos_words > neg_words: - print("Positive") - elif pos_words < neg_words: - print("Negative") - elif pos_words == neg_words: - print("Neutral") - - if plot == True: - _show_plot( - x, y, x_labels=tokenized_sent, y_labels=["Negative", "Neutral", "Positive"] - ) - - -def demo_vader_instance(text): - """ - Output polarity scores for a text using Vader approach. - - :param text: a text whose polarity has to be evaluated. - """ - from nltk.sentiment import SentimentIntensityAnalyzer - - vader_analyzer = SentimentIntensityAnalyzer() - print(vader_analyzer.polarity_scores(text)) - - -def demo_vader_tweets(n_instances=None, output=None): - """ - Classify 10000 positive and negative tweets using Vader approach. - - :param n_instances: the number of total tweets that have to be classified. - :param output: the output file where results have to be reported. - """ - from collections import defaultdict - - from nltk.corpus import twitter_samples - from nltk.metrics import accuracy as eval_accuracy - from nltk.metrics import f_measure as eval_f_measure - from nltk.metrics import precision as eval_precision - from nltk.metrics import recall as eval_recall - from nltk.sentiment import SentimentIntensityAnalyzer - - if n_instances is not None: - n_instances = int(n_instances / 2) - - fields = ["id", "text"] - positive_json = twitter_samples.abspath("positive_tweets.json") - positive_csv = "positive_tweets.csv" - json2csv_preprocess( - positive_json, - positive_csv, - fields, - strip_off_emoticons=False, - limit=n_instances, - ) - - negative_json = twitter_samples.abspath("negative_tweets.json") - negative_csv = "negative_tweets.csv" - json2csv_preprocess( - negative_json, - negative_csv, - fields, - strip_off_emoticons=False, - limit=n_instances, - ) - - pos_docs = parse_tweets_set(positive_csv, label="pos") - neg_docs = parse_tweets_set(negative_csv, label="neg") - - # We separately split subjective and objective instances to keep a balanced - # uniform class distribution in both train and test sets. 
- train_pos_docs, test_pos_docs = split_train_test(pos_docs) - train_neg_docs, test_neg_docs = split_train_test(neg_docs) - - training_tweets = train_pos_docs + train_neg_docs - testing_tweets = test_pos_docs + test_neg_docs - - vader_analyzer = SentimentIntensityAnalyzer() - - gold_results = defaultdict(set) - test_results = defaultdict(set) - acc_gold_results = [] - acc_test_results = [] - labels = set() - num = 0 - for i, (text, label) in enumerate(testing_tweets): - labels.add(label) - gold_results[label].add(i) - acc_gold_results.append(label) - score = vader_analyzer.polarity_scores(text)["compound"] - if score > 0: - observed = "pos" - else: - observed = "neg" - num += 1 - acc_test_results.append(observed) - test_results[observed].add(i) - metrics_results = {} - for label in labels: - accuracy_score = eval_accuracy(acc_gold_results, acc_test_results) - metrics_results["Accuracy"] = accuracy_score - precision_score = eval_precision(gold_results[label], test_results[label]) - metrics_results[f"Precision [{label}]"] = precision_score - recall_score = eval_recall(gold_results[label], test_results[label]) - metrics_results[f"Recall [{label}]"] = recall_score - f_measure_score = eval_f_measure(gold_results[label], test_results[label]) - metrics_results[f"F-measure [{label}]"] = f_measure_score - - for result in sorted(metrics_results): - print(f"{result}: {metrics_results[result]}") - - if output: - output_markdown( - output, - Approach="Vader", - Dataset="labeled_tweets", - Instances=n_instances, - Results=metrics_results, - ) - - -if __name__ == "__main__": - from sklearn.svm import LinearSVC - - from nltk.classify import MaxentClassifier, NaiveBayesClassifier - from nltk.classify.scikitlearn import SklearnClassifier - from nltk.twitter.common import _outf_writer, extract_fields - - naive_bayes = NaiveBayesClassifier.train - svm = SklearnClassifier(LinearSVC()).train - maxent = MaxentClassifier.train - - demo_tweets(naive_bayes) - # demo_movie_reviews(svm) - # demo_subjectivity(svm) - # demo_sent_subjectivity("she's an artist , but hasn't picked up a brush in a year . ") - # demo_liu_hu_lexicon("This movie was actually neither that funny, nor super witty.", plot=True) - # demo_vader_instance("This movie was actually neither that funny, nor super witty.") - # demo_vader_tweets() diff --git a/pipeline/nltk/sentiment/vader.py b/pipeline/nltk/sentiment/vader.py deleted file mode 100644 index 2381b39a3a0da7750506283db9b2d3a5fe1d4633..0000000000000000000000000000000000000000 --- a/pipeline/nltk/sentiment/vader.py +++ /dev/null @@ -1,633 +0,0 @@ -# Natural Language Toolkit: vader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: C.J. Hutto -# Ewan Klein (modifications) -# Pierpaolo Pantone <24alsecondo@gmail.com> (modifications) -# George Berry (modifications) -# Malavika Suresh (modifications) -# URL: -# For license information, see LICENSE.TXT -# -# Modifications to the original VADER code have been made in order to -# integrate it into NLTK. These have involved changes to -# ensure Python 3 compatibility, and refactoring to achieve greater modularity. - -""" -If you use the VADER sentiment analysis tools, please cite: - -Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for -Sentiment Analysis of Social Media Text. Eighth International Conference on -Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014. 
-""" - -import math -import re -import string -from itertools import product - -import nltk.data -from nltk.util import pairwise - - -class VaderConstants: - """ - A class to keep the Vader lists and constants. - """ - - ##Constants## - # (empirically derived mean sentiment intensity rating increase for booster words) - B_INCR = 0.293 - B_DECR = -0.293 - - # (empirically derived mean sentiment intensity rating increase for using - # ALLCAPs to emphasize a word) - C_INCR = 0.733 - - N_SCALAR = -0.74 - - NEGATE = { - "aint", - "arent", - "cannot", - "cant", - "couldnt", - "darent", - "didnt", - "doesnt", - "ain't", - "aren't", - "can't", - "couldn't", - "daren't", - "didn't", - "doesn't", - "dont", - "hadnt", - "hasnt", - "havent", - "isnt", - "mightnt", - "mustnt", - "neither", - "don't", - "hadn't", - "hasn't", - "haven't", - "isn't", - "mightn't", - "mustn't", - "neednt", - "needn't", - "never", - "none", - "nope", - "nor", - "not", - "nothing", - "nowhere", - "oughtnt", - "shant", - "shouldnt", - "uhuh", - "wasnt", - "werent", - "oughtn't", - "shan't", - "shouldn't", - "uh-uh", - "wasn't", - "weren't", - "without", - "wont", - "wouldnt", - "won't", - "wouldn't", - "rarely", - "seldom", - "despite", - } - - # booster/dampener 'intensifiers' or 'degree adverbs' - # https://en.wiktionary.org/wiki/Category:English_degree_adverbs - - BOOSTER_DICT = { - "absolutely": B_INCR, - "amazingly": B_INCR, - "awfully": B_INCR, - "completely": B_INCR, - "considerably": B_INCR, - "decidedly": B_INCR, - "deeply": B_INCR, - "effing": B_INCR, - "enormously": B_INCR, - "entirely": B_INCR, - "especially": B_INCR, - "exceptionally": B_INCR, - "extremely": B_INCR, - "fabulously": B_INCR, - "flipping": B_INCR, - "flippin": B_INCR, - "fricking": B_INCR, - "frickin": B_INCR, - "frigging": B_INCR, - "friggin": B_INCR, - "fully": B_INCR, - "fucking": B_INCR, - "greatly": B_INCR, - "hella": B_INCR, - "highly": B_INCR, - "hugely": B_INCR, - "incredibly": B_INCR, - "intensely": B_INCR, - "majorly": B_INCR, - "more": B_INCR, - "most": B_INCR, - "particularly": B_INCR, - "purely": B_INCR, - "quite": B_INCR, - "really": B_INCR, - "remarkably": B_INCR, - "so": B_INCR, - "substantially": B_INCR, - "thoroughly": B_INCR, - "totally": B_INCR, - "tremendously": B_INCR, - "uber": B_INCR, - "unbelievably": B_INCR, - "unusually": B_INCR, - "utterly": B_INCR, - "very": B_INCR, - "almost": B_DECR, - "barely": B_DECR, - "hardly": B_DECR, - "just enough": B_DECR, - "kind of": B_DECR, - "kinda": B_DECR, - "kindof": B_DECR, - "kind-of": B_DECR, - "less": B_DECR, - "little": B_DECR, - "marginally": B_DECR, - "occasionally": B_DECR, - "partly": B_DECR, - "scarcely": B_DECR, - "slightly": B_DECR, - "somewhat": B_DECR, - "sort of": B_DECR, - "sorta": B_DECR, - "sortof": B_DECR, - "sort-of": B_DECR, - } - - # check for special case idioms using a sentiment-laden keyword known to SAGE - SPECIAL_CASE_IDIOMS = { - "the shit": 3, - "the bomb": 3, - "bad ass": 1.5, - "yeah right": -2, - "cut the mustard": 2, - "kiss of death": -1.5, - "hand to mouth": -2, - } - - # for removing punctuation - REGEX_REMOVE_PUNCTUATION = re.compile(f"[{re.escape(string.punctuation)}]") - - PUNC_LIST = [ - ".", - "!", - "?", - ",", - ";", - ":", - "-", - "'", - '"', - "!!", - "!!!", - "??", - "???", - "?!?", - "!?!", - "?!?!", - "!?!?", - ] - - def __init__(self): - pass - - def negated(self, input_words, include_nt=True): - """ - Determine if input contains negation words - """ - neg_words = self.NEGATE - if any(word.lower() in neg_words for word in input_words): - 
return True - if include_nt: - if any("n't" in word.lower() for word in input_words): - return True - for first, second in pairwise(input_words): - if second.lower() == "least" and first.lower() != "at": - return True - return False - - def normalize(self, score, alpha=15): - """ - Normalize the score to be between -1 and 1 using an alpha that - approximates the max expected value - """ - norm_score = score / math.sqrt((score * score) + alpha) - return norm_score - - def scalar_inc_dec(self, word, valence, is_cap_diff): - """ - Check if the preceding words increase, decrease, or negate/nullify the - valence - """ - scalar = 0.0 - word_lower = word.lower() - if word_lower in self.BOOSTER_DICT: - scalar = self.BOOSTER_DICT[word_lower] - if valence < 0: - scalar *= -1 - # check if booster/dampener word is in ALLCAPS (while others aren't) - if word.isupper() and is_cap_diff: - if valence > 0: - scalar += self.C_INCR - else: - scalar -= self.C_INCR - return scalar - - -class SentiText: - """ - Identify sentiment-relevant string-level properties of input text. - """ - - def __init__(self, text, punc_list, regex_remove_punctuation): - if not isinstance(text, str): - text = str(text.encode("utf-8")) - self.text = text - self.PUNC_LIST = punc_list - self.REGEX_REMOVE_PUNCTUATION = regex_remove_punctuation - self.words_and_emoticons = self._words_and_emoticons() - # doesn't separate words from - # adjacent punctuation (keeps emoticons & contractions) - self.is_cap_diff = self.allcap_differential(self.words_and_emoticons) - - def _words_plus_punc(self): - """ - Returns mapping of form: - { - 'cat,': 'cat', - ',cat': 'cat', - } - """ - no_punc_text = self.REGEX_REMOVE_PUNCTUATION.sub("", self.text) - # removes punctuation (but loses emoticons & contractions) - words_only = no_punc_text.split() - # remove singletons - words_only = {w for w in words_only if len(w) > 1} - # the product gives ('cat', ',') and (',', 'cat') - punc_before = {"".join(p): p[1] for p in product(self.PUNC_LIST, words_only)} - punc_after = {"".join(p): p[0] for p in product(words_only, self.PUNC_LIST)} - words_punc_dict = punc_before - words_punc_dict.update(punc_after) - return words_punc_dict - - def _words_and_emoticons(self): - """ - Removes leading and trailing puncutation - Leaves contractions and most emoticons - Does not preserve punc-plus-letter emoticons (e.g. :D) - """ - wes = self.text.split() - words_punc_dict = self._words_plus_punc() - wes = [we for we in wes if len(we) > 1] - for i, we in enumerate(wes): - if we in words_punc_dict: - wes[i] = words_punc_dict[we] - return wes - - def allcap_differential(self, words): - """ - Check whether just some words in the input are ALL CAPS - - :param list words: The words to inspect - :returns: `True` if some but not all items in `words` are ALL CAPS - """ - is_different = False - allcap_words = 0 - for word in words: - if word.isupper(): - allcap_words += 1 - cap_differential = len(words) - allcap_words - if 0 < cap_differential < len(words): - is_different = True - return is_different - - -class SentimentIntensityAnalyzer: - """ - Give a sentiment intensity score to sentences. 
- """ - - def __init__( - self, - lexicon_file="sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt", - ): - self.lexicon_file = nltk.data.load(lexicon_file) - self.lexicon = self.make_lex_dict() - self.constants = VaderConstants() - - def make_lex_dict(self): - """ - Convert lexicon file to a dictionary - """ - lex_dict = {} - for line in self.lexicon_file.split("\n"): - (word, measure) = line.strip().split("\t")[0:2] - lex_dict[word] = float(measure) - return lex_dict - - def polarity_scores(self, text): - """ - Return a float for sentiment strength based on the input text. - Positive values are positive valence, negative value are negative - valence. - - :note: Hashtags are not taken into consideration (e.g. #BAD is neutral). If you - are interested in processing the text in the hashtags too, then we recommend - preprocessing your data to remove the #, after which the hashtag text may be - matched as if it was a normal word in the sentence. - """ - # text, words_and_emoticons, is_cap_diff = self.preprocess(text) - sentitext = SentiText( - text, self.constants.PUNC_LIST, self.constants.REGEX_REMOVE_PUNCTUATION - ) - sentiments = [] - words_and_emoticons = sentitext.words_and_emoticons - for item in words_and_emoticons: - valence = 0 - i = words_and_emoticons.index(item) - if ( - i < len(words_and_emoticons) - 1 - and item.lower() == "kind" - and words_and_emoticons[i + 1].lower() == "of" - ) or item.lower() in self.constants.BOOSTER_DICT: - sentiments.append(valence) - continue - - sentiments = self.sentiment_valence(valence, sentitext, item, i, sentiments) - - sentiments = self._but_check(words_and_emoticons, sentiments) - - return self.score_valence(sentiments, text) - - def sentiment_valence(self, valence, sentitext, item, i, sentiments): - is_cap_diff = sentitext.is_cap_diff - words_and_emoticons = sentitext.words_and_emoticons - item_lowercase = item.lower() - if item_lowercase in self.lexicon: - # get the sentiment valence - valence = self.lexicon[item_lowercase] - - # check if sentiment laden word is in ALL CAPS (while others aren't) - if item.isupper() and is_cap_diff: - if valence > 0: - valence += self.constants.C_INCR - else: - valence -= self.constants.C_INCR - - for start_i in range(0, 3): - if ( - i > start_i - and words_and_emoticons[i - (start_i + 1)].lower() - not in self.lexicon - ): - # dampen the scalar modifier of preceding words and emoticons - # (excluding the ones that immediately preceed the item) based - # on their distance from the current item. 
- s = self.constants.scalar_inc_dec( - words_and_emoticons[i - (start_i + 1)], valence, is_cap_diff - ) - if start_i == 1 and s != 0: - s = s * 0.95 - if start_i == 2 and s != 0: - s = s * 0.9 - valence = valence + s - valence = self._never_check( - valence, words_and_emoticons, start_i, i - ) - if start_i == 2: - valence = self._idioms_check(valence, words_and_emoticons, i) - - # future work: consider other sentiment-laden idioms - # other_idioms = - # {"back handed": -2, "blow smoke": -2, "blowing smoke": -2, - # "upper hand": 1, "break a leg": 2, - # "cooking with gas": 2, "in the black": 2, "in the red": -2, - # "on the ball": 2,"under the weather": -2} - - valence = self._least_check(valence, words_and_emoticons, i) - - sentiments.append(valence) - return sentiments - - def _least_check(self, valence, words_and_emoticons, i): - # check for negation case using "least" - if ( - i > 1 - and words_and_emoticons[i - 1].lower() not in self.lexicon - and words_and_emoticons[i - 1].lower() == "least" - ): - if ( - words_and_emoticons[i - 2].lower() != "at" - and words_and_emoticons[i - 2].lower() != "very" - ): - valence = valence * self.constants.N_SCALAR - elif ( - i > 0 - and words_and_emoticons[i - 1].lower() not in self.lexicon - and words_and_emoticons[i - 1].lower() == "least" - ): - valence = valence * self.constants.N_SCALAR - return valence - - def _but_check(self, words_and_emoticons, sentiments): - words_and_emoticons = [w_e.lower() for w_e in words_and_emoticons] - but = {"but"} & set(words_and_emoticons) - if but: - bi = words_and_emoticons.index(next(iter(but))) - for sidx, sentiment in enumerate(sentiments): - if sidx < bi: - sentiments[sidx] = sentiment * 0.5 - elif sidx > bi: - sentiments[sidx] = sentiment * 1.5 - return sentiments - - def _idioms_check(self, valence, words_and_emoticons, i): - onezero = f"{words_and_emoticons[i - 1]} {words_and_emoticons[i]}" - - twoonezero = "{} {} {}".format( - words_and_emoticons[i - 2], - words_and_emoticons[i - 1], - words_and_emoticons[i], - ) - - twoone = f"{words_and_emoticons[i - 2]} {words_and_emoticons[i - 1]}" - - threetwoone = "{} {} {}".format( - words_and_emoticons[i - 3], - words_and_emoticons[i - 2], - words_and_emoticons[i - 1], - ) - - threetwo = "{} {}".format( - words_and_emoticons[i - 3], words_and_emoticons[i - 2] - ) - - sequences = [onezero, twoonezero, twoone, threetwoone, threetwo] - - for seq in sequences: - if seq in self.constants.SPECIAL_CASE_IDIOMS: - valence = self.constants.SPECIAL_CASE_IDIOMS[seq] - break - - if len(words_and_emoticons) - 1 > i: - zeroone = f"{words_and_emoticons[i]} {words_and_emoticons[i + 1]}" - if zeroone in self.constants.SPECIAL_CASE_IDIOMS: - valence = self.constants.SPECIAL_CASE_IDIOMS[zeroone] - if len(words_and_emoticons) - 1 > i + 1: - zeroonetwo = "{} {} {}".format( - words_and_emoticons[i], - words_and_emoticons[i + 1], - words_and_emoticons[i + 2], - ) - if zeroonetwo in self.constants.SPECIAL_CASE_IDIOMS: - valence = self.constants.SPECIAL_CASE_IDIOMS[zeroonetwo] - - # check for booster/dampener bi-grams such as 'sort of' or 'kind of' - if ( - threetwo in self.constants.BOOSTER_DICT - or twoone in self.constants.BOOSTER_DICT - ): - valence = valence + self.constants.B_DECR - return valence - - def _never_check(self, valence, words_and_emoticons, start_i, i): - if start_i == 0: - if self.constants.negated([words_and_emoticons[i - 1]]): - valence = valence * self.constants.N_SCALAR - if start_i == 1: - if words_and_emoticons[i - 2] == "never" and ( - words_and_emoticons[i - 1] 
== "so" - or words_and_emoticons[i - 1] == "this" - ): - valence = valence * 1.5 - elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]): - valence = valence * self.constants.N_SCALAR - if start_i == 2: - if ( - words_and_emoticons[i - 3] == "never" - and ( - words_and_emoticons[i - 2] == "so" - or words_and_emoticons[i - 2] == "this" - ) - or ( - words_and_emoticons[i - 1] == "so" - or words_and_emoticons[i - 1] == "this" - ) - ): - valence = valence * 1.25 - elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]): - valence = valence * self.constants.N_SCALAR - return valence - - def _punctuation_emphasis(self, sum_s, text): - # add emphasis from exclamation points and question marks - ep_amplifier = self._amplify_ep(text) - qm_amplifier = self._amplify_qm(text) - punct_emph_amplifier = ep_amplifier + qm_amplifier - return punct_emph_amplifier - - def _amplify_ep(self, text): - # check for added emphasis resulting from exclamation points (up to 4 of them) - ep_count = text.count("!") - if ep_count > 4: - ep_count = 4 - # (empirically derived mean sentiment intensity rating increase for - # exclamation points) - ep_amplifier = ep_count * 0.292 - return ep_amplifier - - def _amplify_qm(self, text): - # check for added emphasis resulting from question marks (2 or 3+) - qm_count = text.count("?") - qm_amplifier = 0 - if qm_count > 1: - if qm_count <= 3: - # (empirically derived mean sentiment intensity rating increase for - # question marks) - qm_amplifier = qm_count * 0.18 - else: - qm_amplifier = 0.96 - return qm_amplifier - - def _sift_sentiment_scores(self, sentiments): - # want separate positive versus negative sentiment scores - pos_sum = 0.0 - neg_sum = 0.0 - neu_count = 0 - for sentiment_score in sentiments: - if sentiment_score > 0: - pos_sum += ( - float(sentiment_score) + 1 - ) # compensates for neutral words that are counted as 1 - if sentiment_score < 0: - neg_sum += ( - float(sentiment_score) - 1 - ) # when used with math.fabs(), compensates for neutrals - if sentiment_score == 0: - neu_count += 1 - return pos_sum, neg_sum, neu_count - - def score_valence(self, sentiments, text): - if sentiments: - sum_s = float(sum(sentiments)) - # compute and add emphasis from punctuation in text - punct_emph_amplifier = self._punctuation_emphasis(sum_s, text) - if sum_s > 0: - sum_s += punct_emph_amplifier - elif sum_s < 0: - sum_s -= punct_emph_amplifier - - compound = self.constants.normalize(sum_s) - # discriminate between positive, negative and neutral sentiment scores - pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments) - - if pos_sum > math.fabs(neg_sum): - pos_sum += punct_emph_amplifier - elif pos_sum < math.fabs(neg_sum): - neg_sum -= punct_emph_amplifier - - total = pos_sum + math.fabs(neg_sum) + neu_count - pos = math.fabs(pos_sum / total) - neg = math.fabs(neg_sum / total) - neu = math.fabs(neu_count / total) - - else: - compound = 0.0 - pos = 0.0 - neg = 0.0 - neu = 0.0 - - sentiment_dict = { - "neg": round(neg, 3), - "neu": round(neu, 3), - "pos": round(pos, 3), - "compound": round(compound, 4), - } - - return sentiment_dict diff --git a/pipeline/nltk/stem/__init__.py b/pipeline/nltk/stem/__init__.py deleted file mode 100644 index 5f46ec0c26303eea6837bc070d8e77b56b48e29f..0000000000000000000000000000000000000000 --- a/pipeline/nltk/stem/__init__.py +++ /dev/null @@ -1,34 +0,0 @@ -# Natural Language Toolkit: Stemmers -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Trevor Cohn -# Edward Loper -# Steven Bird -# URL: -# For 
license information, see LICENSE.TXT - -""" -NLTK Stemmers - -Interfaces used to remove morphological affixes from words, leaving -only the word stem. Stemming algorithms aim to remove those affixes -required for eg. grammatical role, tense, derivational morphology -leaving only the stem of the word. This is a difficult problem due to -irregular words (eg. common verbs in English), complicated -morphological rules, and part-of-speech and sense ambiguities -(eg. ``ceil-`` is not the stem of ``ceiling``). - -StemmerI defines a standard interface for stemmers. -""" - -from nltk.stem.api import StemmerI -from nltk.stem.arlstem import ARLSTem -from nltk.stem.arlstem2 import ARLSTem2 -from nltk.stem.cistem import Cistem -from nltk.stem.isri import ISRIStemmer -from nltk.stem.lancaster import LancasterStemmer -from nltk.stem.porter import PorterStemmer -from nltk.stem.regexp import RegexpStemmer -from nltk.stem.rslp import RSLPStemmer -from nltk.stem.snowball import SnowballStemmer -from nltk.stem.wordnet import WordNetLemmatizer diff --git a/pipeline/nltk/stem/__pycache__/__init__.cpython-39.pyc b/pipeline/nltk/stem/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index 6b12813c25bd36961798853d963596ebf5147ea1..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/stem/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/stem/__pycache__/api.cpython-39.pyc b/pipeline/nltk/stem/__pycache__/api.cpython-39.pyc deleted file mode 100644 index 82dd6cffd5eb70ff7c4bb7441faeeffe07fa9c78..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/stem/__pycache__/api.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/stem/__pycache__/arlstem.cpython-39.pyc b/pipeline/nltk/stem/__pycache__/arlstem.cpython-39.pyc deleted file mode 100644 index b1b54a4ab846203c5e0a8bcfaf6408321297d1f0..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/stem/__pycache__/arlstem.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/stem/__pycache__/arlstem2.cpython-39.pyc b/pipeline/nltk/stem/__pycache__/arlstem2.cpython-39.pyc deleted file mode 100644 index d633a84a14da802a613519916f97f845bf1c4a41..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/stem/__pycache__/arlstem2.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/stem/__pycache__/cistem.cpython-39.pyc b/pipeline/nltk/stem/__pycache__/cistem.cpython-39.pyc deleted file mode 100644 index a612bca8626b006a67b0ec6a0c38c011d43ed2c9..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/stem/__pycache__/cistem.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/stem/__pycache__/isri.cpython-39.pyc b/pipeline/nltk/stem/__pycache__/isri.cpython-39.pyc deleted file mode 100644 index d20867c6687ed9ec3d60fed1ec5e9aabb4090975..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/stem/__pycache__/isri.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/stem/__pycache__/lancaster.cpython-39.pyc b/pipeline/nltk/stem/__pycache__/lancaster.cpython-39.pyc deleted file mode 100644 index d435dfe5d812f65ae531991dde2cf486afab0712..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/stem/__pycache__/lancaster.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/stem/__pycache__/porter.cpython-39.pyc b/pipeline/nltk/stem/__pycache__/porter.cpython-39.pyc deleted file mode 100644 index e2d953af1542bccf6afcc9bc6345261df2c1b273..0000000000000000000000000000000000000000 
Binary files a/pipeline/nltk/stem/__pycache__/porter.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/stem/__pycache__/regexp.cpython-39.pyc b/pipeline/nltk/stem/__pycache__/regexp.cpython-39.pyc deleted file mode 100644 index 6057785f6389e829ab94a42162dbe6d7bb6c5d0e..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/stem/__pycache__/regexp.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/stem/__pycache__/rslp.cpython-39.pyc b/pipeline/nltk/stem/__pycache__/rslp.cpython-39.pyc deleted file mode 100644 index 6b208c6dc4fabd9f1af8cf47a6f7b8a3aacccad4..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/stem/__pycache__/rslp.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/stem/__pycache__/snowball.cpython-39.pyc b/pipeline/nltk/stem/__pycache__/snowball.cpython-39.pyc deleted file mode 100644 index d23feab74418e849e1f382c7fad7abde940b333c..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/stem/__pycache__/snowball.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/stem/__pycache__/util.cpython-39.pyc b/pipeline/nltk/stem/__pycache__/util.cpython-39.pyc deleted file mode 100644 index d71f0fa97b9422b3fc2f2ba41742068808ab0b25..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/stem/__pycache__/util.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/stem/__pycache__/wordnet.cpython-39.pyc b/pipeline/nltk/stem/__pycache__/wordnet.cpython-39.pyc deleted file mode 100644 index 62c5c4b6597c52b0259d83e5f3658674e28f6e01..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/stem/__pycache__/wordnet.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/stem/api.py b/pipeline/nltk/stem/api.py deleted file mode 100644 index 7a58c059a10ca2649faeb695d042a0c6cbb9ec69..0000000000000000000000000000000000000000 --- a/pipeline/nltk/stem/api.py +++ /dev/null @@ -1,27 +0,0 @@ -# Natural Language Toolkit: Stemmer Interface -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Trevor Cohn -# Edward Loper -# Steven Bird -# URL: -# For license information, see LICENSE.TXT - -from abc import ABCMeta, abstractmethod - - -class StemmerI(metaclass=ABCMeta): - """ - A processing interface for removing morphological affixes from - words. This process is known as stemming. - - """ - - @abstractmethod - def stem(self, token): - """ - Strip affixes from the token and return the stem. - - :param token: The token that should be stemmed. - :type token: str - """ diff --git a/pipeline/nltk/stem/arlstem.py b/pipeline/nltk/stem/arlstem.py deleted file mode 100644 index 566a4dd36b37e148a24eb840b5e9478dd24d4b55..0000000000000000000000000000000000000000 --- a/pipeline/nltk/stem/arlstem.py +++ /dev/null @@ -1,361 +0,0 @@ -# -# Natural Language Toolkit: ARLSTem Stemmer -# -# Copyright (C) 2001-2023 NLTK Project -# -# Author: Kheireddine Abainia (x-programer) -# Algorithms: Kheireddine Abainia -# Siham Ouamour -# Halim Sayoud -# URL: -# For license information, see LICENSE.TXT - - -""" -ARLSTem Arabic Stemmer -The details about the implementation of this algorithm are described in: -K. Abainia, S. Ouamour and H. Sayoud, A Novel Robust Arabic Light Stemmer , -Journal of Experimental & Theoretical Artificial Intelligence (JETAI'17), -Vol. 29, No. 3, 2017, pp. 557-573. -The ARLSTem is a light Arabic stemmer that is based on removing the affixes -from the word (i.e. prefixes, suffixes and infixes). 
It was evaluated and -compared to several other stemmers using Paice's parameters (under-stemming -index, over-stemming index and stemming weight), and the results showed that -ARLSTem is promising and producing high performances. This stemmer is not -based on any dictionary and can be used on-line effectively. -""" -import re - -from nltk.stem.api import StemmerI - - -class ARLSTem(StemmerI): - """ - ARLSTem stemmer : a light Arabic Stemming algorithm without any dictionary. - Department of Telecommunication & Information Processing. USTHB University, - Algiers, Algeria. - ARLSTem.stem(token) returns the Arabic stem for the input token. - The ARLSTem Stemmer requires that all tokens are encoded using Unicode - encoding. - """ - - def __init__(self): - # different Alif with hamza - self.re_hamzated_alif = re.compile(r"[\u0622\u0623\u0625]") - self.re_alifMaqsura = re.compile(r"[\u0649]") - self.re_diacritics = re.compile(r"[\u064B-\u065F]") - - # Alif Laam, Laam Laam, Fa Laam, Fa Ba - self.pr2 = ["\u0627\u0644", "\u0644\u0644", "\u0641\u0644", "\u0641\u0628"] - # Ba Alif Laam, Kaaf Alif Laam, Waaw Alif Laam - self.pr3 = ["\u0628\u0627\u0644", "\u0643\u0627\u0644", "\u0648\u0627\u0644"] - # Fa Laam Laam, Waaw Laam Laam - self.pr32 = ["\u0641\u0644\u0644", "\u0648\u0644\u0644"] - # Fa Ba Alif Laam, Waaw Ba Alif Laam, Fa Kaaf Alif Laam - self.pr4 = [ - "\u0641\u0628\u0627\u0644", - "\u0648\u0628\u0627\u0644", - "\u0641\u0643\u0627\u0644", - ] - - # Kaf Yaa, Kaf Miim - self.su2 = ["\u0643\u064A", "\u0643\u0645"] - # Ha Alif, Ha Miim - self.su22 = ["\u0647\u0627", "\u0647\u0645"] - # Kaf Miim Alif, Kaf Noon Shadda - self.su3 = ["\u0643\u0645\u0627", "\u0643\u0646\u0651"] - # Ha Miim Alif, Ha Noon Shadda - self.su32 = ["\u0647\u0645\u0627", "\u0647\u0646\u0651"] - - # Alif Noon, Ya Noon, Waaw Noon - self.pl_si2 = ["\u0627\u0646", "\u064A\u0646", "\u0648\u0646"] - # Taa Alif Noon, Taa Ya Noon - self.pl_si3 = ["\u062A\u0627\u0646", "\u062A\u064A\u0646"] - - # Alif Noon, Waaw Noon - self.verb_su2 = ["\u0627\u0646", "\u0648\u0646"] - # Siin Taa, Siin Yaa - self.verb_pr2 = ["\u0633\u062A", "\u0633\u064A"] - # Siin Alif, Siin Noon - self.verb_pr22 = ["\u0633\u0627", "\u0633\u0646"] - # Lam Noon, Lam Taa, Lam Yaa, Lam Hamza - self.verb_pr33 = [ - "\u0644\u0646", - "\u0644\u062A", - "\u0644\u064A", - "\u0644\u0623", - ] - # Taa Miim Alif, Taa Noon Shadda - self.verb_suf3 = ["\u062A\u0645\u0627", "\u062A\u0646\u0651"] - # Noon Alif, Taa Miim, Taa Alif, Waaw Alif - self.verb_suf2 = [ - "\u0646\u0627", - "\u062A\u0645", - "\u062A\u0627", - "\u0648\u0627", - ] - # Taa, Alif, Noon - self.verb_suf1 = ["\u062A", "\u0627", "\u0646"] - - def stem(self, token): - """ - call this function to get the word's stem based on ARLSTem . - """ - try: - if token is None: - raise ValueError( - "The word could not be stemmed, because \ - it is empty !" 
- ) - # remove Arabic diacritics and replace some letters with others - token = self.norm(token) - # strip common prefixes of the nouns - pre = self.pref(token) - if pre is not None: - token = pre - # strip the suffixes which are common to nouns and verbs - token = self.suff(token) - # transform a plural noun to a singular noun - ps = self.plur2sing(token) - if ps is None: - # transform from the feminine form to the masculine form - fm = self.fem2masc(token) - if fm is not None: - return fm - else: - if pre is None: # if the prefixes are not stripped - # strip the verb prefixes and suffixes - return self.verb(token) - else: - return ps - return token - except ValueError as e: - print(e) - - def norm(self, token): - """ - normalize the word by removing diacritics, replacing hamzated Alif - with Alif replacing AlifMaqsura with Yaa and removing Waaw at the - beginning. - """ - # strip Arabic diacritics - token = self.re_diacritics.sub("", token) - # replace Hamzated Alif with Alif bare - token = self.re_hamzated_alif.sub("\u0627", token) - # replace alifMaqsura with Yaa - token = self.re_alifMaqsura.sub("\u064A", token) - # strip the Waaw from the word beginning if the remaining is 3 letters - # at least - if token.startswith("\u0648") and len(token) > 3: - token = token[1:] - return token - - def pref(self, token): - """ - remove prefixes from the words' beginning. - """ - if len(token) > 5: - for p3 in self.pr3: - if token.startswith(p3): - return token[3:] - if len(token) > 6: - for p4 in self.pr4: - if token.startswith(p4): - return token[4:] - if len(token) > 5: - for p3 in self.pr32: - if token.startswith(p3): - return token[3:] - if len(token) > 4: - for p2 in self.pr2: - if token.startswith(p2): - return token[2:] - - def suff(self, token): - """ - remove suffixes from the word's end. - """ - if token.endswith("\u0643") and len(token) > 3: - return token[:-1] - if len(token) > 4: - for s2 in self.su2: - if token.endswith(s2): - return token[:-2] - if len(token) > 5: - for s3 in self.su3: - if token.endswith(s3): - return token[:-3] - if token.endswith("\u0647") and len(token) > 3: - token = token[:-1] - return token - if len(token) > 4: - for s2 in self.su22: - if token.endswith(s2): - return token[:-2] - if len(token) > 5: - for s3 in self.su32: - if token.endswith(s3): - return token[:-3] - if token.endswith("\u0646\u0627") and len(token) > 4: - return token[:-2] - return token - - def fem2masc(self, token): - """ - transform the word from the feminine form to the masculine form. - """ - if token.endswith("\u0629") and len(token) > 3: - return token[:-1] - - def plur2sing(self, token): - """ - transform the word from the plural form to the singular form. 
- """ - if len(token) > 4: - for ps2 in self.pl_si2: - if token.endswith(ps2): - return token[:-2] - if len(token) > 5: - for ps3 in self.pl_si3: - if token.endswith(ps3): - return token[:-3] - if len(token) > 3 and token.endswith("\u0627\u062A"): - return token[:-2] - if len(token) > 3 and token.startswith("\u0627") and token[2] == "\u0627": - return token[:2] + token[3:] - if len(token) > 4 and token.startswith("\u0627") and token[-2] == "\u0627": - return token[1:-2] + token[-1] - - def verb(self, token): - """ - stem the verb prefixes and suffixes or both - """ - vb = self.verb_t1(token) - if vb is not None: - return vb - vb = self.verb_t2(token) - if vb is not None: - return vb - vb = self.verb_t3(token) - if vb is not None: - return vb - vb = self.verb_t4(token) - if vb is not None: - return vb - vb = self.verb_t5(token) - if vb is not None: - return vb - return self.verb_t6(token) - - def verb_t1(self, token): - """ - stem the present prefixes and suffixes - """ - if len(token) > 5 and token.startswith("\u062A"): # Taa - for s2 in self.pl_si2: - if token.endswith(s2): - return token[1:-2] - if len(token) > 5 and token.startswith("\u064A"): # Yaa - for s2 in self.verb_su2: - if token.endswith(s2): - return token[1:-2] - if len(token) > 4 and token.startswith("\u0627"): # Alif - # Waaw Alif - if len(token) > 5 and token.endswith("\u0648\u0627"): - return token[1:-2] - # Yaa - if token.endswith("\u064A"): - return token[1:-1] - # Alif - if token.endswith("\u0627"): - return token[1:-1] - # Noon - if token.endswith("\u0646"): - return token[1:-1] - # ^Yaa, Noon$ - if len(token) > 4 and token.startswith("\u064A") and token.endswith("\u0646"): - return token[1:-1] - # ^Taa, Noon$ - if len(token) > 4 and token.startswith("\u062A") and token.endswith("\u0646"): - return token[1:-1] - - def verb_t2(self, token): - """ - stem the future prefixes and suffixes - """ - if len(token) > 6: - for s2 in self.pl_si2: - # ^Siin Taa - if token.startswith(self.verb_pr2[0]) and token.endswith(s2): - return token[2:-2] - # ^Siin Yaa, Alif Noon$ - if token.startswith(self.verb_pr2[1]) and token.endswith(self.pl_si2[0]): - return token[2:-2] - # ^Siin Yaa, Waaw Noon$ - if token.startswith(self.verb_pr2[1]) and token.endswith(self.pl_si2[2]): - return token[2:-2] - # ^Siin Taa, Noon$ - if ( - len(token) > 5 - and token.startswith(self.verb_pr2[0]) - and token.endswith("\u0646") - ): - return token[2:-1] - # ^Siin Yaa, Noon$ - if ( - len(token) > 5 - and token.startswith(self.verb_pr2[1]) - and token.endswith("\u0646") - ): - return token[2:-1] - - def verb_t3(self, token): - """ - stem the present suffixes - """ - if len(token) > 5: - for su3 in self.verb_suf3: - if token.endswith(su3): - return token[:-3] - if len(token) > 4: - for su2 in self.verb_suf2: - if token.endswith(su2): - return token[:-2] - if len(token) > 3: - for su1 in self.verb_suf1: - if token.endswith(su1): - return token[:-1] - - def verb_t4(self, token): - """ - stem the present prefixes - """ - if len(token) > 3: - for pr1 in self.verb_suf1: - if token.startswith(pr1): - return token[1:] - if token.startswith("\u064A"): - return token[1:] - - def verb_t5(self, token): - """ - stem the future prefixes - """ - if len(token) > 4: - for pr2 in self.verb_pr22: - if token.startswith(pr2): - return token[2:] - for pr2 in self.verb_pr2: - if token.startswith(pr2): - return token[2:] - return token - - def verb_t6(self, token): - """ - stem the order prefixes - """ - if len(token) > 4: - for pr3 in self.verb_pr33: - if token.startswith(pr3): - 
return token[2:] - return token diff --git a/pipeline/nltk/stem/arlstem2.py b/pipeline/nltk/stem/arlstem2.py deleted file mode 100644 index a2d9e9551ecffff219821bb570f96b21f588a6f0..0000000000000000000000000000000000000000 --- a/pipeline/nltk/stem/arlstem2.py +++ /dev/null @@ -1,457 +0,0 @@ -# -# Natural Language Toolkit: ARLSTem Stemmer v2 -# -# Copyright (C) 2001-2023 NLTK Project -# -# Author: Kheireddine Abainia (x-programer) -# Algorithms: Kheireddine Abainia -# Hamza Rebbani -# URL: -# For license information, see LICENSE.TXT - - -""" -ARLSTem2 Arabic Light Stemmer -The details about the implementation of this algorithm are described in: -K. Abainia and H. Rebbani, Comparing the Effectiveness of the Improved ARLSTem -Algorithm with Existing Arabic Light Stemmers, International Conference on -Theoretical and Applicative Aspects of Computer Science (ICTAACS'19), Skikda, -Algeria, December 15-16, 2019. -ARLSTem2 is an Arabic light stemmer based on removing the affixes from -the words (i.e. prefixes, suffixes and infixes). It is an improvement -of the previous Arabic light stemmer (ARLSTem). The new version was compared to -the original algorithm and several existing Arabic light stemmers, where the -results showed that the new version considerably improves the under-stemming -errors that are common to light stemmers. Both ARLSTem and ARLSTem2 can be run -online and do not use any dictionary. -""" -import re - -from nltk.stem.api import StemmerI - - -class ARLSTem2(StemmerI): - """ - Return a stemmed Arabic word after removing affixes. This an improved - version of the previous algorithm, which reduces under-stemming errors. - Typically used in Arabic search engine, information retrieval and NLP. - - >>> from nltk.stem import arlstem2 - >>> stemmer = ARLSTem2() - >>> word = stemmer.stem('يعمل') - >>> print(word) - عمل - - :param token: The input Arabic word (unicode) to be stemmed - :type token: unicode - :return: A unicode Arabic word - """ - - def __init__(self): - # different Alif with hamza - self.re_hamzated_alif = re.compile(r"[\u0622\u0623\u0625]") - self.re_alifMaqsura = re.compile(r"[\u0649]") - self.re_diacritics = re.compile(r"[\u064B-\u065F]") - - # Alif Laam, Laam Laam, Fa Laam, Fa Ba - self.pr2 = ["\u0627\u0644", "\u0644\u0644", "\u0641\u0644", "\u0641\u0628"] - # Ba Alif Laam, Kaaf Alif Laam, Waaw Alif Laam - self.pr3 = ["\u0628\u0627\u0644", "\u0643\u0627\u0644", "\u0648\u0627\u0644"] - # Fa Laam Laam, Waaw Laam Laam - self.pr32 = ["\u0641\u0644\u0644", "\u0648\u0644\u0644"] - # Fa Ba Alif Laam, Waaw Ba Alif Laam, Fa Kaaf Alif Laam - self.pr4 = [ - "\u0641\u0628\u0627\u0644", - "\u0648\u0628\u0627\u0644", - "\u0641\u0643\u0627\u0644", - ] - - # Kaf Yaa, Kaf Miim - self.su2 = ["\u0643\u064A", "\u0643\u0645"] - # Ha Alif, Ha Miim - self.su22 = ["\u0647\u0627", "\u0647\u0645"] - # Kaf Miim Alif, Kaf Noon Shadda - self.su3 = ["\u0643\u0645\u0627", "\u0643\u0646\u0651"] - # Ha Miim Alif, Ha Noon Shadda - self.su32 = ["\u0647\u0645\u0627", "\u0647\u0646\u0651"] - - # Alif Noon, Ya Noon, Waaw Noon - self.pl_si2 = ["\u0627\u0646", "\u064A\u0646", "\u0648\u0646"] - # Taa Alif Noon, Taa Ya Noon - self.pl_si3 = ["\u062A\u0627\u0646", "\u062A\u064A\u0646"] - - # Alif Noon, Waaw Noon - self.verb_su2 = ["\u0627\u0646", "\u0648\u0646"] - # Siin Taa, Siin Yaa - self.verb_pr2 = ["\u0633\u062A", "\u0633\u064A"] - # Siin Alif, Siin Noon - self.verb_pr22 = ["\u0633\u0627", "\u0633\u0646"] - # Lam Noon, Lam Taa, Lam Yaa, Lam Hamza - self.verb_pr33 = [ - "\u0644\u0646", - "\u0644\u062A", - 
"\u0644\u064A", - "\u0644\u0623", - ] - # Taa Miim Alif, Taa Noon Shadda - self.verb_suf3 = ["\u062A\u0645\u0627", "\u062A\u0646\u0651"] - # Noon Alif, Taa Miim, Taa Alif, Waaw Alif - self.verb_suf2 = [ - "\u0646\u0627", - "\u062A\u0645", - "\u062A\u0627", - "\u0648\u0627", - ] - # Taa, Alif, Noon - self.verb_suf1 = ["\u062A", "\u0627", "\u0646"] - - def stem1(self, token): - """ - call this function to get the first stem - """ - try: - if token is None: - raise ValueError( - "The word could not be stemmed, because \ - it is empty !" - ) - self.is_verb = False - # remove Arabic diacritics and replace some letters with others - token = self.norm(token) - # strip the common noun prefixes - pre = self.pref(token) - if pre is not None: - token = pre - # transform the feminine form to masculine form - fm = self.fem2masc(token) - if fm is not None: - return fm - # strip the adjective affixes - adj = self.adjective(token) - if adj is not None: - return adj - # strip the suffixes that are common to nouns and verbs - token = self.suff(token) - # transform a plural noun to a singular noun - ps = self.plur2sing(token) - if ps is None: - if pre is None: # if the noun prefixes are not stripped - # strip the verb prefixes and suffixes - verb = self.verb(token) - if verb is not None: - self.is_verb = True - return verb - else: - return ps - return token - except ValueError as e: - print(e) - - def stem(self, token): - # stem the input word - try: - if token is None: - raise ValueError( - "The word could not be stemmed, because \ - it is empty !" - ) - # run the first round of stemming - token = self.stem1(token) - # check if there is some additional noun affixes - if len(token) > 4: - # ^Taa, $Yaa + char - if token.startswith("\u062A") and token[-2] == "\u064A": - token = token[1:-2] + token[-1] - return token - # ^Miim, $Waaw + char - if token.startswith("\u0645") and token[-2] == "\u0648": - token = token[1:-2] + token[-1] - return token - if len(token) > 3: - # !^Alif, $Yaa - if not token.startswith("\u0627") and token.endswith("\u064A"): - token = token[:-1] - return token - # $Laam - if token.startswith("\u0644"): - return token[1:] - return token - except ValueError as e: - print(e) - - def norm(self, token): - """ - normalize the word by removing diacritics, replace hamzated Alif - with Alif bare, replace AlifMaqsura with Yaa and remove Waaw at the - beginning. - """ - # strip Arabic diacritics - token = self.re_diacritics.sub("", token) - # replace Hamzated Alif with Alif bare - token = self.re_hamzated_alif.sub("\u0627", token) - # replace alifMaqsura with Yaa - token = self.re_alifMaqsura.sub("\u064A", token) - # strip the Waaw from the word beginning if the remaining is - # tri-literal at least - if token.startswith("\u0648") and len(token) > 3: - token = token[1:] - return token - - def pref(self, token): - """ - remove prefixes from the words' beginning. 
- """ - if len(token) > 5: - for p3 in self.pr3: - if token.startswith(p3): - return token[3:] - if len(token) > 6: - for p4 in self.pr4: - if token.startswith(p4): - return token[4:] - if len(token) > 5: - for p3 in self.pr32: - if token.startswith(p3): - return token[3:] - if len(token) > 4: - for p2 in self.pr2: - if token.startswith(p2): - return token[2:] - - def adjective(self, token): - """ - remove the infixes from adjectives - """ - # ^Alif, Alif, $Yaa - if len(token) > 5: - if ( - token.startswith("\u0627") - and token[-3] == "\u0627" - and token.endswith("\u064A") - ): - return token[:-3] + token[-2] - - def suff(self, token): - """ - remove the suffixes from the word's ending. - """ - if token.endswith("\u0643") and len(token) > 3: - return token[:-1] - if len(token) > 4: - for s2 in self.su2: - if token.endswith(s2): - return token[:-2] - if len(token) > 5: - for s3 in self.su3: - if token.endswith(s3): - return token[:-3] - if token.endswith("\u0647") and len(token) > 3: - token = token[:-1] - return token - if len(token) > 4: - for s2 in self.su22: - if token.endswith(s2): - return token[:-2] - if len(token) > 5: - for s3 in self.su32: - if token.endswith(s3): - return token[:-3] - # $Noon and Alif - if token.endswith("\u0646\u0627") and len(token) > 4: - return token[:-2] - return token - - def fem2masc(self, token): - """ - transform the word from the feminine form to the masculine form. - """ - if len(token) > 6: - # ^Taa, Yaa, $Yaa and Taa Marbuta - if ( - token.startswith("\u062A") - and token[-4] == "\u064A" - and token.endswith("\u064A\u0629") - ): - return token[1:-4] + token[-3] - # ^Alif, Yaa, $Yaa and Taa Marbuta - if ( - token.startswith("\u0627") - and token[-4] == "\u0627" - and token.endswith("\u064A\u0629") - ): - return token[:-4] + token[-3] - # $Alif, Yaa and Taa Marbuta - if token.endswith("\u0627\u064A\u0629") and len(token) > 5: - return token[:-2] - if len(token) > 4: - # Alif, $Taa Marbuta - if token[1] == "\u0627" and token.endswith("\u0629"): - return token[0] + token[2:-1] - # $Yaa and Taa Marbuta - if token.endswith("\u064A\u0629"): - return token[:-2] - # $Taa Marbuta - if token.endswith("\u0629") and len(token) > 3: - return token[:-1] - - def plur2sing(self, token): - """ - transform the word from the plural form to the singular form. 
- """ - # ^Haa, $Noon, Waaw - if len(token) > 5: - if token.startswith("\u0645") and token.endswith("\u0648\u0646"): - return token[1:-2] - if len(token) > 4: - for ps2 in self.pl_si2: - if token.endswith(ps2): - return token[:-2] - if len(token) > 5: - for ps3 in self.pl_si3: - if token.endswith(ps3): - return token[:-3] - if len(token) > 4: - # $Alif, Taa - if token.endswith("\u0627\u062A"): - return token[:-2] - # ^Alif Alif - if token.startswith("\u0627") and token[2] == "\u0627": - return token[:2] + token[3:] - # ^Alif Alif - if token.startswith("\u0627") and token[-2] == "\u0627": - return token[1:-2] + token[-1] - - def verb(self, token): - """ - stem the verb prefixes and suffixes or both - """ - vb = self.verb_t1(token) - if vb is not None: - return vb - vb = self.verb_t2(token) - if vb is not None: - return vb - vb = self.verb_t3(token) - if vb is not None: - return vb - vb = self.verb_t4(token) - if vb is not None: - return vb - vb = self.verb_t5(token) - if vb is not None: - return vb - vb = self.verb_t6(token) - return vb - - def verb_t1(self, token): - """ - stem the present tense co-occurred prefixes and suffixes - """ - if len(token) > 5 and token.startswith("\u062A"): # Taa - for s2 in self.pl_si2: - if token.endswith(s2): - return token[1:-2] - if len(token) > 5 and token.startswith("\u064A"): # Yaa - for s2 in self.verb_su2: - if token.endswith(s2): - return token[1:-2] - if len(token) > 4 and token.startswith("\u0627"): # Alif - # Waaw Alif - if len(token) > 5 and token.endswith("\u0648\u0627"): - return token[1:-2] - # Yaa - if token.endswith("\u064A"): - return token[1:-1] - # Alif - if token.endswith("\u0627"): - return token[1:-1] - # Noon - if token.endswith("\u0646"): - return token[1:-1] - # ^Yaa, Noon$ - if len(token) > 4 and token.startswith("\u064A") and token.endswith("\u0646"): - return token[1:-1] - # ^Taa, Noon$ - if len(token) > 4 and token.startswith("\u062A") and token.endswith("\u0646"): - return token[1:-1] - - def verb_t2(self, token): - """ - stem the future tense co-occurred prefixes and suffixes - """ - if len(token) > 6: - for s2 in self.pl_si2: - # ^Siin Taa - if token.startswith(self.verb_pr2[0]) and token.endswith(s2): - return token[2:-2] - # ^Siin Yaa, Alif Noon$ - if token.startswith(self.verb_pr2[1]) and token.endswith(self.pl_si2[0]): - return token[2:-2] - # ^Siin Yaa, Waaw Noon$ - if token.startswith(self.verb_pr2[1]) and token.endswith(self.pl_si2[2]): - return token[2:-2] - # ^Siin Taa, Noon$ - if ( - len(token) > 5 - and token.startswith(self.verb_pr2[0]) - and token.endswith("\u0646") - ): - return token[2:-1] - # ^Siin Yaa, Noon$ - if ( - len(token) > 5 - and token.startswith(self.verb_pr2[1]) - and token.endswith("\u0646") - ): - return token[2:-1] - - def verb_t3(self, token): - """ - stem the present tense suffixes - """ - if len(token) > 5: - for su3 in self.verb_suf3: - if token.endswith(su3): - return token[:-3] - if len(token) > 4: - for su2 in self.verb_suf2: - if token.endswith(su2): - return token[:-2] - if len(token) > 3: - for su1 in self.verb_suf1: - if token.endswith(su1): - return token[:-1] - - def verb_t4(self, token): - """ - stem the present tense prefixes - """ - if len(token) > 3: - for pr1 in self.verb_suf1: - if token.startswith(pr1): - return token[1:] - if token.startswith("\u064A"): - return token[1:] - - def verb_t5(self, token): - """ - stem the future tense prefixes - """ - if len(token) > 4: - for pr2 in self.verb_pr22: - if token.startswith(pr2): - return token[2:] - for pr2 in self.verb_pr2: - if 
token.startswith(pr2): - return token[2:] - - def verb_t6(self, token): - """ - stem the imperative tense prefixes - """ - if len(token) > 4: - for pr3 in self.verb_pr33: - if token.startswith(pr3): - return token[2:] - - return token diff --git a/pipeline/nltk/stem/cistem.py b/pipeline/nltk/stem/cistem.py deleted file mode 100644 index 69c07a42a373cec1eca9d75e9d474c4c1063e70b..0000000000000000000000000000000000000000 --- a/pipeline/nltk/stem/cistem.py +++ /dev/null @@ -1,209 +0,0 @@ -# Natural Language Toolkit: CISTEM Stemmer for German -# Copyright (C) 2001-2023 NLTK Project -# Author: Leonie Weissweiler -# Tom Aarsen <> (modifications) -# Algorithm: Leonie Weissweiler -# Alexander Fraser -# URL: -# For license information, see LICENSE.TXT - -import re -from typing import Tuple - -from nltk.stem.api import StemmerI - - -class Cistem(StemmerI): - """ - CISTEM Stemmer for German - - This is the official Python implementation of the CISTEM stemmer. - It is based on the paper - Leonie Weissweiler, Alexander Fraser (2017). Developing a Stemmer for German - Based on a Comparative Analysis of Publicly Available Stemmers. - In Proceedings of the German Society for Computational Linguistics and Language - Technology (GSCL) - which can be read here: - https://www.cis.lmu.de/~weissweiler/cistem/ - - In the paper, we conducted an analysis of publicly available stemmers, - developed two gold standards for German stemming and evaluated the stemmers - based on the two gold standards. We then proposed the stemmer implemented here - and show that it achieves slightly better f-measure than the other stemmers and - is thrice as fast as the Snowball stemmer for German while being about as fast - as most other stemmers. - - case_insensitive is a a boolean specifying if case-insensitive stemming - should be used. Case insensitivity improves performance only if words in the - text may be incorrectly upper case. For all-lowercase and correctly cased - text, best performance is achieved by setting case_insensitive for false. - - :param case_insensitive: if True, the stemming is case insensitive. False by default. - :type case_insensitive: bool - """ - - strip_ge = re.compile(r"^ge(.{4,})") - repl_xx = re.compile(r"(.)\1") - strip_emr = re.compile(r"e[mr]$") - strip_nd = re.compile(r"nd$") - strip_t = re.compile(r"t$") - strip_esn = re.compile(r"[esn]$") - repl_xx_back = re.compile(r"(.)\*") - - def __init__(self, case_insensitive: bool = False): - self._case_insensitive = case_insensitive - - @staticmethod - def replace_to(word: str) -> str: - word = word.replace("sch", "$") - word = word.replace("ei", "%") - word = word.replace("ie", "&") - word = Cistem.repl_xx.sub(r"\1*", word) - - return word - - @staticmethod - def replace_back(word: str) -> str: - word = Cistem.repl_xx_back.sub(r"\1\1", word) - word = word.replace("%", "ei") - word = word.replace("&", "ie") - word = word.replace("$", "sch") - - return word - - def stem(self, word: str) -> str: - """Stems the input word. - - :param word: The word that is to be stemmed. - :type word: str - :return: The stemmed word. 
- :rtype: str - - >>> from nltk.stem.cistem import Cistem - >>> stemmer = Cistem() - >>> s1 = "Speicherbehältern" - >>> stemmer.stem(s1) - 'speicherbehalt' - >>> s2 = "Grenzpostens" - >>> stemmer.stem(s2) - 'grenzpost' - >>> s3 = "Ausgefeiltere" - >>> stemmer.stem(s3) - 'ausgefeilt' - >>> stemmer = Cistem(True) - >>> stemmer.stem(s1) - 'speicherbehal' - >>> stemmer.stem(s2) - 'grenzpo' - >>> stemmer.stem(s3) - 'ausgefeil' - """ - if len(word) == 0: - return word - - upper = word[0].isupper() - word = word.lower() - - word = word.replace("ü", "u") - word = word.replace("ö", "o") - word = word.replace("ä", "a") - word = word.replace("ß", "ss") - - word = Cistem.strip_ge.sub(r"\1", word) - - return self._segment_inner(word, upper)[0] - - def segment(self, word: str) -> Tuple[str, str]: - """ - This method works very similarly to stem (:func:'cistem.stem'). The difference is that in - addition to returning the stem, it also returns the rest that was removed at - the end. To be able to return the stem unchanged so the stem and the rest - can be concatenated to form the original word, all subsitutions that altered - the stem in any other way than by removing letters at the end were left out. - - :param word: The word that is to be stemmed. - :type word: str - :return: A tuple of the stemmed word and the removed suffix. - :rtype: Tuple[str, str] - - >>> from nltk.stem.cistem import Cistem - >>> stemmer = Cistem() - >>> s1 = "Speicherbehältern" - >>> stemmer.segment(s1) - ('speicherbehält', 'ern') - >>> s2 = "Grenzpostens" - >>> stemmer.segment(s2) - ('grenzpost', 'ens') - >>> s3 = "Ausgefeiltere" - >>> stemmer.segment(s3) - ('ausgefeilt', 'ere') - >>> stemmer = Cistem(True) - >>> stemmer.segment(s1) - ('speicherbehäl', 'tern') - >>> stemmer.segment(s2) - ('grenzpo', 'stens') - >>> stemmer.segment(s3) - ('ausgefeil', 'tere') - """ - if len(word) == 0: - return ("", "") - - upper = word[0].isupper() - word = word.lower() - - return self._segment_inner(word, upper) - - def _segment_inner(self, word: str, upper: bool): - """Inner method for iteratively applying the code stemming regexes. - This method receives a pre-processed variant of the word to be stemmed, - or the word to be segmented, and returns a tuple of the word and the - removed suffix. - - :param word: A pre-processed variant of the word that is to be stemmed. - :type word: str - :param upper: Whether the original word started with a capital letter. - :type upper: bool - :return: A tuple of the stemmed word and the removed suffix. 
- :rtype: Tuple[str, str] - """ - - rest_length = 0 - word_copy = word[:] - - # Pre-processing before applying the substitution patterns - word = Cistem.replace_to(word) - rest = "" - - # Apply the substitution patterns - while len(word) > 3: - if len(word) > 5: - word, n = Cistem.strip_emr.subn("", word) - if n != 0: - rest_length += 2 - continue - - word, n = Cistem.strip_nd.subn("", word) - if n != 0: - rest_length += 2 - continue - - if not upper or self._case_insensitive: - word, n = Cistem.strip_t.subn("", word) - if n != 0: - rest_length += 1 - continue - - word, n = Cistem.strip_esn.subn("", word) - if n != 0: - rest_length += 1 - continue - else: - break - - # Post-processing after applying the substitution patterns - word = Cistem.replace_back(word) - - if rest_length: - rest = word_copy[-rest_length:] - - return (word, rest) diff --git a/pipeline/nltk/stem/isri.py b/pipeline/nltk/stem/isri.py deleted file mode 100644 index 4ae91f1fafaf713330ce78696873e258487d2d0a..0000000000000000000000000000000000000000 --- a/pipeline/nltk/stem/isri.py +++ /dev/null @@ -1,395 +0,0 @@ -# -# Natural Language Toolkit: The ISRI Arabic Stemmer -# -# Copyright (C) 2001-2023 NLTK Project -# Algorithm: Kazem Taghva, Rania Elkhoury, and Jeffrey Coombs (2005) -# Author: Hosam Algasaier -# URL: -# For license information, see LICENSE.TXT - -""" -ISRI Arabic Stemmer - -The algorithm for this stemmer is described in: - -Taghva, K., Elkoury, R., and Coombs, J. 2005. Arabic Stemming without a root dictionary. -Information Science Research Institute. University of Nevada, Las Vegas, USA. - -The Information Science Research Institute’s (ISRI) Arabic stemmer shares many features -with the Khoja stemmer. However, the main difference is that ISRI stemmer does not use root -dictionary. Also, if a root is not found, ISRI stemmer returned normalized form, rather than -returning the original unmodified word. - -Additional adjustments were made to improve the algorithm: - -1- Adding 60 stop words. -2- Adding the pattern (تفاعيل) to ISRI pattern set. -3- The step 2 in the original algorithm was normalizing all hamza. This step is discarded because it -increases the word ambiguities and changes the original root. - -""" -import re - -from nltk.stem.api import StemmerI - - -class ISRIStemmer(StemmerI): - """ - ISRI Arabic stemmer based on algorithm: Arabic Stemming without a root dictionary. - Information Science Research Institute. University of Nevada, Las Vegas, USA. - - A few minor modifications have been made to ISRI basic algorithm. - See the source code of this module for more information. - - isri.stem(token) returns Arabic root for the given token. - - The ISRI Stemmer requires that all tokens have Unicode string types. - If you use Python IDLE on Arabic Windows you have to decode text first - using Arabic '1256' coding. 
- """ - - def __init__(self): - # length three prefixes - self.p3 = [ - "\u0643\u0627\u0644", - "\u0628\u0627\u0644", - "\u0648\u0644\u0644", - "\u0648\u0627\u0644", - ] - - # length two prefixes - self.p2 = ["\u0627\u0644", "\u0644\u0644"] - - # length one prefixes - self.p1 = [ - "\u0644", - "\u0628", - "\u0641", - "\u0633", - "\u0648", - "\u064a", - "\u062a", - "\u0646", - "\u0627", - ] - - # length three suffixes - self.s3 = [ - "\u062a\u0645\u0644", - "\u0647\u0645\u0644", - "\u062a\u0627\u0646", - "\u062a\u064a\u0646", - "\u0643\u0645\u0644", - ] - - # length two suffixes - self.s2 = [ - "\u0648\u0646", - "\u0627\u062a", - "\u0627\u0646", - "\u064a\u0646", - "\u062a\u0646", - "\u0643\u0645", - "\u0647\u0646", - "\u0646\u0627", - "\u064a\u0627", - "\u0647\u0627", - "\u062a\u0645", - "\u0643\u0646", - "\u0646\u064a", - "\u0648\u0627", - "\u0645\u0627", - "\u0647\u0645", - ] - - # length one suffixes - self.s1 = ["\u0629", "\u0647", "\u064a", "\u0643", "\u062a", "\u0627", "\u0646"] - - # groups of length four patterns - self.pr4 = { - 0: ["\u0645"], - 1: ["\u0627"], - 2: ["\u0627", "\u0648", "\u064A"], - 3: ["\u0629"], - } - - # Groups of length five patterns and length three roots - self.pr53 = { - 0: ["\u0627", "\u062a"], - 1: ["\u0627", "\u064a", "\u0648"], - 2: ["\u0627", "\u062a", "\u0645"], - 3: ["\u0645", "\u064a", "\u062a"], - 4: ["\u0645", "\u062a"], - 5: ["\u0627", "\u0648"], - 6: ["\u0627", "\u0645"], - } - - self.re_short_vowels = re.compile(r"[\u064B-\u0652]") - self.re_hamza = re.compile(r"[\u0621\u0624\u0626]") - self.re_initial_hamza = re.compile(r"^[\u0622\u0623\u0625]") - - self.stop_words = [ - "\u064a\u0643\u0648\u0646", - "\u0648\u0644\u064a\u0633", - "\u0648\u0643\u0627\u0646", - "\u0643\u0630\u0644\u0643", - "\u0627\u0644\u062a\u064a", - "\u0648\u0628\u064a\u0646", - "\u0639\u0644\u064a\u0647\u0627", - "\u0645\u0633\u0627\u0621", - "\u0627\u0644\u0630\u064a", - "\u0648\u0643\u0627\u0646\u062a", - "\u0648\u0644\u0643\u0646", - "\u0648\u0627\u0644\u062a\u064a", - "\u062a\u0643\u0648\u0646", - "\u0627\u0644\u064a\u0648\u0645", - "\u0627\u0644\u0644\u0630\u064a\u0646", - "\u0639\u0644\u064a\u0647", - "\u0643\u0627\u0646\u062a", - "\u0644\u0630\u0644\u0643", - "\u0623\u0645\u0627\u0645", - "\u0647\u0646\u0627\u0643", - "\u0645\u0646\u0647\u0627", - "\u0645\u0627\u0632\u0627\u0644", - "\u0644\u0627\u0632\u0627\u0644", - "\u0644\u0627\u064a\u0632\u0627\u0644", - "\u0645\u0627\u064a\u0632\u0627\u0644", - "\u0627\u0635\u0628\u062d", - "\u0623\u0635\u0628\u062d", - "\u0623\u0645\u0633\u0649", - "\u0627\u0645\u0633\u0649", - "\u0623\u0636\u062d\u0649", - "\u0627\u0636\u062d\u0649", - "\u0645\u0627\u0628\u0631\u062d", - "\u0645\u0627\u0641\u062a\u0626", - "\u0645\u0627\u0627\u0646\u0641\u0643", - "\u0644\u0627\u0633\u064a\u0645\u0627", - "\u0648\u0644\u0627\u064a\u0632\u0627\u0644", - "\u0627\u0644\u062d\u0627\u0644\u064a", - "\u0627\u0644\u064a\u0647\u0627", - "\u0627\u0644\u0630\u064a\u0646", - "\u0641\u0627\u0646\u0647", - "\u0648\u0627\u0644\u0630\u064a", - "\u0648\u0647\u0630\u0627", - "\u0644\u0647\u0630\u0627", - "\u0641\u0643\u0627\u0646", - "\u0633\u062a\u0643\u0648\u0646", - "\u0627\u0644\u064a\u0647", - "\u064a\u0645\u0643\u0646", - "\u0628\u0647\u0630\u0627", - "\u0627\u0644\u0630\u0649", - ] - - def stem(self, token): - """ - Stemming a word token using the ISRI stemmer. 
- """ - token = self.norm( - token, 1 - ) # remove diacritics which representing Arabic short vowels - if token in self.stop_words: - return token # exclude stop words from being processed - token = self.pre32( - token - ) # remove length three and length two prefixes in this order - token = self.suf32( - token - ) # remove length three and length two suffixes in this order - token = self.waw( - token - ) # remove connective ‘و’ if it precedes a word beginning with ‘و’ - token = self.norm(token, 2) # normalize initial hamza to bare alif - # if 4 <= word length <= 7, then stem; otherwise, no stemming - if len(token) == 4: # length 4 word - token = self.pro_w4(token) - elif len(token) == 5: # length 5 word - token = self.pro_w53(token) - token = self.end_w5(token) - elif len(token) == 6: # length 6 word - token = self.pro_w6(token) - token = self.end_w6(token) - elif len(token) == 7: # length 7 word - token = self.suf1(token) - if len(token) == 7: - token = self.pre1(token) - if len(token) == 6: - token = self.pro_w6(token) - token = self.end_w6(token) - return token - - def norm(self, word, num=3): - """ - normalization: - num=1 normalize diacritics - num=2 normalize initial hamza - num=3 both 1&2 - """ - if num == 1: - word = self.re_short_vowels.sub("", word) - elif num == 2: - word = self.re_initial_hamza.sub("\u0627", word) - elif num == 3: - word = self.re_short_vowels.sub("", word) - word = self.re_initial_hamza.sub("\u0627", word) - return word - - def pre32(self, word): - """remove length three and length two prefixes in this order""" - if len(word) >= 6: - for pre3 in self.p3: - if word.startswith(pre3): - return word[3:] - if len(word) >= 5: - for pre2 in self.p2: - if word.startswith(pre2): - return word[2:] - return word - - def suf32(self, word): - """remove length three and length two suffixes in this order""" - if len(word) >= 6: - for suf3 in self.s3: - if word.endswith(suf3): - return word[:-3] - if len(word) >= 5: - for suf2 in self.s2: - if word.endswith(suf2): - return word[:-2] - return word - - def waw(self, word): - """remove connective ‘و’ if it precedes a word beginning with ‘و’""" - if len(word) >= 4 and word[:2] == "\u0648\u0648": - word = word[1:] - return word - - def pro_w4(self, word): - """process length four patterns and extract length three roots""" - if word[0] in self.pr4[0]: # مفعل - word = word[1:] - elif word[1] in self.pr4[1]: # فاعل - word = word[:1] + word[2:] - elif word[2] in self.pr4[2]: # فعال - فعول - فعيل - word = word[:2] + word[3] - elif word[3] in self.pr4[3]: # فعلة - word = word[:-1] - else: - word = self.suf1(word) # do - normalize short sufix - if len(word) == 4: - word = self.pre1(word) # do - normalize short prefix - return word - - def pro_w53(self, word): - """process length five patterns and extract length three roots""" - if word[2] in self.pr53[0] and word[0] == "\u0627": # افتعل - افاعل - word = word[1] + word[3:] - elif word[3] in self.pr53[1] and word[0] == "\u0645": # مفعول - مفعال - مفعيل - word = word[1:3] + word[4] - elif word[0] in self.pr53[2] and word[4] == "\u0629": # مفعلة - تفعلة - افعلة - word = word[1:4] - elif word[0] in self.pr53[3] and word[2] == "\u062a": # مفتعل - يفتعل - تفتعل - word = word[1] + word[3:] - elif word[0] in self.pr53[4] and word[2] == "\u0627": # مفاعل - تفاعل - word = word[1] + word[3:] - elif word[2] in self.pr53[5] and word[4] == "\u0629": # فعولة - فعالة - word = word[:2] + word[3] - elif word[0] in self.pr53[6] and word[1] == "\u0646": # انفعل - منفعل - word = word[2:] - elif word[3] == 
"\u0627" and word[0] == "\u0627": # افعال - word = word[1:3] + word[4] - elif word[4] == "\u0646" and word[3] == "\u0627": # فعلان - word = word[:3] - elif word[3] == "\u064a" and word[0] == "\u062a": # تفعيل - word = word[1:3] + word[4] - elif word[3] == "\u0648" and word[1] == "\u0627": # فاعول - word = word[0] + word[2] + word[4] - elif word[2] == "\u0627" and word[1] == "\u0648": # فواعل - word = word[0] + word[3:] - elif word[3] == "\u0626" and word[2] == "\u0627": # فعائل - word = word[:2] + word[4] - elif word[4] == "\u0629" and word[1] == "\u0627": # فاعلة - word = word[0] + word[2:4] - elif word[4] == "\u064a" and word[2] == "\u0627": # فعالي - word = word[:2] + word[3] - else: - word = self.suf1(word) # do - normalize short sufix - if len(word) == 5: - word = self.pre1(word) # do - normalize short prefix - return word - - def pro_w54(self, word): - """process length five patterns and extract length four roots""" - if word[0] in self.pr53[2]: # تفعلل - افعلل - مفعلل - word = word[1:] - elif word[4] == "\u0629": # فعللة - word = word[:4] - elif word[2] == "\u0627": # فعالل - word = word[:2] + word[3:] - return word - - def end_w5(self, word): - """ending step (word of length five)""" - if len(word) == 4: - word = self.pro_w4(word) - elif len(word) == 5: - word = self.pro_w54(word) - return word - - def pro_w6(self, word): - """process length six patterns and extract length three roots""" - if word.startswith("\u0627\u0633\u062a") or word.startswith( - "\u0645\u0633\u062a" - ): # مستفعل - استفعل - word = word[3:] - elif ( - word[0] == "\u0645" and word[3] == "\u0627" and word[5] == "\u0629" - ): # مفعالة - word = word[1:3] + word[4] - elif ( - word[0] == "\u0627" and word[2] == "\u062a" and word[4] == "\u0627" - ): # افتعال - word = word[1] + word[3] + word[5] - elif ( - word[0] == "\u0627" and word[3] == "\u0648" and word[2] == word[4] - ): # افعوعل - word = word[1] + word[4:] - elif ( - word[0] == "\u062a" and word[2] == "\u0627" and word[4] == "\u064a" - ): # تفاعيل new pattern - word = word[1] + word[3] + word[5] - else: - word = self.suf1(word) # do - normalize short sufix - if len(word) == 6: - word = self.pre1(word) # do - normalize short prefix - return word - - def pro_w64(self, word): - """process length six patterns and extract length four roots""" - if word[0] == "\u0627" and word[4] == "\u0627": # افعلال - word = word[1:4] + word[5] - elif word.startswith("\u0645\u062a"): # متفعلل - word = word[2:] - return word - - def end_w6(self, word): - """ending step (word of length six)""" - if len(word) == 5: - word = self.pro_w53(word) - word = self.end_w5(word) - elif len(word) == 6: - word = self.pro_w64(word) - return word - - def suf1(self, word): - """normalize short sufix""" - for sf1 in self.s1: - if word.endswith(sf1): - return word[:-1] - return word - - def pre1(self, word): - """normalize short prefix""" - for sp1 in self.p1: - if word.startswith(sp1): - return word[1:] - return word diff --git a/pipeline/nltk/stem/lancaster.py b/pipeline/nltk/stem/lancaster.py deleted file mode 100644 index 40a87331848c9f25332e5e655bc24d85b563c2c2..0000000000000000000000000000000000000000 --- a/pipeline/nltk/stem/lancaster.py +++ /dev/null @@ -1,343 +0,0 @@ -# Natural Language Toolkit: Stemmers -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Tomcavage -# URL: -# For license information, see LICENSE.TXT - -""" -A word stemmer based on the Lancaster (Paice/Husk) stemming algorithm. -Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61. 
-""" -import re - -from nltk.stem.api import StemmerI - - -class LancasterStemmer(StemmerI): - """ - Lancaster Stemmer - - >>> from nltk.stem.lancaster import LancasterStemmer - >>> st = LancasterStemmer() - >>> st.stem('maximum') # Remove "-um" when word is intact - 'maxim' - >>> st.stem('presumably') # Don't remove "-um" when word is not intact - 'presum' - >>> st.stem('multiply') # No action taken if word ends with "-ply" - 'multiply' - >>> st.stem('provision') # Replace "-sion" with "-j" to trigger "j" set of rules - 'provid' - >>> st.stem('owed') # Word starting with vowel must contain at least 2 letters - 'ow' - >>> st.stem('ear') # ditto - 'ear' - >>> st.stem('saying') # Words starting with consonant must contain at least 3 - 'say' - >>> st.stem('crying') # letters and one of those letters must be a vowel - 'cry' - >>> st.stem('string') # ditto - 'string' - >>> st.stem('meant') # ditto - 'meant' - >>> st.stem('cement') # ditto - 'cem' - >>> st_pre = LancasterStemmer(strip_prefix_flag=True) - >>> st_pre.stem('kilometer') # Test Prefix - 'met' - >>> st_custom = LancasterStemmer(rule_tuple=("ssen4>", "s1t.")) - >>> st_custom.stem("ness") # Change s to t - 'nest' - """ - - # The rule list is static since it doesn't change between instances - default_rule_tuple = ( - "ai*2.", # -ia > - if intact - "a*1.", # -a > - if intact - "bb1.", # -bb > -b - "city3s.", # -ytic > -ys - "ci2>", # -ic > - - "cn1t>", # -nc > -nt - "dd1.", # -dd > -d - "dei3y>", # -ied > -y - "deec2ss.", # -ceed >", -cess - "dee1.", # -eed > -ee - "de2>", # -ed > - - "dooh4>", # -hood > - - "e1>", # -e > - - "feil1v.", # -lief > -liev - "fi2>", # -if > - - "gni3>", # -ing > - - "gai3y.", # -iag > -y - "ga2>", # -ag > - - "gg1.", # -gg > -g - "ht*2.", # -th > - if intact - "hsiug5ct.", # -guish > -ct - "hsi3>", # -ish > - - "i*1.", # -i > - if intact - "i1y>", # -i > -y - "ji1d.", # -ij > -id -- see nois4j> & vis3j> - "juf1s.", # -fuj > -fus - "ju1d.", # -uj > -ud - "jo1d.", # -oj > -od - "jeh1r.", # -hej > -her - "jrev1t.", # -verj > -vert - "jsim2t.", # -misj > -mit - "jn1d.", # -nj > -nd - "j1s.", # -j > -s - "lbaifi6.", # -ifiabl > - - "lbai4y.", # -iabl > -y - "lba3>", # -abl > - - "lbi3.", # -ibl > - - "lib2l>", # -bil > -bl - "lc1.", # -cl > c - "lufi4y.", # -iful > -y - "luf3>", # -ful > - - "lu2.", # -ul > - - "lai3>", # -ial > - - "lau3>", # -ual > - - "la2>", # -al > - - "ll1.", # -ll > -l - "mui3.", # -ium > - - "mu*2.", # -um > - if intact - "msi3>", # -ism > - - "mm1.", # -mm > -m - "nois4j>", # -sion > -j - "noix4ct.", # -xion > -ct - "noi3>", # -ion > - - "nai3>", # -ian > - - "na2>", # -an > - - "nee0.", # protect -een - "ne2>", # -en > - - "nn1.", # -nn > -n - "pihs4>", # -ship > - - "pp1.", # -pp > -p - "re2>", # -er > - - "rae0.", # protect -ear - "ra2.", # -ar > - - "ro2>", # -or > - - "ru2>", # -ur > - - "rr1.", # -rr > -r - "rt1>", # -tr > -t - "rei3y>", # -ier > -y - "sei3y>", # -ies > -y - "sis2.", # -sis > -s - "si2>", # -is > - - "ssen4>", # -ness > - - "ss0.", # protect -ss - "suo3>", # -ous > - - "su*2.", # -us > - if intact - "s*1>", # -s > - if intact - "s0.", # -s > -s - "tacilp4y.", # -plicat > -ply - "ta2>", # -at > - - "tnem4>", # -ment > - - "tne3>", # -ent > - - "tna3>", # -ant > - - "tpir2b.", # -ript > -rib - "tpro2b.", # -orpt > -orb - "tcud1.", # -duct > -duc - "tpmus2.", # -sumpt > -sum - "tpec2iv.", # -cept > -ceiv - "tulo2v.", # -olut > -olv - "tsis0.", # protect -sist - "tsi3>", # -ist > - - "tt1.", # -tt > -t - "uqi3.", # -iqu > - - "ugo1.", # -ogu > -og - "vis3j>", # -siv > 
-j - "vie0.", # protect -eiv - "vi2>", # -iv > - - "ylb1>", # -bly > -bl - "yli3y>", # -ily > -y - "ylp0.", # protect -ply - "yl2>", # -ly > - - "ygo1.", # -ogy > -og - "yhp1.", # -phy > -ph - "ymo1.", # -omy > -om - "ypo1.", # -opy > -op - "yti3>", # -ity > - - "yte3>", # -ety > - - "ytl2.", # -lty > -l - "yrtsi5.", # -istry > - - "yra3>", # -ary > - - "yro3>", # -ory > - - "yfi3.", # -ify > - - "ycn2t>", # -ncy > -nt - "yca3>", # -acy > - - "zi2>", # -iz > - - "zy1s.", # -yz > -ys - ) - - def __init__(self, rule_tuple=None, strip_prefix_flag=False): - """Create an instance of the Lancaster stemmer.""" - # Setup an empty rule dictionary - this will be filled in later - self.rule_dictionary = {} - # Check if a user wants to strip prefix - self._strip_prefix = strip_prefix_flag - # Check if a user wants to use his/her own rule tuples. - self._rule_tuple = rule_tuple if rule_tuple else self.default_rule_tuple - - def parseRules(self, rule_tuple=None): - """Validate the set of rules used in this stemmer. - - If this function is called as an individual method, without using stem - method, rule_tuple argument will be compiled into self.rule_dictionary. - If this function is called within stem, self._rule_tuple will be used. - - """ - # If there is no argument for the function, use class' own rule tuple. - rule_tuple = rule_tuple if rule_tuple else self._rule_tuple - valid_rule = re.compile(r"^[a-z]+\*?\d[a-z]*[>\.]?$") - # Empty any old rules from the rule set before adding new ones - self.rule_dictionary = {} - - for rule in rule_tuple: - if not valid_rule.match(rule): - raise ValueError(f"The rule {rule} is invalid") - first_letter = rule[0:1] - if first_letter in self.rule_dictionary: - self.rule_dictionary[first_letter].append(rule) - else: - self.rule_dictionary[first_letter] = [rule] - - def stem(self, word): - """Stem a word using the Lancaster stemmer.""" - # Lower-case the word, since all the rules are lower-cased - word = word.lower() - word = self.__stripPrefix(word) if self._strip_prefix else word - - # Save a copy of the original word - intact_word = word - - # If rule dictionary is empty, parse rule tuple. 
- if not self.rule_dictionary: - self.parseRules() - - return self.__doStemming(word, intact_word) - - def __doStemming(self, word, intact_word): - """Perform the actual word stemming""" - - valid_rule = re.compile(r"^([a-z]+)(\*?)(\d)([a-z]*)([>\.]?)$") - - proceed = True - - while proceed: - - # Find the position of the last letter of the word to be stemmed - last_letter_position = self.__getLastLetter(word) - - # Only stem the word if it has a last letter and a rule matching that last letter - if ( - last_letter_position < 0 - or word[last_letter_position] not in self.rule_dictionary - ): - proceed = False - - else: - rule_was_applied = False - - # Go through each rule that matches the word's final letter - for rule in self.rule_dictionary[word[last_letter_position]]: - rule_match = valid_rule.match(rule) - if rule_match: - ( - ending_string, - intact_flag, - remove_total, - append_string, - cont_flag, - ) = rule_match.groups() - - # Convert the number of chars to remove when stemming - # from a string to an integer - remove_total = int(remove_total) - - # Proceed if word's ending matches rule's word ending - if word.endswith(ending_string[::-1]): - if intact_flag: - if word == intact_word and self.__isAcceptable( - word, remove_total - ): - word = self.__applyRule( - word, remove_total, append_string - ) - rule_was_applied = True - if cont_flag == ".": - proceed = False - break - elif self.__isAcceptable(word, remove_total): - word = self.__applyRule( - word, remove_total, append_string - ) - rule_was_applied = True - if cont_flag == ".": - proceed = False - break - # If no rules apply, the word doesn't need any more stemming - if rule_was_applied == False: - proceed = False - return word - - def __getLastLetter(self, word): - """Get the zero-based index of the last alphabetic character in this string""" - last_letter = -1 - for position in range(len(word)): - if word[position].isalpha(): - last_letter = position - else: - break - return last_letter - - def __isAcceptable(self, word, remove_total): - """Determine if the word is acceptable for stemming.""" - word_is_acceptable = False - # If the word starts with a vowel, it must be at least 2 - # characters long to be stemmed - if word[0] in "aeiouy": - if len(word) - remove_total >= 2: - word_is_acceptable = True - # If the word starts with a consonant, it must be at least 3 - # characters long (including one vowel) to be stemmed - elif len(word) - remove_total >= 3: - if word[1] in "aeiouy": - word_is_acceptable = True - elif word[2] in "aeiouy": - word_is_acceptable = True - return word_is_acceptable - - def __applyRule(self, word, remove_total, append_string): - """Apply the stemming rule to the word""" - # Remove letters from the end of the word - new_word_length = len(word) - remove_total - word = word[0:new_word_length] - - # And add new letters to the end of the truncated word - if append_string: - word += append_string - return word - - def __stripPrefix(self, word): - """Remove prefix from a word. - - This function originally taken from Whoosh. 
- - """ - for prefix in ( - "kilo", - "micro", - "milli", - "intra", - "ultra", - "mega", - "nano", - "pico", - "pseudo", - ): - if word.startswith(prefix): - return word[len(prefix) :] - return word - - def __repr__(self): - return "" diff --git a/pipeline/nltk/stem/porter.py b/pipeline/nltk/stem/porter.py deleted file mode 100644 index c84402d8083677ea9e727f5f5b0998529ad96ba6..0000000000000000000000000000000000000000 --- a/pipeline/nltk/stem/porter.py +++ /dev/null @@ -1,715 +0,0 @@ -""" -Porter Stemmer - -This is the Porter stemming algorithm. It follows the algorithm -presented in - -Porter, M. "An algorithm for suffix stripping." Program 14.3 (1980): 130-137. - -with some optional deviations that can be turned on or off with the -`mode` argument to the constructor. - -Martin Porter, the algorithm's inventor, maintains a web page about the -algorithm at - - https://www.tartarus.org/~martin/PorterStemmer/ - -which includes another Python implementation and other implementations -in many languages. -""" - -__docformat__ = "plaintext" - -import re - -from nltk.stem.api import StemmerI - - -class PorterStemmer(StemmerI): - """ - A word stemmer based on the Porter stemming algorithm. - - Porter, M. "An algorithm for suffix stripping." - Program 14.3 (1980): 130-137. - - See https://www.tartarus.org/~martin/PorterStemmer/ for the homepage - of the algorithm. - - Martin Porter has endorsed several modifications to the Porter - algorithm since writing his original paper, and those extensions are - included in the implementations on his website. Additionally, others - have proposed further improvements to the algorithm, including NLTK - contributors. There are thus three modes that can be selected by - passing the appropriate constant to the class constructor's `mode` - attribute: - - - PorterStemmer.ORIGINAL_ALGORITHM - - An implementation that is faithful to the original paper. - - Note that Martin Porter has deprecated this version of the - algorithm. Martin distributes implementations of the Porter - Stemmer in many languages, hosted at: - - https://www.tartarus.org/~martin/PorterStemmer/ - - and all of these implementations include his extensions. He - strongly recommends against using the original, published - version of the algorithm; only use this mode if you clearly - understand why you are choosing to do so. - - - PorterStemmer.MARTIN_EXTENSIONS - - An implementation that only uses the modifications to the - algorithm that are included in the implementations on Martin - Porter's website. He has declared Porter frozen, so the - behaviour of those implementations should never change. - - - PorterStemmer.NLTK_EXTENSIONS (default) - - An implementation that includes further improvements devised by - NLTK contributors or taken from other modified implementations - found on the web. - - For the best stemming, you should use the default NLTK_EXTENSIONS - version. However, if you need to get the same results as either the - original algorithm or one of Martin Porter's hosted versions for - compatibility with an existing implementation or dataset, you can use - one of the other modes instead. 
- """ - - # Modes the Stemmer can be instantiated in - NLTK_EXTENSIONS = "NLTK_EXTENSIONS" - MARTIN_EXTENSIONS = "MARTIN_EXTENSIONS" - ORIGINAL_ALGORITHM = "ORIGINAL_ALGORITHM" - - def __init__(self, mode=NLTK_EXTENSIONS): - if mode not in ( - self.NLTK_EXTENSIONS, - self.MARTIN_EXTENSIONS, - self.ORIGINAL_ALGORITHM, - ): - raise ValueError( - "Mode must be one of PorterStemmer.NLTK_EXTENSIONS, " - "PorterStemmer.MARTIN_EXTENSIONS, or " - "PorterStemmer.ORIGINAL_ALGORITHM" - ) - - self.mode = mode - - if self.mode == self.NLTK_EXTENSIONS: - # This is a table of irregular forms. It is quite short, - # but still reflects the errors actually drawn to Martin - # Porter's attention over a 20 year period! - irregular_forms = { - "sky": ["sky", "skies"], - "die": ["dying"], - "lie": ["lying"], - "tie": ["tying"], - "news": ["news"], - "inning": ["innings", "inning"], - "outing": ["outings", "outing"], - "canning": ["cannings", "canning"], - "howe": ["howe"], - "proceed": ["proceed"], - "exceed": ["exceed"], - "succeed": ["succeed"], - } - - self.pool = {} - for key in irregular_forms: - for val in irregular_forms[key]: - self.pool[val] = key - - self.vowels = frozenset(["a", "e", "i", "o", "u"]) - - def _is_consonant(self, word, i): - """Returns True if word[i] is a consonant, False otherwise - - A consonant is defined in the paper as follows: - - A consonant in a word is a letter other than A, E, I, O or - U, and other than Y preceded by a consonant. (The fact that - the term `consonant' is defined to some extent in terms of - itself does not make it ambiguous.) So in TOY the consonants - are T and Y, and in SYZYGY they are S, Z and G. If a letter - is not a consonant it is a vowel. - """ - if word[i] in self.vowels: - return False - if word[i] == "y": - if i == 0: - return True - else: - return not self._is_consonant(word, i - 1) - return True - - def _measure(self, stem): - r"""Returns the 'measure' of stem, per definition in the paper - - From the paper: - - A consonant will be denoted by c, a vowel by v. A list - ccc... of length greater than 0 will be denoted by C, and a - list vvv... of length greater than 0 will be denoted by V. - Any word, or part of a word, therefore has one of the four - forms: - - CVCV ... C - CVCV ... V - VCVC ... C - VCVC ... V - - These may all be represented by the single form - - [C]VCVC ... [V] - - where the square brackets denote arbitrary presence of their - contents. Using (VC){m} to denote VC repeated m times, this - may again be written as - - [C](VC){m}[V]. - - m will be called the \measure\ of any word or word part when - represented in this form. The case m = 0 covers the null - word. Here are some examples: - - m=0 TR, EE, TREE, Y, BY. - m=1 TROUBLE, OATS, TREES, IVY. - m=2 TROUBLES, PRIVATE, OATEN, ORRERY. - """ - cv_sequence = "" - - # Construct a string of 'c's and 'v's representing whether each - # character in `stem` is a consonant or a vowel. - # e.g. 
'falafel' becomes 'cvcvcvc', - # 'architecture' becomes 'vcccvcvccvcv' - for i in range(len(stem)): - if self._is_consonant(stem, i): - cv_sequence += "c" - else: - cv_sequence += "v" - - # Count the number of 'vc' occurrences, which is equivalent to - # the number of 'VC' occurrences in Porter's reduced form in the - # docstring above, which is in turn equivalent to `m` - return cv_sequence.count("vc") - - def _has_positive_measure(self, stem): - return self._measure(stem) > 0 - - def _contains_vowel(self, stem): - """Returns True if stem contains a vowel, else False""" - for i in range(len(stem)): - if not self._is_consonant(stem, i): - return True - return False - - def _ends_double_consonant(self, word): - """Implements condition *d from the paper - - Returns True if word ends with a double consonant - """ - return ( - len(word) >= 2 - and word[-1] == word[-2] - and self._is_consonant(word, len(word) - 1) - ) - - def _ends_cvc(self, word): - """Implements condition *o from the paper - - From the paper: - - *o - the stem ends cvc, where the second c is not W, X or Y - (e.g. -WIL, -HOP). - """ - return ( - len(word) >= 3 - and self._is_consonant(word, len(word) - 3) - and not self._is_consonant(word, len(word) - 2) - and self._is_consonant(word, len(word) - 1) - and word[-1] not in ("w", "x", "y") - ) or ( - self.mode == self.NLTK_EXTENSIONS - and len(word) == 2 - and not self._is_consonant(word, 0) - and self._is_consonant(word, 1) - ) - - def _replace_suffix(self, word, suffix, replacement): - """Replaces `suffix` of `word` with `replacement""" - assert word.endswith(suffix), "Given word doesn't end with given suffix" - if suffix == "": - return word + replacement - else: - return word[: -len(suffix)] + replacement - - def _apply_rule_list(self, word, rules): - """Applies the first applicable suffix-removal rule to the word - - Takes a word and a list of suffix-removal rules represented as - 3-tuples, with the first element being the suffix to remove, - the second element being the string to replace it with, and the - final element being the condition for the rule to be applicable, - or None if the rule is unconditional. 
- """ - for rule in rules: - suffix, replacement, condition = rule - if suffix == "*d" and self._ends_double_consonant(word): - stem = word[:-2] - if condition is None or condition(stem): - return stem + replacement - else: - # Don't try any further rules - return word - if word.endswith(suffix): - stem = self._replace_suffix(word, suffix, "") - if condition is None or condition(stem): - return stem + replacement - else: - # Don't try any further rules - return word - - return word - - def _step1a(self, word): - """Implements Step 1a from "An algorithm for suffix stripping" - - From the paper: - - SSES -> SS caresses -> caress - IES -> I ponies -> poni - ties -> ti - SS -> SS caress -> caress - S -> cats -> cat - """ - # this NLTK-only rule extends the original algorithm, so - # that 'flies'->'fli' but 'dies'->'die' etc - if self.mode == self.NLTK_EXTENSIONS: - if word.endswith("ies") and len(word) == 4: - return self._replace_suffix(word, "ies", "ie") - - return self._apply_rule_list( - word, - [ - ("sses", "ss", None), # SSES -> SS - ("ies", "i", None), # IES -> I - ("ss", "ss", None), # SS -> SS - ("s", "", None), # S -> - ], - ) - - def _step1b(self, word): - """Implements Step 1b from "An algorithm for suffix stripping" - - From the paper: - - (m>0) EED -> EE feed -> feed - agreed -> agree - (*v*) ED -> plastered -> plaster - bled -> bled - (*v*) ING -> motoring -> motor - sing -> sing - - If the second or third of the rules in Step 1b is successful, - the following is done: - - AT -> ATE conflat(ed) -> conflate - BL -> BLE troubl(ed) -> trouble - IZ -> IZE siz(ed) -> size - (*d and not (*L or *S or *Z)) - -> single letter - hopp(ing) -> hop - tann(ed) -> tan - fall(ing) -> fall - hiss(ing) -> hiss - fizz(ed) -> fizz - (m=1 and *o) -> E fail(ing) -> fail - fil(ing) -> file - - The rule to map to a single letter causes the removal of one of - the double letter pair. The -E is put back on -AT, -BL and -IZ, - so that the suffixes -ATE, -BLE and -IZE can be recognised - later. This E may be removed in step 4. 
- """ - # this NLTK-only block extends the original algorithm, so that - # 'spied'->'spi' but 'died'->'die' etc - if self.mode == self.NLTK_EXTENSIONS: - if word.endswith("ied"): - if len(word) == 4: - return self._replace_suffix(word, "ied", "ie") - else: - return self._replace_suffix(word, "ied", "i") - - # (m>0) EED -> EE - if word.endswith("eed"): - stem = self._replace_suffix(word, "eed", "") - if self._measure(stem) > 0: - return stem + "ee" - else: - return word - - rule_2_or_3_succeeded = False - - for suffix in ["ed", "ing"]: - if word.endswith(suffix): - intermediate_stem = self._replace_suffix(word, suffix, "") - if self._contains_vowel(intermediate_stem): - rule_2_or_3_succeeded = True - break - - if not rule_2_or_3_succeeded: - return word - - return self._apply_rule_list( - intermediate_stem, - [ - ("at", "ate", None), # AT -> ATE - ("bl", "ble", None), # BL -> BLE - ("iz", "ize", None), # IZ -> IZE - # (*d and not (*L or *S or *Z)) - # -> single letter - ( - "*d", - intermediate_stem[-1], - lambda stem: intermediate_stem[-1] not in ("l", "s", "z"), - ), - # (m=1 and *o) -> E - ( - "", - "e", - lambda stem: (self._measure(stem) == 1 and self._ends_cvc(stem)), - ), - ], - ) - - def _step1c(self, word): - """Implements Step 1c from "An algorithm for suffix stripping" - - From the paper: - - Step 1c - - (*v*) Y -> I happy -> happi - sky -> sky - """ - - def nltk_condition(stem): - """ - This has been modified from the original Porter algorithm so - that y->i is only done when y is preceded by a consonant, - but not if the stem is only a single consonant, i.e. - - (*c and not c) Y -> I - - So 'happy' -> 'happi', but - 'enjoy' -> 'enjoy' etc - - This is a much better rule. Formerly 'enjoy'->'enjoi' and - 'enjoyment'->'enjoy'. Step 1c is perhaps done too soon; but - with this modification that no longer really matters. - - Also, the removal of the contains_vowel(z) condition means - that 'spy', 'fly', 'try' ... stem to 'spi', 'fli', 'tri' and - conflate with 'spied', 'tried', 'flies' ... 
- """ - return len(stem) > 1 and self._is_consonant(stem, len(stem) - 1) - - def original_condition(stem): - return self._contains_vowel(stem) - - return self._apply_rule_list( - word, - [ - ( - "y", - "i", - nltk_condition - if self.mode == self.NLTK_EXTENSIONS - else original_condition, - ) - ], - ) - - def _step2(self, word): - """Implements Step 2 from "An algorithm for suffix stripping" - - From the paper: - - Step 2 - - (m>0) ATIONAL -> ATE relational -> relate - (m>0) TIONAL -> TION conditional -> condition - rational -> rational - (m>0) ENCI -> ENCE valenci -> valence - (m>0) ANCI -> ANCE hesitanci -> hesitance - (m>0) IZER -> IZE digitizer -> digitize - (m>0) ABLI -> ABLE conformabli -> conformable - (m>0) ALLI -> AL radicalli -> radical - (m>0) ENTLI -> ENT differentli -> different - (m>0) ELI -> E vileli - > vile - (m>0) OUSLI -> OUS analogousli -> analogous - (m>0) IZATION -> IZE vietnamization -> vietnamize - (m>0) ATION -> ATE predication -> predicate - (m>0) ATOR -> ATE operator -> operate - (m>0) ALISM -> AL feudalism -> feudal - (m>0) IVENESS -> IVE decisiveness -> decisive - (m>0) FULNESS -> FUL hopefulness -> hopeful - (m>0) OUSNESS -> OUS callousness -> callous - (m>0) ALITI -> AL formaliti -> formal - (m>0) IVITI -> IVE sensitiviti -> sensitive - (m>0) BILITI -> BLE sensibiliti -> sensible - """ - - if self.mode == self.NLTK_EXTENSIONS: - # Instead of applying the ALLI -> AL rule after '(a)bli' per - # the published algorithm, instead we apply it first, and, - # if it succeeds, run the result through step2 again. - if word.endswith("alli") and self._has_positive_measure( - self._replace_suffix(word, "alli", "") - ): - return self._step2(self._replace_suffix(word, "alli", "al")) - - bli_rule = ("bli", "ble", self._has_positive_measure) - abli_rule = ("abli", "able", self._has_positive_measure) - - rules = [ - ("ational", "ate", self._has_positive_measure), - ("tional", "tion", self._has_positive_measure), - ("enci", "ence", self._has_positive_measure), - ("anci", "ance", self._has_positive_measure), - ("izer", "ize", self._has_positive_measure), - abli_rule if self.mode == self.ORIGINAL_ALGORITHM else bli_rule, - ("alli", "al", self._has_positive_measure), - ("entli", "ent", self._has_positive_measure), - ("eli", "e", self._has_positive_measure), - ("ousli", "ous", self._has_positive_measure), - ("ization", "ize", self._has_positive_measure), - ("ation", "ate", self._has_positive_measure), - ("ator", "ate", self._has_positive_measure), - ("alism", "al", self._has_positive_measure), - ("iveness", "ive", self._has_positive_measure), - ("fulness", "ful", self._has_positive_measure), - ("ousness", "ous", self._has_positive_measure), - ("aliti", "al", self._has_positive_measure), - ("iviti", "ive", self._has_positive_measure), - ("biliti", "ble", self._has_positive_measure), - ] - - if self.mode == self.NLTK_EXTENSIONS: - rules.append(("fulli", "ful", self._has_positive_measure)) - - # The 'l' of the 'logi' -> 'log' rule is put with the stem, - # so that short stems like 'geo' 'theo' etc work like - # 'archaeo' 'philo' etc. 
- rules.append( - ("logi", "log", lambda stem: self._has_positive_measure(word[:-3])) - ) - - if self.mode == self.MARTIN_EXTENSIONS: - rules.append(("logi", "log", self._has_positive_measure)) - - return self._apply_rule_list(word, rules) - - def _step3(self, word): - """Implements Step 3 from "An algorithm for suffix stripping" - - From the paper: - - Step 3 - - (m>0) ICATE -> IC triplicate -> triplic - (m>0) ATIVE -> formative -> form - (m>0) ALIZE -> AL formalize -> formal - (m>0) ICITI -> IC electriciti -> electric - (m>0) ICAL -> IC electrical -> electric - (m>0) FUL -> hopeful -> hope - (m>0) NESS -> goodness -> good - """ - return self._apply_rule_list( - word, - [ - ("icate", "ic", self._has_positive_measure), - ("ative", "", self._has_positive_measure), - ("alize", "al", self._has_positive_measure), - ("iciti", "ic", self._has_positive_measure), - ("ical", "ic", self._has_positive_measure), - ("ful", "", self._has_positive_measure), - ("ness", "", self._has_positive_measure), - ], - ) - - def _step4(self, word): - """Implements Step 4 from "An algorithm for suffix stripping" - - Step 4 - - (m>1) AL -> revival -> reviv - (m>1) ANCE -> allowance -> allow - (m>1) ENCE -> inference -> infer - (m>1) ER -> airliner -> airlin - (m>1) IC -> gyroscopic -> gyroscop - (m>1) ABLE -> adjustable -> adjust - (m>1) IBLE -> defensible -> defens - (m>1) ANT -> irritant -> irrit - (m>1) EMENT -> replacement -> replac - (m>1) MENT -> adjustment -> adjust - (m>1) ENT -> dependent -> depend - (m>1 and (*S or *T)) ION -> adoption -> adopt - (m>1) OU -> homologou -> homolog - (m>1) ISM -> communism -> commun - (m>1) ATE -> activate -> activ - (m>1) ITI -> angulariti -> angular - (m>1) OUS -> homologous -> homolog - (m>1) IVE -> effective -> effect - (m>1) IZE -> bowdlerize -> bowdler - - The suffixes are now removed. All that remains is a little - tidying up. - """ - measure_gt_1 = lambda stem: self._measure(stem) > 1 - - return self._apply_rule_list( - word, - [ - ("al", "", measure_gt_1), - ("ance", "", measure_gt_1), - ("ence", "", measure_gt_1), - ("er", "", measure_gt_1), - ("ic", "", measure_gt_1), - ("able", "", measure_gt_1), - ("ible", "", measure_gt_1), - ("ant", "", measure_gt_1), - ("ement", "", measure_gt_1), - ("ment", "", measure_gt_1), - ("ent", "", measure_gt_1), - # (m>1 and (*S or *T)) ION -> - ( - "ion", - "", - lambda stem: self._measure(stem) > 1 and stem[-1] in ("s", "t"), - ), - ("ou", "", measure_gt_1), - ("ism", "", measure_gt_1), - ("ate", "", measure_gt_1), - ("iti", "", measure_gt_1), - ("ous", "", measure_gt_1), - ("ive", "", measure_gt_1), - ("ize", "", measure_gt_1), - ], - ) - - def _step5a(self, word): - """Implements Step 5a from "An algorithm for suffix stripping" - - From the paper: - - Step 5a - - (m>1) E -> probate -> probat - rate -> rate - (m=1 and not *o) E -> cease -> ceas - """ - # Note that Martin's test vocabulary and reference - # implementations are inconsistent in how they handle the case - # where two rules both refer to a suffix that matches the word - # to be stemmed, but only the condition of the second one is - # true. - # Earlier in step2b we had the rules: - # (m>0) EED -> EE - # (*v*) ED -> - # but the examples in the paper included "feed"->"feed", even - # though (*v*) is true for "fe" and therefore the second rule - # alone would map "feed"->"fe". - # However, in THIS case, we need to handle the consecutive rules - # differently and try both conditions (obviously; the second - # rule here would be redundant otherwise). 
Martin's paper makes - # no explicit mention of the inconsistency; you have to infer it - # from the examples. - # For this reason, we can't use _apply_rule_list here. - if word.endswith("e"): - stem = self._replace_suffix(word, "e", "") - if self._measure(stem) > 1: - return stem - if self._measure(stem) == 1 and not self._ends_cvc(stem): - return stem - return word - - def _step5b(self, word): - """Implements Step 5a from "An algorithm for suffix stripping" - - From the paper: - - Step 5b - - (m > 1 and *d and *L) -> single letter - controll -> control - roll -> roll - """ - return self._apply_rule_list( - word, [("ll", "l", lambda stem: self._measure(word[:-1]) > 1)] - ) - - def stem(self, word, to_lowercase=True): - """ - :param to_lowercase: if `to_lowercase=True` the word always lowercase - """ - stem = word.lower() if to_lowercase else word - - if self.mode == self.NLTK_EXTENSIONS and word in self.pool: - return self.pool[stem] - - if self.mode != self.ORIGINAL_ALGORITHM and len(word) <= 2: - # With this line, strings of length 1 or 2 don't go through - # the stemming process, although no mention is made of this - # in the published algorithm. - return stem - - stem = self._step1a(stem) - stem = self._step1b(stem) - stem = self._step1c(stem) - stem = self._step2(stem) - stem = self._step3(stem) - stem = self._step4(stem) - stem = self._step5a(stem) - stem = self._step5b(stem) - - return stem - - def __repr__(self): - return "" - - -def demo(): - """ - A demonstration of the porter stemmer on a sample from - the Penn Treebank corpus. - """ - - from nltk import stem - from nltk.corpus import treebank - - stemmer = stem.PorterStemmer() - - orig = [] - stemmed = [] - for item in treebank.fileids()[:3]: - for (word, tag) in treebank.tagged_words(item): - orig.append(word) - stemmed.append(stemmer.stem(word)) - - # Convert the results to a string, and word-wrap them. - results = " ".join(stemmed) - results = re.sub(r"(.{,70})\s", r"\1\n", results + " ").rstrip() - - # Convert the original to a string, and word wrap it. - original = " ".join(orig) - original = re.sub(r"(.{,70})\s", r"\1\n", original + " ").rstrip() - - # Print the results. - print("-Original-".center(70).replace(" ", "*").replace("-", " ")) - print(original) - print("-Results-".center(70).replace(" ", "*").replace("-", " ")) - print(results) - print("*" * 70) diff --git a/pipeline/nltk/stem/regexp.py b/pipeline/nltk/stem/regexp.py deleted file mode 100644 index 473b42bd4a194bc11a51db9db7a00178a945862a..0000000000000000000000000000000000000000 --- a/pipeline/nltk/stem/regexp.py +++ /dev/null @@ -1,56 +0,0 @@ -# Natural Language Toolkit: Stemmers -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Trevor Cohn -# Edward Loper -# Steven Bird -# URL: -# For license information, see LICENSE.TXT -import re - -from nltk.stem.api import StemmerI - - -class RegexpStemmer(StemmerI): - """ - A stemmer that uses regular expressions to identify morphological - affixes. Any substrings that match the regular expressions will - be removed. - - >>> from nltk.stem import RegexpStemmer - >>> st = RegexpStemmer('ing$|s$|e$|able$', min=4) - >>> st.stem('cars') - 'car' - >>> st.stem('mass') - 'mas' - >>> st.stem('was') - 'was' - >>> st.stem('bee') - 'bee' - >>> st.stem('compute') - 'comput' - >>> st.stem('advisable') - 'advis' - - :type regexp: str or regexp - :param regexp: The regular expression that should be used to - identify morphological affixes. 
- :type min: int - :param min: The minimum length of string to stem - """ - - def __init__(self, regexp, min=0): - - if not hasattr(regexp, "pattern"): - regexp = re.compile(regexp) - self._regexp = regexp - self._min = min - - def stem(self, word): - if len(word) < self._min: - return word - else: - return self._regexp.sub("", word) - - def __repr__(self): - return f"" diff --git a/pipeline/nltk/stem/rslp.py b/pipeline/nltk/stem/rslp.py deleted file mode 100644 index b1dfeb35e09643e2e75af68cac3bcc7632fc2245..0000000000000000000000000000000000000000 --- a/pipeline/nltk/stem/rslp.py +++ /dev/null @@ -1,137 +0,0 @@ -# Natural Language Toolkit: RSLP Stemmer -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Tiago Tresoldi -# URL: -# For license information, see LICENSE.TXT - -# This code is based on the algorithm presented in the paper "A Stemming -# Algorithm for the Portuguese Language" by Viviane Moreira Orengo and -# Christian Huyck, which unfortunately I had no access to. The code is a -# Python version, with some minor modifications of mine, to the description -# presented at https://www.webcitation.org/5NnvdIzOb and to the C source code -# available at http://www.inf.ufrgs.br/~arcoelho/rslp/integrando_rslp.html. -# Please note that this stemmer is intended for demonstration and educational -# purposes only. Feel free to write me for any comments, including the -# development of a different and/or better stemmer for Portuguese. I also -# suggest using NLTK's mailing list for Portuguese for any discussion. - -# Este código é baseado no algoritmo apresentado no artigo "A Stemming -# Algorithm for the Portuguese Language" de Viviane Moreira Orengo e -# Christian Huyck, o qual infelizmente não tive a oportunidade de ler. O -# código é uma conversão para Python, com algumas pequenas modificações -# minhas, daquele apresentado em https://www.webcitation.org/5NnvdIzOb e do -# código para linguagem C disponível em -# http://www.inf.ufrgs.br/~arcoelho/rslp/integrando_rslp.html. Por favor, -# lembre-se de que este stemmer foi desenvolvido com finalidades unicamente -# de demonstração e didáticas. Sinta-se livre para me escrever para qualquer -# comentário, inclusive sobre o desenvolvimento de um stemmer diferente -# e/ou melhor para o português. Também sugiro utilizar-se a lista de discussão -# do NLTK para o português para qualquer debate. - -from nltk.data import load -from nltk.stem.api import StemmerI - - -class RSLPStemmer(StemmerI): - """ - A stemmer for Portuguese. - - >>> from nltk.stem import RSLPStemmer - >>> st = RSLPStemmer() - >>> # opening lines of Erico Verissimo's "Música ao Longe" - >>> text = ''' - ... Clarissa risca com giz no quadro-negro a paisagem que os alunos - ... devem copiar . Uma casinha de porta e janela , em cima duma - ... coxilha .''' - >>> for token in text.split(): # doctest: +NORMALIZE_WHITESPACE - ... print(st.stem(token)) - clariss risc com giz no quadro-negr a pais que os alun dev copi . - uma cas de port e janel , em cim dum coxilh . 
- """ - - def __init__(self): - self._model = [] - - self._model.append(self.read_rule("step0.pt")) - self._model.append(self.read_rule("step1.pt")) - self._model.append(self.read_rule("step2.pt")) - self._model.append(self.read_rule("step3.pt")) - self._model.append(self.read_rule("step4.pt")) - self._model.append(self.read_rule("step5.pt")) - self._model.append(self.read_rule("step6.pt")) - - def read_rule(self, filename): - rules = load("nltk:stemmers/rslp/" + filename, format="raw").decode("utf8") - lines = rules.split("\n") - - lines = [line for line in lines if line != ""] # remove blank lines - lines = [line for line in lines if line[0] != "#"] # remove comments - - # NOTE: a simple but ugly hack to make this parser happy with double '\t's - lines = [line.replace("\t\t", "\t") for line in lines] - - # parse rules - rules = [] - for line in lines: - rule = [] - tokens = line.split("\t") - - # text to be searched for at the end of the string - rule.append(tokens[0][1:-1]) # remove quotes - - # minimum stem size to perform the replacement - rule.append(int(tokens[1])) - - # text to be replaced into - rule.append(tokens[2][1:-1]) # remove quotes - - # exceptions to this rule - rule.append([token[1:-1] for token in tokens[3].split(",")]) - - # append to the results - rules.append(rule) - - return rules - - def stem(self, word): - word = word.lower() - - # the word ends in 's'? apply rule for plural reduction - if word[-1] == "s": - word = self.apply_rule(word, 0) - - # the word ends in 'a'? apply rule for feminine reduction - if word[-1] == "a": - word = self.apply_rule(word, 1) - - # augmentative reduction - word = self.apply_rule(word, 3) - - # adverb reduction - word = self.apply_rule(word, 2) - - # noun reduction - prev_word = word - word = self.apply_rule(word, 4) - if word == prev_word: - # verb reduction - prev_word = word - word = self.apply_rule(word, 5) - if word == prev_word: - # vowel removal - word = self.apply_rule(word, 6) - - return word - - def apply_rule(self, word, rule_index): - rules = self._model[rule_index] - for rule in rules: - suffix_length = len(rule[0]) - if word[-suffix_length:] == rule[0]: # if suffix matches - if len(word) >= suffix_length + rule[1]: # if we have minimum size - if word not in rule[3]: # if not an exception - word = word[:-suffix_length] + rule[2] - break - - return word diff --git a/pipeline/nltk/stem/snowball.py b/pipeline/nltk/stem/snowball.py deleted file mode 100644 index 08cd9e76993213eafb0d1698f3f9b019af21068d..0000000000000000000000000000000000000000 --- a/pipeline/nltk/stem/snowball.py +++ /dev/null @@ -1,5946 +0,0 @@ -# -# Natural Language Toolkit: Snowball Stemmer -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Peter Michael Stahl -# Peter Ljunglof (revisions) -# Lakhdar Benzahia (co-writer) -# Assem Chelli (reviewer arabicstemmer) -# Abdelkrim Aries (reviewer arabicstemmer) -# Algorithms: Dr Martin Porter -# Assem Chelli arabic stemming algorithm -# Benzahia Lakhdar -# URL: -# For license information, see LICENSE.TXT - -""" -Snowball stemmers - -This module provides a port of the Snowball stemmers -developed by Martin Porter. - -There is also a demo function: `snowball.demo()`. 
- -""" - -import re - -from nltk.corpus import stopwords -from nltk.stem import porter -from nltk.stem.api import StemmerI -from nltk.stem.util import prefix_replace, suffix_replace - - -class SnowballStemmer(StemmerI): - - """ - Snowball Stemmer - - The following languages are supported: - Arabic, Danish, Dutch, English, Finnish, French, German, - Hungarian, Italian, Norwegian, Portuguese, Romanian, Russian, - Spanish and Swedish. - - The algorithm for English is documented here: - - Porter, M. \"An algorithm for suffix stripping.\" - Program 14.3 (1980): 130-137. - - The algorithms have been developed by Martin Porter. - These stemmers are called Snowball, because Porter created - a programming language with this name for creating - new stemming algorithms. There is more information available - at http://snowball.tartarus.org/ - - The stemmer is invoked as shown below: - - >>> from nltk.stem import SnowballStemmer # See which languages are supported - >>> print(" ".join(SnowballStemmer.languages)) # doctest: +NORMALIZE_WHITESPACE - arabic danish dutch english finnish french german hungarian - italian norwegian porter portuguese romanian russian - spanish swedish - >>> stemmer = SnowballStemmer("german") # Choose a language - >>> stemmer.stem("Autobahnen") # Stem a word - 'autobahn' - - Invoking the stemmers that way is useful if you do not know the - language to be stemmed at runtime. Alternatively, if you already know - the language, then you can invoke the language specific stemmer directly: - - >>> from nltk.stem.snowball import GermanStemmer - >>> stemmer = GermanStemmer() - >>> stemmer.stem("Autobahnen") - 'autobahn' - - :param language: The language whose subclass is instantiated. - :type language: str or unicode - :param ignore_stopwords: If set to True, stopwords are - not stemmed and returned unchanged. - Set to False by default. - :type ignore_stopwords: bool - :raise ValueError: If there is no stemmer for the specified - language, a ValueError is raised. - """ - - languages = ( - "arabic", - "danish", - "dutch", - "english", - "finnish", - "french", - "german", - "hungarian", - "italian", - "norwegian", - "porter", - "portuguese", - "romanian", - "russian", - "spanish", - "swedish", - ) - - def __init__(self, language, ignore_stopwords=False): - if language not in self.languages: - raise ValueError(f"The language '{language}' is not supported.") - stemmerclass = globals()[language.capitalize() + "Stemmer"] - self.stemmer = stemmerclass(ignore_stopwords) - self.stem = self.stemmer.stem - self.stopwords = self.stemmer.stopwords - - def stem(self, token): - return self.stemmer.stem(self, token) - - -class _LanguageSpecificStemmer(StemmerI): - - """ - This helper subclass offers the possibility - to invoke a specific stemmer directly. - This is useful if you already know the language to be stemmed at runtime. - - Create an instance of the Snowball stemmer. - - :param ignore_stopwords: If set to True, stopwords are - not stemmed and returned unchanged. - Set to False by default. - :type ignore_stopwords: bool - """ - - def __init__(self, ignore_stopwords=False): - # The language is the name of the class, minus the final "Stemmer". - language = type(self).__name__.lower() - if language.endswith("stemmer"): - language = language[:-7] - - self.stopwords = set() - if ignore_stopwords: - try: - for word in stopwords.words(language): - self.stopwords.add(word) - except OSError as e: - raise ValueError( - "{!r} has no list of stopwords. 
Please set" - " 'ignore_stopwords' to 'False'.".format(self) - ) from e - - def __repr__(self): - """ - Print out the string representation of the respective class. - - """ - return f"<{type(self).__name__}>" - - -class PorterStemmer(_LanguageSpecificStemmer, porter.PorterStemmer): - """ - A word stemmer based on the original Porter stemming algorithm. - - Porter, M. \"An algorithm for suffix stripping.\" - Program 14.3 (1980): 130-137. - - A few minor modifications have been made to Porter's basic - algorithm. See the source code of the module - nltk.stem.porter for more information. - - """ - - def __init__(self, ignore_stopwords=False): - _LanguageSpecificStemmer.__init__(self, ignore_stopwords) - porter.PorterStemmer.__init__(self) - - -class _ScandinavianStemmer(_LanguageSpecificStemmer): - - """ - This subclass encapsulates a method for defining the string region R1. - It is used by the Danish, Norwegian, and Swedish stemmer. - - """ - - def _r1_scandinavian(self, word, vowels): - """ - Return the region R1 that is used by the Scandinavian stemmers. - - R1 is the region after the first non-vowel following a vowel, - or is the null region at the end of the word if there is no - such non-vowel. But then R1 is adjusted so that the region - before it contains at least three letters. - - :param word: The word whose region R1 is determined. - :type word: str or unicode - :param vowels: The vowels of the respective language that are - used to determine the region R1. - :type vowels: unicode - :return: the region R1 for the respective word. - :rtype: unicode - :note: This helper method is invoked by the respective stem method of - the subclasses DanishStemmer, NorwegianStemmer, and - SwedishStemmer. It is not to be invoked directly! - - """ - r1 = "" - for i in range(1, len(word)): - if word[i] not in vowels and word[i - 1] in vowels: - if 3 > len(word[: i + 1]) > 0: - r1 = word[3:] - elif len(word[: i + 1]) >= 3: - r1 = word[i + 1 :] - else: - return word - break - - return r1 - - -class _StandardStemmer(_LanguageSpecificStemmer): - - """ - This subclass encapsulates two methods for defining the standard versions - of the string regions R1, R2, and RV. - - """ - - def _r1r2_standard(self, word, vowels): - """ - Return the standard interpretations of the string regions R1 and R2. - - R1 is the region after the first non-vowel following a vowel, - or is the null region at the end of the word if there is no - such non-vowel. - - R2 is the region after the first non-vowel following a vowel - in R1, or is the null region at the end of the word if there - is no such non-vowel. - - :param word: The word whose regions R1 and R2 are determined. - :type word: str or unicode - :param vowels: The vowels of the respective language that are - used to determine the regions R1 and R2. - :type vowels: unicode - :return: (r1,r2), the regions R1 and R2 for the respective word. - :rtype: tuple - :note: This helper method is invoked by the respective stem method of - the subclasses DutchStemmer, FinnishStemmer, - FrenchStemmer, GermanStemmer, ItalianStemmer, - PortugueseStemmer, RomanianStemmer, and SpanishStemmer. - It is not to be invoked directly! 
- :note: A detailed description of how to define R1 and R2 - can be found at http://snowball.tartarus.org/texts/r1r2.html - - """ - r1 = "" - r2 = "" - for i in range(1, len(word)): - if word[i] not in vowels and word[i - 1] in vowels: - r1 = word[i + 1 :] - break - - for i in range(1, len(r1)): - if r1[i] not in vowels and r1[i - 1] in vowels: - r2 = r1[i + 1 :] - break - - return (r1, r2) - - def _rv_standard(self, word, vowels): - """ - Return the standard interpretation of the string region RV. - - If the second letter is a consonant, RV is the region after the - next following vowel. If the first two letters are vowels, RV is - the region after the next following consonant. Otherwise, RV is - the region after the third letter. - - :param word: The word whose region RV is determined. - :type word: str or unicode - :param vowels: The vowels of the respective language that are - used to determine the region RV. - :type vowels: unicode - :return: the region RV for the respective word. - :rtype: unicode - :note: This helper method is invoked by the respective stem method of - the subclasses ItalianStemmer, PortugueseStemmer, - RomanianStemmer, and SpanishStemmer. It is not to be - invoked directly! - - """ - rv = "" - if len(word) >= 2: - if word[1] not in vowels: - for i in range(2, len(word)): - if word[i] in vowels: - rv = word[i + 1 :] - break - - elif word[0] in vowels and word[1] in vowels: - for i in range(2, len(word)): - if word[i] not in vowels: - rv = word[i + 1 :] - break - else: - rv = word[3:] - - return rv - - -class ArabicStemmer(_StandardStemmer): - """ - https://github.com/snowballstem/snowball/blob/master/algorithms/arabic/stem_Unicode.sbl (Original Algorithm) - The Snowball Arabic light Stemmer - Algorithm: - - - Assem Chelli - - Abdelkrim Aries - - Lakhdar Benzahia - - NLTK Version Author: - - - Lakhdar Benzahia - """ - - # Normalize_pre stes - __vocalization = re.compile( - r"[\u064b-\u064c-\u064d-\u064e-\u064f-\u0650-\u0651-\u0652]" - ) # ً، ٌ، ٍ، َ، ُ، ِ، ّ، ْ - - __kasheeda = re.compile(r"[\u0640]") # ـ tatweel/kasheeda - - __arabic_punctuation_marks = re.compile(r"[\u060C-\u061B-\u061F]") # ؛ ، ؟ - - # Normalize_post - __last_hamzat = ("\u0623", "\u0625", "\u0622", "\u0624", "\u0626") # أ، إ، آ، ؤ، ئ - - # normalize other hamza's - __initial_hamzat = re.compile(r"^[\u0622\u0623\u0625]") # أ، إ، آ - - __waw_hamza = re.compile(r"[\u0624]") # ؤ - - __yeh_hamza = re.compile(r"[\u0626]") # ئ - - __alefat = re.compile(r"[\u0623\u0622\u0625]") # أ، إ، آ - - # Checks - __checks1 = ( - "\u0643\u0627\u0644", - "\u0628\u0627\u0644", # بال، كال - "\u0627\u0644", - "\u0644\u0644", # لل، ال - ) - - __checks2 = ("\u0629", "\u0627\u062a") # ة # female plural ات - - # Suffixes - __suffix_noun_step1a = ( - "\u064a", - "\u0643", - "\u0647", # ي، ك، ه - "\u0646\u0627", - "\u0643\u0645", - "\u0647\u0627", - "\u0647\u0646", - "\u0647\u0645", # نا، كم، ها، هن، هم - "\u0643\u0645\u0627", - "\u0647\u0645\u0627", # كما، هما - ) - - __suffix_noun_step1b = "\u0646" # ن - - __suffix_noun_step2a = ("\u0627", "\u064a", "\u0648") # ا، ي، و - - __suffix_noun_step2b = "\u0627\u062a" # ات - - __suffix_noun_step2c1 = "\u062a" # ت - - __suffix_noun_step2c2 = "\u0629" # ة - - __suffix_noun_step3 = "\u064a" # ي - - __suffix_verb_step1 = ( - "\u0647", - "\u0643", # ه، ك - "\u0646\u064a", - "\u0646\u0627", - "\u0647\u0627", - "\u0647\u0645", # ني، نا، ها، هم - "\u0647\u0646", - "\u0643\u0645", - "\u0643\u0646", # هن، كم، كن - "\u0647\u0645\u0627", - "\u0643\u0645\u0627", - "\u0643\u0645\u0648", # هما، 
كما، كمو - ) - - __suffix_verb_step2a = ( - "\u062a", - "\u0627", - "\u0646", - "\u064a", # ت، ا، ن، ي - "\u0646\u0627", - "\u062a\u0627", - "\u062a\u0646", # نا، تا، تن Past - "\u0627\u0646", - "\u0648\u0646", - "\u064a\u0646", # ان، هن، ين Present - "\u062a\u0645\u0627", # تما - ) - - __suffix_verb_step2b = ("\u0648\u0627", "\u062a\u0645") # وا، تم - - __suffix_verb_step2c = ("\u0648", "\u062a\u0645\u0648") # و # تمو - - __suffix_all_alef_maqsura = "\u0649" # ى - - # Prefixes - __prefix_step1 = ( - "\u0623", # أ - "\u0623\u0623", - "\u0623\u0622", - "\u0623\u0624", - "\u0623\u0627", - "\u0623\u0625", # أأ، أآ، أؤ، أا، أإ - ) - - __prefix_step2a = ("\u0641\u0627\u0644", "\u0648\u0627\u0644") # فال، وال - - __prefix_step2b = ("\u0641", "\u0648") # ف، و - - __prefix_step3a_noun = ( - "\u0627\u0644", - "\u0644\u0644", # لل، ال - "\u0643\u0627\u0644", - "\u0628\u0627\u0644", # بال، كال - ) - - __prefix_step3b_noun = ( - "\u0628", - "\u0643", - "\u0644", # ب، ك، ل - "\u0628\u0628", - "\u0643\u0643", # بب، كك - ) - - __prefix_step3_verb = ( - "\u0633\u064a", - "\u0633\u062a", - "\u0633\u0646", - "\u0633\u0623", - ) # سي، ست، سن، سأ - - __prefix_step4_verb = ( - "\u064a\u0633\u062a", - "\u0646\u0633\u062a", - "\u062a\u0633\u062a", - ) # يست، نست، تست - - # Suffixes added due to Conjugation Verbs - __conjugation_suffix_verb_1 = ("\u0647", "\u0643") # ه، ك - - __conjugation_suffix_verb_2 = ( - "\u0646\u064a", - "\u0646\u0627", - "\u0647\u0627", # ني، نا، ها - "\u0647\u0645", - "\u0647\u0646", - "\u0643\u0645", # هم، هن، كم - "\u0643\u0646", # كن - ) - __conjugation_suffix_verb_3 = ( - "\u0647\u0645\u0627", - "\u0643\u0645\u0627", - "\u0643\u0645\u0648", - ) # هما، كما، كمو - - __conjugation_suffix_verb_4 = ("\u0627", "\u0646", "\u064a") # ا، ن، ي - - __conjugation_suffix_verb_past = ( - "\u0646\u0627", - "\u062a\u0627", - "\u062a\u0646", - ) # نا، تا، تن - - __conjugation_suffix_verb_present = ( - "\u0627\u0646", - "\u0648\u0646", - "\u064a\u0646", - ) # ان، ون، ين - - # Suffixes added due to derivation Names - __conjugation_suffix_noun_1 = ("\u064a", "\u0643", "\u0647") # ي، ك، ه - - __conjugation_suffix_noun_2 = ( - "\u0646\u0627", - "\u0643\u0645", # نا، كم - "\u0647\u0627", - "\u0647\u0646", - "\u0647\u0645", # ها، هن، هم - ) - - __conjugation_suffix_noun_3 = ( - "\u0643\u0645\u0627", - "\u0647\u0645\u0627", - ) # كما، هما - - # Prefixes added due to derivation Names - __prefixes1 = ("\u0648\u0627", "\u0641\u0627") # فا، وا - - __articles_3len = ("\u0643\u0627\u0644", "\u0628\u0627\u0644") # بال كال - - __articles_2len = ("\u0627\u0644", "\u0644\u0644") # ال لل - - # Prepositions letters - __prepositions1 = ("\u0643", "\u0644") # ك، ل - __prepositions2 = ("\u0628\u0628", "\u0643\u0643") # بب، كك - - is_verb = True - is_noun = True - is_defined = False - - suffixes_verb_step1_success = False - suffix_verb_step2a_success = False - suffix_verb_step2b_success = False - suffix_noun_step2c2_success = False - suffix_noun_step1a_success = False - suffix_noun_step2a_success = False - suffix_noun_step2b_success = False - suffixe_noun_step1b_success = False - prefix_step2a_success = False - prefix_step3a_noun_success = False - prefix_step3b_noun_success = False - - def __normalize_pre(self, token): - """ - :param token: string - :return: normalized token type string - """ - # strip diacritics - token = self.__vocalization.sub("", token) - # strip kasheeda - token = self.__kasheeda.sub("", token) - # strip punctuation marks - token = self.__arabic_punctuation_marks.sub("", token) - return token - - def 
__normalize_post(self, token): - # normalize last hamza - for hamza in self.__last_hamzat: - if token.endswith(hamza): - token = suffix_replace(token, hamza, "\u0621") - break - # normalize other hamzat - token = self.__initial_hamzat.sub("\u0627", token) - token = self.__waw_hamza.sub("\u0648", token) - token = self.__yeh_hamza.sub("\u064a", token) - token = self.__alefat.sub("\u0627", token) - return token - - def __checks_1(self, token): - for prefix in self.__checks1: - if token.startswith(prefix): - if prefix in self.__articles_3len and len(token) > 4: - self.is_noun = True - self.is_verb = False - self.is_defined = True - break - - if prefix in self.__articles_2len and len(token) > 3: - self.is_noun = True - self.is_verb = False - self.is_defined = True - break - - def __checks_2(self, token): - for suffix in self.__checks2: - if token.endswith(suffix): - if suffix == "\u0629" and len(token) > 2: - self.is_noun = True - self.is_verb = False - break - - if suffix == "\u0627\u062a" and len(token) > 3: - self.is_noun = True - self.is_verb = False - break - - def __Suffix_Verb_Step1(self, token): - for suffix in self.__suffix_verb_step1: - if token.endswith(suffix): - if suffix in self.__conjugation_suffix_verb_1 and len(token) >= 4: - token = token[:-1] - self.suffixes_verb_step1_success = True - break - - if suffix in self.__conjugation_suffix_verb_2 and len(token) >= 5: - token = token[:-2] - self.suffixes_verb_step1_success = True - break - - if suffix in self.__conjugation_suffix_verb_3 and len(token) >= 6: - token = token[:-3] - self.suffixes_verb_step1_success = True - break - return token - - def __Suffix_Verb_Step2a(self, token): - for suffix in self.__suffix_verb_step2a: - if token.endswith(suffix) and len(token) > 3: - if suffix == "\u062a" and len(token) >= 4: - token = token[:-1] - self.suffix_verb_step2a_success = True - break - - if suffix in self.__conjugation_suffix_verb_4 and len(token) >= 4: - token = token[:-1] - self.suffix_verb_step2a_success = True - break - - if suffix in self.__conjugation_suffix_verb_past and len(token) >= 5: - token = token[:-2] # past - self.suffix_verb_step2a_success = True - break - - if suffix in self.__conjugation_suffix_verb_present and len(token) > 5: - token = token[:-2] # present - self.suffix_verb_step2a_success = True - break - - if suffix == "\u062a\u0645\u0627" and len(token) >= 6: - token = token[:-3] - self.suffix_verb_step2a_success = True - break - return token - - def __Suffix_Verb_Step2c(self, token): - for suffix in self.__suffix_verb_step2c: - if token.endswith(suffix): - if suffix == "\u062a\u0645\u0648" and len(token) >= 6: - token = token[:-3] - break - - if suffix == "\u0648" and len(token) >= 4: - token = token[:-1] - break - return token - - def __Suffix_Verb_Step2b(self, token): - for suffix in self.__suffix_verb_step2b: - if token.endswith(suffix) and len(token) >= 5: - token = token[:-2] - self.suffix_verb_step2b_success = True - break - return token - - def __Suffix_Noun_Step2c2(self, token): - for suffix in self.__suffix_noun_step2c2: - if token.endswith(suffix) and len(token) >= 3: - token = token[:-1] - self.suffix_noun_step2c2_success = True - break - return token - - def __Suffix_Noun_Step1a(self, token): - for suffix in self.__suffix_noun_step1a: - if token.endswith(suffix): - if suffix in self.__conjugation_suffix_noun_1 and len(token) >= 4: - token = token[:-1] - self.suffix_noun_step1a_success = True - break - - if suffix in self.__conjugation_suffix_noun_2 and len(token) >= 5: - token = token[:-2] - 
self.suffix_noun_step1a_success = True - break - - if suffix in self.__conjugation_suffix_noun_3 and len(token) >= 6: - token = token[:-3] - self.suffix_noun_step1a_success = True - break - return token - - def __Suffix_Noun_Step2a(self, token): - for suffix in self.__suffix_noun_step2a: - if token.endswith(suffix) and len(token) > 4: - token = token[:-1] - self.suffix_noun_step2a_success = True - break - return token - - def __Suffix_Noun_Step2b(self, token): - for suffix in self.__suffix_noun_step2b: - if token.endswith(suffix) and len(token) >= 5: - token = token[:-2] - self.suffix_noun_step2b_success = True - break - return token - - def __Suffix_Noun_Step2c1(self, token): - for suffix in self.__suffix_noun_step2c1: - if token.endswith(suffix) and len(token) >= 4: - token = token[:-1] - break - return token - - def __Suffix_Noun_Step1b(self, token): - for suffix in self.__suffix_noun_step1b: - if token.endswith(suffix) and len(token) > 5: - token = token[:-1] - self.suffixe_noun_step1b_success = True - break - return token - - def __Suffix_Noun_Step3(self, token): - for suffix in self.__suffix_noun_step3: - if token.endswith(suffix) and len(token) >= 3: - token = token[:-1] # ya' nisbiya - break - return token - - def __Suffix_All_alef_maqsura(self, token): - for suffix in self.__suffix_all_alef_maqsura: - if token.endswith(suffix): - token = suffix_replace(token, suffix, "\u064a") - return token - - def __Prefix_Step1(self, token): - for prefix in self.__prefix_step1: - if token.startswith(prefix) and len(token) > 3: - if prefix == "\u0623\u0623": - token = prefix_replace(token, prefix, "\u0623") - break - - elif prefix == "\u0623\u0622": - token = prefix_replace(token, prefix, "\u0622") - break - - elif prefix == "\u0623\u0624": - token = prefix_replace(token, prefix, "\u0624") - break - - elif prefix == "\u0623\u0627": - token = prefix_replace(token, prefix, "\u0627") - break - - elif prefix == "\u0623\u0625": - token = prefix_replace(token, prefix, "\u0625") - break - return token - - def __Prefix_Step2a(self, token): - for prefix in self.__prefix_step2a: - if token.startswith(prefix) and len(token) > 5: - token = token[len(prefix) :] - self.prefix_step2a_success = True - break - return token - - def __Prefix_Step2b(self, token): - for prefix in self.__prefix_step2b: - if token.startswith(prefix) and len(token) > 3: - if token[:2] not in self.__prefixes1: - token = token[len(prefix) :] - break - return token - - def __Prefix_Step3a_Noun(self, token): - for prefix in self.__prefix_step3a_noun: - if token.startswith(prefix): - if prefix in self.__articles_2len and len(token) > 4: - token = token[len(prefix) :] - self.prefix_step3a_noun_success = True - break - if prefix in self.__articles_3len and len(token) > 5: - token = token[len(prefix) :] - break - return token - - def __Prefix_Step3b_Noun(self, token): - for prefix in self.__prefix_step3b_noun: - if token.startswith(prefix): - if len(token) > 3: - if prefix == "\u0628": - token = token[len(prefix) :] - self.prefix_step3b_noun_success = True - break - - if prefix in self.__prepositions2: - token = prefix_replace(token, prefix, prefix[1]) - self.prefix_step3b_noun_success = True - break - - if prefix in self.__prepositions1 and len(token) > 4: - token = token[len(prefix) :] # BUG: cause confusion - self.prefix_step3b_noun_success = True - break - return token - - def __Prefix_Step3_Verb(self, token): - for prefix in self.__prefix_step3_verb: - if token.startswith(prefix) and len(token) > 4: - token = prefix_replace(token, prefix, 
prefix[1]) - break - return token - - def __Prefix_Step4_Verb(self, token): - for prefix in self.__prefix_step4_verb: - if token.startswith(prefix) and len(token) > 4: - token = prefix_replace(token, prefix, "\u0627\u0633\u062a") - self.is_verb = True - self.is_noun = False - break - return token - - def stem(self, word): - """ - Stem an Arabic word and return the stemmed form. - - :param word: string - :return: string - """ - # set initial values - self.is_verb = True - self.is_noun = True - self.is_defined = False - - self.suffix_verb_step2a_success = False - self.suffix_verb_step2b_success = False - self.suffix_noun_step2c2_success = False - self.suffix_noun_step1a_success = False - self.suffix_noun_step2a_success = False - self.suffix_noun_step2b_success = False - self.suffixe_noun_step1b_success = False - self.prefix_step2a_success = False - self.prefix_step3a_noun_success = False - self.prefix_step3b_noun_success = False - - modified_word = word - # guess type and properties - # checks1 - self.__checks_1(modified_word) - # checks2 - self.__checks_2(modified_word) - # Pre_Normalization - modified_word = self.__normalize_pre(modified_word) - # Avoid stopwords - if modified_word in self.stopwords or len(modified_word) <= 2: - return modified_word - # Start stemming - if self.is_verb: - modified_word = self.__Suffix_Verb_Step1(modified_word) - if self.suffixes_verb_step1_success: - modified_word = self.__Suffix_Verb_Step2a(modified_word) - if not self.suffix_verb_step2a_success: - modified_word = self.__Suffix_Verb_Step2c(modified_word) - # or next TODO: How to deal with or next instruction - else: - modified_word = self.__Suffix_Verb_Step2b(modified_word) - if not self.suffix_verb_step2b_success: - modified_word = self.__Suffix_Verb_Step2a(modified_word) - if self.is_noun: - modified_word = self.__Suffix_Noun_Step2c2(modified_word) - if not self.suffix_noun_step2c2_success: - if not self.is_defined: - modified_word = self.__Suffix_Noun_Step1a(modified_word) - # if self.suffix_noun_step1a_success: - modified_word = self.__Suffix_Noun_Step2a(modified_word) - if not self.suffix_noun_step2a_success: - modified_word = self.__Suffix_Noun_Step2b(modified_word) - if ( - not self.suffix_noun_step2b_success - and not self.suffix_noun_step2a_success - ): - modified_word = self.__Suffix_Noun_Step2c1(modified_word) - # or next ? 
todo : how to deal with or next - else: - modified_word = self.__Suffix_Noun_Step1b(modified_word) - if self.suffixe_noun_step1b_success: - modified_word = self.__Suffix_Noun_Step2a(modified_word) - if not self.suffix_noun_step2a_success: - modified_word = self.__Suffix_Noun_Step2b(modified_word) - if ( - not self.suffix_noun_step2b_success - and not self.suffix_noun_step2a_success - ): - modified_word = self.__Suffix_Noun_Step2c1(modified_word) - else: - if not self.is_defined: - modified_word = self.__Suffix_Noun_Step2a(modified_word) - modified_word = self.__Suffix_Noun_Step2b(modified_word) - modified_word = self.__Suffix_Noun_Step3(modified_word) - if not self.is_noun and self.is_verb: - modified_word = self.__Suffix_All_alef_maqsura(modified_word) - - # prefixes - modified_word = self.__Prefix_Step1(modified_word) - modified_word = self.__Prefix_Step2a(modified_word) - if not self.prefix_step2a_success: - modified_word = self.__Prefix_Step2b(modified_word) - modified_word = self.__Prefix_Step3a_Noun(modified_word) - if not self.prefix_step3a_noun_success and self.is_noun: - modified_word = self.__Prefix_Step3b_Noun(modified_word) - else: - if not self.prefix_step3b_noun_success and self.is_verb: - modified_word = self.__Prefix_Step3_Verb(modified_word) - modified_word = self.__Prefix_Step4_Verb(modified_word) - - # post normalization stemming - modified_word = self.__normalize_post(modified_word) - stemmed_word = modified_word - return stemmed_word - - -class DanishStemmer(_ScandinavianStemmer): - - """ - The Danish Snowball stemmer. - - :cvar __vowels: The Danish vowels. - :type __vowels: unicode - :cvar __consonants: The Danish consonants. - :type __consonants: unicode - :cvar __double_consonants: The Danish double consonants. - :type __double_consonants: tuple - :cvar __s_ending: Letters that may directly appear before a word final 's'. - :type __s_ending: unicode - :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. - :type __step1_suffixes: tuple - :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. - :type __step2_suffixes: tuple - :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. - :type __step3_suffixes: tuple - :note: A detailed description of the Danish - stemming algorithm can be found under - http://snowball.tartarus.org/algorithms/danish/stemmer.html - - """ - - # The language's vowels and other important characters are defined. - __vowels = "aeiouy\xE6\xE5\xF8" - __consonants = "bcdfghjklmnpqrstvwxz" - __double_consonants = ( - "bb", - "cc", - "dd", - "ff", - "gg", - "hh", - "jj", - "kk", - "ll", - "mm", - "nn", - "pp", - "qq", - "rr", - "ss", - "tt", - "vv", - "ww", - "xx", - "zz", - ) - __s_ending = "abcdfghjklmnoprtvyz\xE5" - - # The different suffixes, divided into the algorithm's steps - # and organized by length, are listed in tuples. - __step1_suffixes = ( - "erendes", - "erende", - "hedens", - "ethed", - "erede", - "heden", - "heder", - "endes", - "ernes", - "erens", - "erets", - "ered", - "ende", - "erne", - "eren", - "erer", - "heds", - "enes", - "eres", - "eret", - "hed", - "ene", - "ere", - "ens", - "ers", - "ets", - "en", - "er", - "es", - "et", - "e", - "s", - ) - __step2_suffixes = ("gd", "dt", "gt", "kt") - __step3_suffixes = ("elig", "l\xF8st", "lig", "els", "ig") - - def stem(self, word): - """ - Stem a Danish word and return the stemmed form. - - :param word: The word that is stemmed. - :type word: str or unicode - :return: The stemmed form. 
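# A hedged sketch of the Danish step-1 's'-deletion guard used in the stem()
# method below: a trailing 's' is removed only when the preceding letter is in
# __s_ending. (In the full algorithm the longer suffixes in __step1_suffixes,
# e.g. 'es', are tried first and matching is done against R1; this sketch shows
# only the guard itself.)
_s_ending = "abcdfghjklmnoprtvyz\xE5"

def _danish_strip_s(word):
    # Drop a final 's' only if the character before it is a valid s-ending.
    if word.endswith("s") and len(word) > 1 and word[-2] in _s_ending:
        return word[:-1]
    return word

print(_danish_strip_s("karens"))  # 'karen' -- 'n' is a valid s-ending
print(_danish_strip_s("huses"))   # unchanged by this guard: 'e' is not in _s_ending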
- :rtype: unicode - - """ - # Every word is put into lower case for normalization. - word = word.lower() - - if word in self.stopwords: - return word - - # After this, the required regions are generated - # by the respective helper method. - r1 = self._r1_scandinavian(word, self.__vowels) - - # Then the actual stemming process starts. - # Every new step is explicitly indicated - # according to the descriptions on the Snowball website. - - # STEP 1 - for suffix in self.__step1_suffixes: - if r1.endswith(suffix): - if suffix == "s": - if word[-2] in self.__s_ending: - word = word[:-1] - r1 = r1[:-1] - else: - word = word[: -len(suffix)] - r1 = r1[: -len(suffix)] - break - - # STEP 2 - for suffix in self.__step2_suffixes: - if r1.endswith(suffix): - word = word[:-1] - r1 = r1[:-1] - break - - # STEP 3 - if r1.endswith("igst"): - word = word[:-2] - r1 = r1[:-2] - - for suffix in self.__step3_suffixes: - if r1.endswith(suffix): - if suffix == "l\xF8st": - word = word[:-1] - r1 = r1[:-1] - else: - word = word[: -len(suffix)] - r1 = r1[: -len(suffix)] - - if r1.endswith(self.__step2_suffixes): - word = word[:-1] - r1 = r1[:-1] - break - - # STEP 4: Undouble - for double_cons in self.__double_consonants: - if word.endswith(double_cons) and len(word) > 3: - word = word[:-1] - break - - return word - - -class DutchStemmer(_StandardStemmer): - - """ - The Dutch Snowball stemmer. - - :cvar __vowels: The Dutch vowels. - :type __vowels: unicode - :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. - :type __step1_suffixes: tuple - :cvar __step3b_suffixes: Suffixes to be deleted in step 3b of the algorithm. - :type __step3b_suffixes: tuple - :note: A detailed description of the Dutch - stemming algorithm can be found under - http://snowball.tartarus.org/algorithms/dutch/stemmer.html - - """ - - __vowels = "aeiouy\xE8" - __step1_suffixes = ("heden", "ene", "en", "se", "s") - __step3b_suffixes = ("baar", "lijk", "bar", "end", "ing", "ig") - - def stem(self, word): - """ - Stem a Dutch word and return the stemmed form. - - :param word: The word that is stemmed. - :type word: str or unicode - :return: The stemmed form. - :rtype: unicode - - """ - word = word.lower() - - if word in self.stopwords: - return word - - step2_success = False - - # Vowel accents are removed. - word = ( - word.replace("\xE4", "a") - .replace("\xE1", "a") - .replace("\xEB", "e") - .replace("\xE9", "e") - .replace("\xED", "i") - .replace("\xEF", "i") - .replace("\xF6", "o") - .replace("\xF3", "o") - .replace("\xFC", "u") - .replace("\xFA", "u") - ) - - # An initial 'y', a 'y' after a vowel, - # and an 'i' between self.__vowels is put into upper case. - # As from now these are treated as consonants. - if word.startswith("y"): - word = "".join(("Y", word[1:])) - - for i in range(1, len(word)): - if word[i - 1] in self.__vowels and word[i] == "y": - word = "".join((word[:i], "Y", word[i + 1 :])) - - for i in range(1, len(word) - 1): - if ( - word[i - 1] in self.__vowels - and word[i] == "i" - and word[i + 1] in self.__vowels - ): - word = "".join((word[:i], "I", word[i + 1 :])) - - r1, r2 = self._r1r2_standard(word, self.__vowels) - - # R1 is adjusted so that the region before it - # contains at least 3 letters. 
- for i in range(1, len(word)): - if word[i] not in self.__vowels and word[i - 1] in self.__vowels: - if 3 > len(word[: i + 1]) > 0: - r1 = word[3:] - elif len(word[: i + 1]) == 0: - return word - break - - # STEP 1 - for suffix in self.__step1_suffixes: - if r1.endswith(suffix): - if suffix == "heden": - word = suffix_replace(word, suffix, "heid") - r1 = suffix_replace(r1, suffix, "heid") - if r2.endswith("heden"): - r2 = suffix_replace(r2, suffix, "heid") - - elif ( - suffix in ("ene", "en") - and not word.endswith("heden") - and word[-len(suffix) - 1] not in self.__vowels - and word[-len(suffix) - 3 : -len(suffix)] != "gem" - ): - word = word[: -len(suffix)] - r1 = r1[: -len(suffix)] - r2 = r2[: -len(suffix)] - if word.endswith(("kk", "dd", "tt")): - word = word[:-1] - r1 = r1[:-1] - r2 = r2[:-1] - - elif ( - suffix in ("se", "s") - and word[-len(suffix) - 1] not in self.__vowels - and word[-len(suffix) - 1] != "j" - ): - word = word[: -len(suffix)] - r1 = r1[: -len(suffix)] - r2 = r2[: -len(suffix)] - break - - # STEP 2 - if r1.endswith("e") and word[-2] not in self.__vowels: - step2_success = True - word = word[:-1] - r1 = r1[:-1] - r2 = r2[:-1] - - if word.endswith(("kk", "dd", "tt")): - word = word[:-1] - r1 = r1[:-1] - r2 = r2[:-1] - - # STEP 3a - if r2.endswith("heid") and word[-5] != "c": - word = word[:-4] - r1 = r1[:-4] - r2 = r2[:-4] - - if ( - r1.endswith("en") - and word[-3] not in self.__vowels - and word[-5:-2] != "gem" - ): - word = word[:-2] - r1 = r1[:-2] - r2 = r2[:-2] - - if word.endswith(("kk", "dd", "tt")): - word = word[:-1] - r1 = r1[:-1] - r2 = r2[:-1] - - # STEP 3b: Derivational suffixes - for suffix in self.__step3b_suffixes: - if r2.endswith(suffix): - if suffix in ("end", "ing"): - word = word[:-3] - r2 = r2[:-3] - - if r2.endswith("ig") and word[-3] != "e": - word = word[:-2] - else: - if word.endswith(("kk", "dd", "tt")): - word = word[:-1] - - elif suffix == "ig" and word[-3] != "e": - word = word[:-2] - - elif suffix == "lijk": - word = word[:-4] - r1 = r1[:-4] - - if r1.endswith("e") and word[-2] not in self.__vowels: - word = word[:-1] - if word.endswith(("kk", "dd", "tt")): - word = word[:-1] - - elif suffix == "baar": - word = word[:-4] - - elif suffix == "bar" and step2_success: - word = word[:-3] - break - - # STEP 4: Undouble vowel - if len(word) >= 4: - if word[-1] not in self.__vowels and word[-1] != "I": - if word[-3:-1] in ("aa", "ee", "oo", "uu"): - if word[-4] not in self.__vowels: - word = "".join((word[:-3], word[-3], word[-1])) - - # All occurrences of 'I' and 'Y' are put back into lower case. - word = word.replace("I", "i").replace("Y", "y") - - return word - - -class EnglishStemmer(_StandardStemmer): - - """ - The English Snowball stemmer. - - :cvar __vowels: The English vowels. - :type __vowels: unicode - :cvar __double_consonants: The English double consonants. - :type __double_consonants: tuple - :cvar __li_ending: Letters that may directly appear before a word final 'li'. - :type __li_ending: unicode - :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm. - :type __step0_suffixes: tuple - :cvar __step1a_suffixes: Suffixes to be deleted in step 1a of the algorithm. - :type __step1a_suffixes: tuple - :cvar __step1b_suffixes: Suffixes to be deleted in step 1b of the algorithm. - :type __step1b_suffixes: tuple - :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. - :type __step2_suffixes: tuple - :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. 
- :type __step3_suffixes: tuple - :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. - :type __step4_suffixes: tuple - :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm. - :type __step5_suffixes: tuple - :cvar __special_words: A dictionary containing words - which have to be stemmed specially. - :type __special_words: dict - :note: A detailed description of the English - stemming algorithm can be found under - http://snowball.tartarus.org/algorithms/english/stemmer.html - """ - - __vowels = "aeiouy" - __double_consonants = ("bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt") - __li_ending = "cdeghkmnrt" - __step0_suffixes = ("'s'", "'s", "'") - __step1a_suffixes = ("sses", "ied", "ies", "us", "ss", "s") - __step1b_suffixes = ("eedly", "ingly", "edly", "eed", "ing", "ed") - __step2_suffixes = ( - "ization", - "ational", - "fulness", - "ousness", - "iveness", - "tional", - "biliti", - "lessli", - "entli", - "ation", - "alism", - "aliti", - "ousli", - "iviti", - "fulli", - "enci", - "anci", - "abli", - "izer", - "ator", - "alli", - "bli", - "ogi", - "li", - ) - __step3_suffixes = ( - "ational", - "tional", - "alize", - "icate", - "iciti", - "ative", - "ical", - "ness", - "ful", - ) - __step4_suffixes = ( - "ement", - "ance", - "ence", - "able", - "ible", - "ment", - "ant", - "ent", - "ism", - "ate", - "iti", - "ous", - "ive", - "ize", - "ion", - "al", - "er", - "ic", - ) - __step5_suffixes = ("e", "l") - __special_words = { - "skis": "ski", - "skies": "sky", - "dying": "die", - "lying": "lie", - "tying": "tie", - "idly": "idl", - "gently": "gentl", - "ugly": "ugli", - "early": "earli", - "only": "onli", - "singly": "singl", - "sky": "sky", - "news": "news", - "howe": "howe", - "atlas": "atlas", - "cosmos": "cosmos", - "bias": "bias", - "andes": "andes", - "inning": "inning", - "innings": "inning", - "outing": "outing", - "outings": "outing", - "canning": "canning", - "cannings": "canning", - "herring": "herring", - "herrings": "herring", - "earring": "earring", - "earrings": "earring", - "proceed": "proceed", - "proceeds": "proceed", - "proceeded": "proceed", - "proceeding": "proceed", - "exceed": "exceed", - "exceeds": "exceed", - "exceeded": "exceed", - "exceeding": "exceed", - "succeed": "succeed", - "succeeds": "succeed", - "succeeded": "succeed", - "succeeding": "succeed", - } - - def stem(self, word): - - """ - Stem an English word and return the stemmed form. - - :param word: The word that is stemmed. - :type word: str or unicode - :return: The stemmed form. 
- :rtype: unicode - - """ - word = word.lower() - - if word in self.stopwords or len(word) <= 2: - return word - - elif word in self.__special_words: - return self.__special_words[word] - - # Map the different apostrophe characters to a single consistent one - word = ( - word.replace("\u2019", "\x27") - .replace("\u2018", "\x27") - .replace("\u201B", "\x27") - ) - - if word.startswith("\x27"): - word = word[1:] - - if word.startswith("y"): - word = "".join(("Y", word[1:])) - - for i in range(1, len(word)): - if word[i - 1] in self.__vowels and word[i] == "y": - word = "".join((word[:i], "Y", word[i + 1 :])) - - step1a_vowel_found = False - step1b_vowel_found = False - - r1 = "" - r2 = "" - - if word.startswith(("gener", "commun", "arsen")): - if word.startswith(("gener", "arsen")): - r1 = word[5:] - else: - r1 = word[6:] - - for i in range(1, len(r1)): - if r1[i] not in self.__vowels and r1[i - 1] in self.__vowels: - r2 = r1[i + 1 :] - break - else: - r1, r2 = self._r1r2_standard(word, self.__vowels) - - # STEP 0 - for suffix in self.__step0_suffixes: - if word.endswith(suffix): - word = word[: -len(suffix)] - r1 = r1[: -len(suffix)] - r2 = r2[: -len(suffix)] - break - - # STEP 1a - for suffix in self.__step1a_suffixes: - if word.endswith(suffix): - - if suffix == "sses": - word = word[:-2] - r1 = r1[:-2] - r2 = r2[:-2] - - elif suffix in ("ied", "ies"): - if len(word[: -len(suffix)]) > 1: - word = word[:-2] - r1 = r1[:-2] - r2 = r2[:-2] - else: - word = word[:-1] - r1 = r1[:-1] - r2 = r2[:-1] - - elif suffix == "s": - for letter in word[:-2]: - if letter in self.__vowels: - step1a_vowel_found = True - break - - if step1a_vowel_found: - word = word[:-1] - r1 = r1[:-1] - r2 = r2[:-1] - break - - # STEP 1b - for suffix in self.__step1b_suffixes: - if word.endswith(suffix): - if suffix in ("eed", "eedly"): - - if r1.endswith(suffix): - word = suffix_replace(word, suffix, "ee") - - if len(r1) >= len(suffix): - r1 = suffix_replace(r1, suffix, "ee") - else: - r1 = "" - - if len(r2) >= len(suffix): - r2 = suffix_replace(r2, suffix, "ee") - else: - r2 = "" - else: - for letter in word[: -len(suffix)]: - if letter in self.__vowels: - step1b_vowel_found = True - break - - if step1b_vowel_found: - word = word[: -len(suffix)] - r1 = r1[: -len(suffix)] - r2 = r2[: -len(suffix)] - - if word.endswith(("at", "bl", "iz")): - word = "".join((word, "e")) - r1 = "".join((r1, "e")) - - if len(word) > 5 or len(r1) >= 3: - r2 = "".join((r2, "e")) - - elif word.endswith(self.__double_consonants): - word = word[:-1] - r1 = r1[:-1] - r2 = r2[:-1] - - elif ( - r1 == "" - and len(word) >= 3 - and word[-1] not in self.__vowels - and word[-1] not in "wxY" - and word[-2] in self.__vowels - and word[-3] not in self.__vowels - ) or ( - r1 == "" - and len(word) == 2 - and word[0] in self.__vowels - and word[1] not in self.__vowels - ): - - word = "".join((word, "e")) - - if len(r1) > 0: - r1 = "".join((r1, "e")) - - if len(r2) > 0: - r2 = "".join((r2, "e")) - break - - # STEP 1c - if len(word) > 2 and word[-1] in "yY" and word[-2] not in self.__vowels: - word = "".join((word[:-1], "i")) - if len(r1) >= 1: - r1 = "".join((r1[:-1], "i")) - else: - r1 = "" - - if len(r2) >= 1: - r2 = "".join((r2[:-1], "i")) - else: - r2 = "" - - # STEP 2 - for suffix in self.__step2_suffixes: - if word.endswith(suffix): - if r1.endswith(suffix): - if suffix == "tional": - word = word[:-2] - r1 = r1[:-2] - r2 = r2[:-2] - - elif suffix in ("enci", "anci", "abli"): - word = "".join((word[:-1], "e")) - - if len(r1) >= 1: - r1 = "".join((r1[:-1], 
"e")) - else: - r1 = "" - - if len(r2) >= 1: - r2 = "".join((r2[:-1], "e")) - else: - r2 = "" - - elif suffix == "entli": - word = word[:-2] - r1 = r1[:-2] - r2 = r2[:-2] - - elif suffix in ("izer", "ization"): - word = suffix_replace(word, suffix, "ize") - - if len(r1) >= len(suffix): - r1 = suffix_replace(r1, suffix, "ize") - else: - r1 = "" - - if len(r2) >= len(suffix): - r2 = suffix_replace(r2, suffix, "ize") - else: - r2 = "" - - elif suffix in ("ational", "ation", "ator"): - word = suffix_replace(word, suffix, "ate") - - if len(r1) >= len(suffix): - r1 = suffix_replace(r1, suffix, "ate") - else: - r1 = "" - - if len(r2) >= len(suffix): - r2 = suffix_replace(r2, suffix, "ate") - else: - r2 = "e" - - elif suffix in ("alism", "aliti", "alli"): - word = suffix_replace(word, suffix, "al") - - if len(r1) >= len(suffix): - r1 = suffix_replace(r1, suffix, "al") - else: - r1 = "" - - if len(r2) >= len(suffix): - r2 = suffix_replace(r2, suffix, "al") - else: - r2 = "" - - elif suffix == "fulness": - word = word[:-4] - r1 = r1[:-4] - r2 = r2[:-4] - - elif suffix in ("ousli", "ousness"): - word = suffix_replace(word, suffix, "ous") - - if len(r1) >= len(suffix): - r1 = suffix_replace(r1, suffix, "ous") - else: - r1 = "" - - if len(r2) >= len(suffix): - r2 = suffix_replace(r2, suffix, "ous") - else: - r2 = "" - - elif suffix in ("iveness", "iviti"): - word = suffix_replace(word, suffix, "ive") - - if len(r1) >= len(suffix): - r1 = suffix_replace(r1, suffix, "ive") - else: - r1 = "" - - if len(r2) >= len(suffix): - r2 = suffix_replace(r2, suffix, "ive") - else: - r2 = "e" - - elif suffix in ("biliti", "bli"): - word = suffix_replace(word, suffix, "ble") - - if len(r1) >= len(suffix): - r1 = suffix_replace(r1, suffix, "ble") - else: - r1 = "" - - if len(r2) >= len(suffix): - r2 = suffix_replace(r2, suffix, "ble") - else: - r2 = "" - - elif suffix == "ogi" and word[-4] == "l": - word = word[:-1] - r1 = r1[:-1] - r2 = r2[:-1] - - elif suffix in ("fulli", "lessli"): - word = word[:-2] - r1 = r1[:-2] - r2 = r2[:-2] - - elif suffix == "li" and word[-3] in self.__li_ending: - word = word[:-2] - r1 = r1[:-2] - r2 = r2[:-2] - break - - # STEP 3 - for suffix in self.__step3_suffixes: - if word.endswith(suffix): - if r1.endswith(suffix): - if suffix == "tional": - word = word[:-2] - r1 = r1[:-2] - r2 = r2[:-2] - - elif suffix == "ational": - word = suffix_replace(word, suffix, "ate") - - if len(r1) >= len(suffix): - r1 = suffix_replace(r1, suffix, "ate") - else: - r1 = "" - - if len(r2) >= len(suffix): - r2 = suffix_replace(r2, suffix, "ate") - else: - r2 = "" - - elif suffix == "alize": - word = word[:-3] - r1 = r1[:-3] - r2 = r2[:-3] - - elif suffix in ("icate", "iciti", "ical"): - word = suffix_replace(word, suffix, "ic") - - if len(r1) >= len(suffix): - r1 = suffix_replace(r1, suffix, "ic") - else: - r1 = "" - - if len(r2) >= len(suffix): - r2 = suffix_replace(r2, suffix, "ic") - else: - r2 = "" - - elif suffix in ("ful", "ness"): - word = word[: -len(suffix)] - r1 = r1[: -len(suffix)] - r2 = r2[: -len(suffix)] - - elif suffix == "ative" and r2.endswith(suffix): - word = word[:-5] - r1 = r1[:-5] - r2 = r2[:-5] - break - - # STEP 4 - for suffix in self.__step4_suffixes: - if word.endswith(suffix): - if r2.endswith(suffix): - if suffix == "ion": - if word[-4] in "st": - word = word[:-3] - r1 = r1[:-3] - r2 = r2[:-3] - else: - word = word[: -len(suffix)] - r1 = r1[: -len(suffix)] - r2 = r2[: -len(suffix)] - break - - # STEP 5 - if r2.endswith("l") and word[-2] == "l": - word = word[:-1] - elif 
r2.endswith("e"): - word = word[:-1] - elif r1.endswith("e"): - if len(word) >= 4 and ( - word[-2] in self.__vowels - or word[-2] in "wxY" - or word[-3] not in self.__vowels - or word[-4] in self.__vowels - ): - word = word[:-1] - - word = word.replace("Y", "y") - - return word - - -class FinnishStemmer(_StandardStemmer): - - """ - The Finnish Snowball stemmer. - - :cvar __vowels: The Finnish vowels. - :type __vowels: unicode - :cvar __restricted_vowels: A subset of the Finnish vowels. - :type __restricted_vowels: unicode - :cvar __long_vowels: The Finnish vowels in their long forms. - :type __long_vowels: tuple - :cvar __consonants: The Finnish consonants. - :type __consonants: unicode - :cvar __double_consonants: The Finnish double consonants. - :type __double_consonants: tuple - :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. - :type __step1_suffixes: tuple - :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. - :type __step2_suffixes: tuple - :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. - :type __step3_suffixes: tuple - :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. - :type __step4_suffixes: tuple - :note: A detailed description of the Finnish - stemming algorithm can be found under - http://snowball.tartarus.org/algorithms/finnish/stemmer.html - """ - - __vowels = "aeiouy\xE4\xF6" - __restricted_vowels = "aeiou\xE4\xF6" - __long_vowels = ("aa", "ee", "ii", "oo", "uu", "\xE4\xE4", "\xF6\xF6") - __consonants = "bcdfghjklmnpqrstvwxz" - __double_consonants = ( - "bb", - "cc", - "dd", - "ff", - "gg", - "hh", - "jj", - "kk", - "ll", - "mm", - "nn", - "pp", - "qq", - "rr", - "ss", - "tt", - "vv", - "ww", - "xx", - "zz", - ) - __step1_suffixes = ( - "kaan", - "k\xE4\xE4n", - "sti", - "kin", - "han", - "h\xE4n", - "ko", - "k\xF6", - "pa", - "p\xE4", - ) - __step2_suffixes = ("nsa", "ns\xE4", "mme", "nne", "si", "ni", "an", "\xE4n", "en") - __step3_suffixes = ( - "siin", - "tten", - "seen", - "han", - "hen", - "hin", - "hon", - "h\xE4n", - "h\xF6n", - "den", - "tta", - "tt\xE4", - "ssa", - "ss\xE4", - "sta", - "st\xE4", - "lla", - "ll\xE4", - "lta", - "lt\xE4", - "lle", - "ksi", - "ine", - "ta", - "t\xE4", - "na", - "n\xE4", - "a", - "\xE4", - "n", - ) - __step4_suffixes = ( - "impi", - "impa", - "imp\xE4", - "immi", - "imma", - "imm\xE4", - "mpi", - "mpa", - "mp\xE4", - "mmi", - "mma", - "mm\xE4", - "eja", - "ej\xE4", - ) - - def stem(self, word): - """ - Stem a Finnish word and return the stemmed form. - - :param word: The word that is stemmed. - :type word: str or unicode - :return: The stemmed form. - :rtype: unicode - - """ - word = word.lower() - - if word in self.stopwords: - return word - - step3_success = False - - r1, r2 = self._r1r2_standard(word, self.__vowels) - - # STEP 1: Particles etc. 
- for suffix in self.__step1_suffixes: - if r1.endswith(suffix): - if suffix == "sti": - if suffix in r2: - word = word[:-3] - r1 = r1[:-3] - r2 = r2[:-3] - else: - if word[-len(suffix) - 1] in "ntaeiouy\xE4\xF6": - word = word[: -len(suffix)] - r1 = r1[: -len(suffix)] - r2 = r2[: -len(suffix)] - break - - # STEP 2: Possessives - for suffix in self.__step2_suffixes: - if r1.endswith(suffix): - if suffix == "si": - if word[-3] != "k": - word = word[:-2] - r1 = r1[:-2] - r2 = r2[:-2] - - elif suffix == "ni": - word = word[:-2] - r1 = r1[:-2] - r2 = r2[:-2] - if word.endswith("kse"): - word = suffix_replace(word, "kse", "ksi") - - if r1.endswith("kse"): - r1 = suffix_replace(r1, "kse", "ksi") - - if r2.endswith("kse"): - r2 = suffix_replace(r2, "kse", "ksi") - - elif suffix == "an": - if word[-4:-2] in ("ta", "na") or word[-5:-2] in ( - "ssa", - "sta", - "lla", - "lta", - ): - word = word[:-2] - r1 = r1[:-2] - r2 = r2[:-2] - - elif suffix == "\xE4n": - if word[-4:-2] in ("t\xE4", "n\xE4") or word[-5:-2] in ( - "ss\xE4", - "st\xE4", - "ll\xE4", - "lt\xE4", - ): - word = word[:-2] - r1 = r1[:-2] - r2 = r2[:-2] - - elif suffix == "en": - if word[-5:-2] in ("lle", "ine"): - word = word[:-2] - r1 = r1[:-2] - r2 = r2[:-2] - else: - word = word[:-3] - r1 = r1[:-3] - r2 = r2[:-3] - break - - # STEP 3: Cases - for suffix in self.__step3_suffixes: - if r1.endswith(suffix): - if suffix in ("han", "hen", "hin", "hon", "h\xE4n", "h\xF6n"): - if ( - (suffix == "han" and word[-4] == "a") - or (suffix == "hen" and word[-4] == "e") - or (suffix == "hin" and word[-4] == "i") - or (suffix == "hon" and word[-4] == "o") - or (suffix == "h\xE4n" and word[-4] == "\xE4") - or (suffix == "h\xF6n" and word[-4] == "\xF6") - ): - word = word[:-3] - r1 = r1[:-3] - r2 = r2[:-3] - step3_success = True - - elif suffix in ("siin", "den", "tten"): - if ( - word[-len(suffix) - 1] == "i" - and word[-len(suffix) - 2] in self.__restricted_vowels - ): - word = word[: -len(suffix)] - r1 = r1[: -len(suffix)] - r2 = r2[: -len(suffix)] - step3_success = True - else: - continue - - elif suffix == "seen": - if word[-6:-4] in self.__long_vowels: - word = word[:-4] - r1 = r1[:-4] - r2 = r2[:-4] - step3_success = True - else: - continue - - elif suffix in ("a", "\xE4"): - if word[-2] in self.__vowels and word[-3] in self.__consonants: - word = word[:-1] - r1 = r1[:-1] - r2 = r2[:-1] - step3_success = True - - elif suffix in ("tta", "tt\xE4"): - if word[-4] == "e": - word = word[:-3] - r1 = r1[:-3] - r2 = r2[:-3] - step3_success = True - - elif suffix == "n": - word = word[:-1] - r1 = r1[:-1] - r2 = r2[:-1] - step3_success = True - - if word[-2:] == "ie" or word[-2:] in self.__long_vowels: - word = word[:-1] - r1 = r1[:-1] - r2 = r2[:-1] - else: - word = word[: -len(suffix)] - r1 = r1[: -len(suffix)] - r2 = r2[: -len(suffix)] - step3_success = True - break - - # STEP 4: Other endings - for suffix in self.__step4_suffixes: - if r2.endswith(suffix): - if suffix in ("mpi", "mpa", "mp\xE4", "mmi", "mma", "mm\xE4"): - if word[-5:-3] != "po": - word = word[:-3] - r1 = r1[:-3] - r2 = r2[:-3] - else: - word = word[: -len(suffix)] - r1 = r1[: -len(suffix)] - r2 = r2[: -len(suffix)] - break - - # STEP 5: Plurals - if step3_success and len(r1) >= 1 and r1[-1] in "ij": - word = word[:-1] - r1 = r1[:-1] - - elif ( - not step3_success - and len(r1) >= 2 - and r1[-1] == "t" - and r1[-2] in self.__vowels - ): - word = word[:-1] - r1 = r1[:-1] - r2 = r2[:-1] - if r2.endswith("imma"): - word = word[:-4] - r1 = r1[:-4] - elif r2.endswith("mma") and r2[-5:-3] 
!= "po": - word = word[:-3] - r1 = r1[:-3] - - # STEP 6: Tidying up - if r1[-2:] in self.__long_vowels: - word = word[:-1] - r1 = r1[:-1] - - if len(r1) >= 2 and r1[-2] in self.__consonants and r1[-1] in "a\xE4ei": - word = word[:-1] - r1 = r1[:-1] - - if r1.endswith(("oj", "uj")): - word = word[:-1] - r1 = r1[:-1] - - if r1.endswith("jo"): - word = word[:-1] - r1 = r1[:-1] - - # If the word ends with a double consonant - # followed by zero or more vowels, the last consonant is removed. - for i in range(1, len(word)): - if word[-i] in self.__vowels: - continue - else: - if i == 1: - if word[-i - 1 :] in self.__double_consonants: - word = word[:-1] - else: - if word[-i - 1 : -i + 1] in self.__double_consonants: - word = "".join((word[:-i], word[-i + 1 :])) - break - - return word - - -class FrenchStemmer(_StandardStemmer): - - """ - The French Snowball stemmer. - - :cvar __vowels: The French vowels. - :type __vowels: unicode - :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. - :type __step1_suffixes: tuple - :cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm. - :type __step2a_suffixes: tuple - :cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm. - :type __step2b_suffixes: tuple - :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. - :type __step4_suffixes: tuple - :note: A detailed description of the French - stemming algorithm can be found under - http://snowball.tartarus.org/algorithms/french/stemmer.html - """ - - __vowels = "aeiouy\xE2\xE0\xEB\xE9\xEA\xE8\xEF\xEE\xF4\xFB\xF9" - __step1_suffixes = ( - "issements", - "issement", - "atrices", - "atrice", - "ateurs", - "ations", - "logies", - "usions", - "utions", - "ements", - "amment", - "emment", - "ances", - "iqUes", - "ismes", - "ables", - "istes", - "ateur", - "ation", - "logie", - "usion", - "ution", - "ences", - "ement", - "euses", - "ments", - "ance", - "iqUe", - "isme", - "able", - "iste", - "ence", - "it\xE9s", - "ives", - "eaux", - "euse", - "ment", - "eux", - "it\xE9", - "ive", - "ifs", - "aux", - "if", - ) - __step2a_suffixes = ( - "issaIent", - "issantes", - "iraIent", - "issante", - "issants", - "issions", - "irions", - "issais", - "issait", - "issant", - "issent", - "issiez", - "issons", - "irais", - "irait", - "irent", - "iriez", - "irons", - "iront", - "isses", - "issez", - "\xEEmes", - "\xEEtes", - "irai", - "iras", - "irez", - "isse", - "ies", - "ira", - "\xEEt", - "ie", - "ir", - "is", - "it", - "i", - ) - __step2b_suffixes = ( - "eraIent", - "assions", - "erions", - "assent", - "assiez", - "\xE8rent", - "erais", - "erait", - "eriez", - "erons", - "eront", - "aIent", - "antes", - "asses", - "ions", - "erai", - "eras", - "erez", - "\xE2mes", - "\xE2tes", - "ante", - "ants", - "asse", - "\xE9es", - "era", - "iez", - "ais", - "ait", - "ant", - "\xE9e", - "\xE9s", - "er", - "ez", - "\xE2t", - "ai", - "as", - "\xE9", - "a", - ) - __step4_suffixes = ("i\xE8re", "I\xE8re", "ion", "ier", "Ier", "e", "\xEB") - - def stem(self, word): - """ - Stem a French word and return the stemmed form. - - :param word: The word that is stemmed. - :type word: str or unicode - :return: The stemmed form. - :rtype: unicode - - """ - word = word.lower() - - if word in self.stopwords: - return word - - step1_success = False - rv_ending_found = False - step2a_success = False - step2b_success = False - - # Every occurrence of 'u' after 'q' is put into upper case. 
- for i in range(1, len(word)): - if word[i - 1] == "q" and word[i] == "u": - word = "".join((word[:i], "U", word[i + 1 :])) - - # Every occurrence of 'u' and 'i' - # between vowels is put into upper case. - # Every occurrence of 'y' preceded or - # followed by a vowel is also put into upper case. - for i in range(1, len(word) - 1): - if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels: - if word[i] == "u": - word = "".join((word[:i], "U", word[i + 1 :])) - - elif word[i] == "i": - word = "".join((word[:i], "I", word[i + 1 :])) - - if word[i - 1] in self.__vowels or word[i + 1] in self.__vowels: - if word[i] == "y": - word = "".join((word[:i], "Y", word[i + 1 :])) - - r1, r2 = self._r1r2_standard(word, self.__vowels) - rv = self.__rv_french(word, self.__vowels) - - # STEP 1: Standard suffix removal - for suffix in self.__step1_suffixes: - if word.endswith(suffix): - if suffix == "eaux": - word = word[:-1] - step1_success = True - - elif suffix in ("euse", "euses"): - if suffix in r2: - word = word[: -len(suffix)] - step1_success = True - - elif suffix in r1: - word = suffix_replace(word, suffix, "eux") - step1_success = True - - elif suffix in ("ement", "ements") and suffix in rv: - word = word[: -len(suffix)] - step1_success = True - - if word[-2:] == "iv" and "iv" in r2: - word = word[:-2] - - if word[-2:] == "at" and "at" in r2: - word = word[:-2] - - elif word[-3:] == "eus": - if "eus" in r2: - word = word[:-3] - elif "eus" in r1: - word = "".join((word[:-1], "x")) - - elif word[-3:] in ("abl", "iqU"): - if "abl" in r2 or "iqU" in r2: - word = word[:-3] - - elif word[-3:] in ("i\xE8r", "I\xE8r"): - if "i\xE8r" in rv or "I\xE8r" in rv: - word = "".join((word[:-3], "i")) - - elif suffix == "amment" and suffix in rv: - word = suffix_replace(word, "amment", "ant") - rv = suffix_replace(rv, "amment", "ant") - rv_ending_found = True - - elif suffix == "emment" and suffix in rv: - word = suffix_replace(word, "emment", "ent") - rv_ending_found = True - - elif ( - suffix in ("ment", "ments") - and suffix in rv - and not rv.startswith(suffix) - and rv[rv.rindex(suffix) - 1] in self.__vowels - ): - word = word[: -len(suffix)] - rv = rv[: -len(suffix)] - rv_ending_found = True - - elif suffix == "aux" and suffix in r1: - word = "".join((word[:-2], "l")) - step1_success = True - - elif ( - suffix in ("issement", "issements") - and suffix in r1 - and word[-len(suffix) - 1] not in self.__vowels - ): - word = word[: -len(suffix)] - step1_success = True - - elif ( - suffix - in ( - "ance", - "iqUe", - "isme", - "able", - "iste", - "eux", - "ances", - "iqUes", - "ismes", - "ables", - "istes", - ) - and suffix in r2 - ): - word = word[: -len(suffix)] - step1_success = True - - elif ( - suffix - in ("atrice", "ateur", "ation", "atrices", "ateurs", "ations") - and suffix in r2 - ): - word = word[: -len(suffix)] - step1_success = True - - if word[-2:] == "ic": - if "ic" in r2: - word = word[:-2] - else: - word = "".join((word[:-2], "iqU")) - - elif suffix in ("logie", "logies") and suffix in r2: - word = suffix_replace(word, suffix, "log") - step1_success = True - - elif suffix in ("usion", "ution", "usions", "utions") and suffix in r2: - word = suffix_replace(word, suffix, "u") - step1_success = True - - elif suffix in ("ence", "ences") and suffix in r2: - word = suffix_replace(word, suffix, "ent") - step1_success = True - - elif suffix in ("it\xE9", "it\xE9s") and suffix in r2: - word = word[: -len(suffix)] - step1_success = True - - if word[-4:] == "abil": - if "abil" in r2: - word = 
word[:-4] - else: - word = "".join((word[:-2], "l")) - - elif word[-2:] == "ic": - if "ic" in r2: - word = word[:-2] - else: - word = "".join((word[:-2], "iqU")) - - elif word[-2:] == "iv": - if "iv" in r2: - word = word[:-2] - - elif suffix in ("if", "ive", "ifs", "ives") and suffix in r2: - word = word[: -len(suffix)] - step1_success = True - - if word[-2:] == "at" and "at" in r2: - word = word[:-2] - - if word[-2:] == "ic": - if "ic" in r2: - word = word[:-2] - else: - word = "".join((word[:-2], "iqU")) - break - - # STEP 2a: Verb suffixes beginning 'i' - if not step1_success or rv_ending_found: - for suffix in self.__step2a_suffixes: - if word.endswith(suffix): - if ( - suffix in rv - and len(rv) > len(suffix) - and rv[rv.rindex(suffix) - 1] not in self.__vowels - ): - word = word[: -len(suffix)] - step2a_success = True - break - - # STEP 2b: Other verb suffixes - if not step2a_success: - for suffix in self.__step2b_suffixes: - if rv.endswith(suffix): - if suffix == "ions" and "ions" in r2: - word = word[:-4] - step2b_success = True - - elif suffix in ( - "eraIent", - "erions", - "\xE8rent", - "erais", - "erait", - "eriez", - "erons", - "eront", - "erai", - "eras", - "erez", - "\xE9es", - "era", - "iez", - "\xE9e", - "\xE9s", - "er", - "ez", - "\xE9", - ): - word = word[: -len(suffix)] - step2b_success = True - - elif suffix in ( - "assions", - "assent", - "assiez", - "aIent", - "antes", - "asses", - "\xE2mes", - "\xE2tes", - "ante", - "ants", - "asse", - "ais", - "ait", - "ant", - "\xE2t", - "ai", - "as", - "a", - ): - word = word[: -len(suffix)] - rv = rv[: -len(suffix)] - step2b_success = True - if rv.endswith("e"): - word = word[:-1] - break - - # STEP 3 - if step1_success or step2a_success or step2b_success: - if word[-1] == "Y": - word = "".join((word[:-1], "i")) - elif word[-1] == "\xE7": - word = "".join((word[:-1], "c")) - - # STEP 4: Residual suffixes - else: - if len(word) >= 2 and word[-1] == "s" and word[-2] not in "aiou\xE8s": - word = word[:-1] - - for suffix in self.__step4_suffixes: - if word.endswith(suffix): - if suffix in rv: - if suffix == "ion" and suffix in r2 and rv[-4] in "st": - word = word[:-3] - - elif suffix in ("ier", "i\xE8re", "Ier", "I\xE8re"): - word = suffix_replace(word, suffix, "i") - - elif suffix == "e": - word = word[:-1] - - elif suffix == "\xEB" and word[-3:-1] == "gu": - word = word[:-1] - break - - # STEP 5: Undouble - if word.endswith(("enn", "onn", "ett", "ell", "eill")): - word = word[:-1] - - # STEP 6: Un-accent - for i in range(1, len(word)): - if word[-i] not in self.__vowels: - i += 1 - else: - if i != 1 and word[-i] in ("\xE9", "\xE8"): - word = "".join((word[:-i], "e", word[-i + 1 :])) - break - - word = word.replace("I", "i").replace("U", "u").replace("Y", "y") - - return word - - def __rv_french(self, word, vowels): - """ - Return the region RV that is used by the French stemmer. - - If the word begins with two vowels, RV is the region after - the third letter. Otherwise, it is the region after the first - vowel not at the beginning of the word, or the end of the word - if these positions cannot be found. (Exceptionally, u'par', - u'col' or u'tap' at the beginning of a word is also taken to - define RV as the region to their right.) - - :param word: The French word whose region RV is determined. - :type word: str or unicode - :param vowels: The French vowels that are used to determine - the region RV. - :type vowels: unicode - :return: the region RV for the respective French word. 
- :rtype: unicode - :note: This helper method is invoked by the stem method of - the subclass FrenchStemmer. It is not to be invoked directly! - - """ - rv = "" - if len(word) >= 2: - if word.startswith(("par", "col", "tap")) or ( - word[0] in vowels and word[1] in vowels - ): - rv = word[3:] - else: - for i in range(1, len(word)): - if word[i] in vowels: - rv = word[i + 1 :] - break - - return rv - - -class GermanStemmer(_StandardStemmer): - - """ - The German Snowball stemmer. - - :cvar __vowels: The German vowels. - :type __vowels: unicode - :cvar __s_ending: Letters that may directly appear before a word final 's'. - :type __s_ending: unicode - :cvar __st_ending: Letter that may directly appear before a word final 'st'. - :type __st_ending: unicode - :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. - :type __step1_suffixes: tuple - :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. - :type __step2_suffixes: tuple - :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. - :type __step3_suffixes: tuple - :note: A detailed description of the German - stemming algorithm can be found under - http://snowball.tartarus.org/algorithms/german/stemmer.html - - """ - - __vowels = "aeiouy\xE4\xF6\xFC" - __s_ending = "bdfghklmnrt" - __st_ending = "bdfghklmnt" - - __step1_suffixes = ("ern", "em", "er", "en", "es", "e", "s") - __step2_suffixes = ("est", "en", "er", "st") - __step3_suffixes = ("isch", "lich", "heit", "keit", "end", "ung", "ig", "ik") - - def stem(self, word): - """ - Stem a German word and return the stemmed form. - - :param word: The word that is stemmed. - :type word: str or unicode - :return: The stemmed form. - :rtype: unicode - - """ - word = word.lower() - - if word in self.stopwords: - return word - - word = word.replace("\xDF", "ss") - - # Every occurrence of 'u' and 'y' - # between vowels is put into upper case. - for i in range(1, len(word) - 1): - if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels: - if word[i] == "u": - word = "".join((word[:i], "U", word[i + 1 :])) - - elif word[i] == "y": - word = "".join((word[:i], "Y", word[i + 1 :])) - - r1, r2 = self._r1r2_standard(word, self.__vowels) - - # R1 is adjusted so that the region before it - # contains at least 3 letters. 
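# For example, for "arbeiten" the standard R1 would begin after "ar"; since the
# region before it has only two letters, the loop below resets R1 to word[3:]
# ("eiten"), so that at least three letters precede R1.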
- for i in range(1, len(word)): - if word[i] not in self.__vowels and word[i - 1] in self.__vowels: - if 3 > len(word[: i + 1]) > 0: - r1 = word[3:] - elif len(word[: i + 1]) == 0: - return word - break - - # STEP 1 - for suffix in self.__step1_suffixes: - if r1.endswith(suffix): - if ( - suffix in ("en", "es", "e") - and word[-len(suffix) - 4 : -len(suffix)] == "niss" - ): - word = word[: -len(suffix) - 1] - r1 = r1[: -len(suffix) - 1] - r2 = r2[: -len(suffix) - 1] - - elif suffix == "s": - if word[-2] in self.__s_ending: - word = word[:-1] - r1 = r1[:-1] - r2 = r2[:-1] - else: - word = word[: -len(suffix)] - r1 = r1[: -len(suffix)] - r2 = r2[: -len(suffix)] - break - - # STEP 2 - for suffix in self.__step2_suffixes: - if r1.endswith(suffix): - if suffix == "st": - if word[-3] in self.__st_ending and len(word[:-3]) >= 3: - word = word[:-2] - r1 = r1[:-2] - r2 = r2[:-2] - else: - word = word[: -len(suffix)] - r1 = r1[: -len(suffix)] - r2 = r2[: -len(suffix)] - break - - # STEP 3: Derivational suffixes - for suffix in self.__step3_suffixes: - if r2.endswith(suffix): - if suffix in ("end", "ung"): - if ( - "ig" in r2[-len(suffix) - 2 : -len(suffix)] - and "e" not in r2[-len(suffix) - 3 : -len(suffix) - 2] - ): - word = word[: -len(suffix) - 2] - else: - word = word[: -len(suffix)] - - elif ( - suffix in ("ig", "ik", "isch") - and "e" not in r2[-len(suffix) - 1 : -len(suffix)] - ): - word = word[: -len(suffix)] - - elif suffix in ("lich", "heit"): - if ( - "er" in r1[-len(suffix) - 2 : -len(suffix)] - or "en" in r1[-len(suffix) - 2 : -len(suffix)] - ): - word = word[: -len(suffix) - 2] - else: - word = word[: -len(suffix)] - - elif suffix == "keit": - if "lich" in r2[-len(suffix) - 4 : -len(suffix)]: - word = word[: -len(suffix) - 4] - - elif "ig" in r2[-len(suffix) - 2 : -len(suffix)]: - word = word[: -len(suffix) - 2] - else: - word = word[: -len(suffix)] - break - - # Umlaut accents are removed and - # 'u' and 'y' are put back into lower case. - word = ( - word.replace("\xE4", "a") - .replace("\xF6", "o") - .replace("\xFC", "u") - .replace("U", "u") - .replace("Y", "y") - ) - - return word - - -class HungarianStemmer(_LanguageSpecificStemmer): - - """ - The Hungarian Snowball stemmer. - - :cvar __vowels: The Hungarian vowels. - :type __vowels: unicode - :cvar __digraphs: The Hungarian digraphs. - :type __digraphs: tuple - :cvar __double_consonants: The Hungarian double consonants. - :type __double_consonants: tuple - :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. - :type __step1_suffixes: tuple - :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. - :type __step2_suffixes: tuple - :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. - :type __step3_suffixes: tuple - :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. - :type __step4_suffixes: tuple - :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm. - :type __step5_suffixes: tuple - :cvar __step6_suffixes: Suffixes to be deleted in step 6 of the algorithm. - :type __step6_suffixes: tuple - :cvar __step7_suffixes: Suffixes to be deleted in step 7 of the algorithm. - :type __step7_suffixes: tuple - :cvar __step8_suffixes: Suffixes to be deleted in step 8 of the algorithm. - :type __step8_suffixes: tuple - :cvar __step9_suffixes: Suffixes to be deleted in step 9 of the algorithm. 
- :type __step9_suffixes: tuple - :note: A detailed description of the Hungarian - stemming algorithm can be found under - http://snowball.tartarus.org/algorithms/hungarian/stemmer.html - - """ - - __vowels = "aeiou\xF6\xFC\xE1\xE9\xED\xF3\xF5\xFA\xFB" - __digraphs = ("cs", "dz", "dzs", "gy", "ly", "ny", "ty", "zs") - __double_consonants = ( - "bb", - "cc", - "ccs", - "dd", - "ff", - "gg", - "ggy", - "jj", - "kk", - "ll", - "lly", - "mm", - "nn", - "nny", - "pp", - "rr", - "ss", - "ssz", - "tt", - "tty", - "vv", - "zz", - "zzs", - ) - - __step1_suffixes = ("al", "el") - __step2_suffixes = ( - "k\xE9ppen", - "onk\xE9nt", - "enk\xE9nt", - "ank\xE9nt", - "k\xE9pp", - "k\xE9nt", - "ban", - "ben", - "nak", - "nek", - "val", - "vel", - "t\xF3l", - "t\xF5l", - "r\xF3l", - "r\xF5l", - "b\xF3l", - "b\xF5l", - "hoz", - "hez", - "h\xF6z", - "n\xE1l", - "n\xE9l", - "\xE9rt", - "kor", - "ba", - "be", - "ra", - "re", - "ig", - "at", - "et", - "ot", - "\xF6t", - "ul", - "\xFCl", - "v\xE1", - "v\xE9", - "en", - "on", - "an", - "\xF6n", - "n", - "t", - ) - __step3_suffixes = ("\xE1nk\xE9nt", "\xE1n", "\xE9n") - __step4_suffixes = ( - "astul", - "est\xFCl", - "\xE1stul", - "\xE9st\xFCl", - "stul", - "st\xFCl", - ) - __step5_suffixes = ("\xE1", "\xE9") - __step6_suffixes = ( - "ok\xE9", - "\xF6k\xE9", - "ak\xE9", - "ek\xE9", - "\xE1k\xE9", - "\xE1\xE9i", - "\xE9k\xE9", - "\xE9\xE9i", - "k\xE9", - "\xE9i", - "\xE9\xE9", - "\xE9", - ) - __step7_suffixes = ( - "\xE1juk", - "\xE9j\xFCk", - "\xFCnk", - "unk", - "juk", - "j\xFCk", - "\xE1nk", - "\xE9nk", - "nk", - "uk", - "\xFCk", - "em", - "om", - "am", - "od", - "ed", - "ad", - "\xF6d", - "ja", - "je", - "\xE1m", - "\xE1d", - "\xE9m", - "\xE9d", - "m", - "d", - "a", - "e", - "o", - "\xE1", - "\xE9", - ) - __step8_suffixes = ( - "jaitok", - "jeitek", - "jaink", - "jeink", - "aitok", - "eitek", - "\xE1itok", - "\xE9itek", - "jaim", - "jeim", - "jaid", - "jeid", - "eink", - "aink", - "itek", - "jeik", - "jaik", - "\xE1ink", - "\xE9ink", - "aim", - "eim", - "aid", - "eid", - "jai", - "jei", - "ink", - "aik", - "eik", - "\xE1im", - "\xE1id", - "\xE1ik", - "\xE9im", - "\xE9id", - "\xE9ik", - "im", - "id", - "ai", - "ei", - "ik", - "\xE1i", - "\xE9i", - "i", - ) - __step9_suffixes = ("\xE1k", "\xE9k", "\xF6k", "ok", "ek", "ak", "k") - - def stem(self, word): - """ - Stem an Hungarian word and return the stemmed form. - - :param word: The word that is stemmed. - :type word: str or unicode - :return: The stemmed form. 
- :rtype: unicode - - """ - word = word.lower() - - if word in self.stopwords: - return word - - r1 = self.__r1_hungarian(word, self.__vowels, self.__digraphs) - - # STEP 1: Remove instrumental case - if r1.endswith(self.__step1_suffixes): - for double_cons in self.__double_consonants: - if word[-2 - len(double_cons) : -2] == double_cons: - word = "".join((word[:-4], word[-3])) - - if r1[-2 - len(double_cons) : -2] == double_cons: - r1 = "".join((r1[:-4], r1[-3])) - break - - # STEP 2: Remove frequent cases - for suffix in self.__step2_suffixes: - if word.endswith(suffix): - if r1.endswith(suffix): - word = word[: -len(suffix)] - r1 = r1[: -len(suffix)] - - if r1.endswith("\xE1"): - word = "".join((word[:-1], "a")) - r1 = suffix_replace(r1, "\xE1", "a") - - elif r1.endswith("\xE9"): - word = "".join((word[:-1], "e")) - r1 = suffix_replace(r1, "\xE9", "e") - break - - # STEP 3: Remove special cases - for suffix in self.__step3_suffixes: - if r1.endswith(suffix): - if suffix == "\xE9n": - word = suffix_replace(word, suffix, "e") - r1 = suffix_replace(r1, suffix, "e") - else: - word = suffix_replace(word, suffix, "a") - r1 = suffix_replace(r1, suffix, "a") - break - - # STEP 4: Remove other cases - for suffix in self.__step4_suffixes: - if r1.endswith(suffix): - if suffix == "\xE1stul": - word = suffix_replace(word, suffix, "a") - r1 = suffix_replace(r1, suffix, "a") - - elif suffix == "\xE9st\xFCl": - word = suffix_replace(word, suffix, "e") - r1 = suffix_replace(r1, suffix, "e") - else: - word = word[: -len(suffix)] - r1 = r1[: -len(suffix)] - break - - # STEP 5: Remove factive case - for suffix in self.__step5_suffixes: - if r1.endswith(suffix): - for double_cons in self.__double_consonants: - if word[-1 - len(double_cons) : -1] == double_cons: - word = "".join((word[:-3], word[-2])) - - if r1[-1 - len(double_cons) : -1] == double_cons: - r1 = "".join((r1[:-3], r1[-2])) - break - - # STEP 6: Remove owned - for suffix in self.__step6_suffixes: - if r1.endswith(suffix): - if suffix in ("\xE1k\xE9", "\xE1\xE9i"): - word = suffix_replace(word, suffix, "a") - r1 = suffix_replace(r1, suffix, "a") - - elif suffix in ("\xE9k\xE9", "\xE9\xE9i", "\xE9\xE9"): - word = suffix_replace(word, suffix, "e") - r1 = suffix_replace(r1, suffix, "e") - else: - word = word[: -len(suffix)] - r1 = r1[: -len(suffix)] - break - - # STEP 7: Remove singular owner suffixes - for suffix in self.__step7_suffixes: - if word.endswith(suffix): - if r1.endswith(suffix): - if suffix in ("\xE1nk", "\xE1juk", "\xE1m", "\xE1d", "\xE1"): - word = suffix_replace(word, suffix, "a") - r1 = suffix_replace(r1, suffix, "a") - - elif suffix in ("\xE9nk", "\xE9j\xFCk", "\xE9m", "\xE9d", "\xE9"): - word = suffix_replace(word, suffix, "e") - r1 = suffix_replace(r1, suffix, "e") - else: - word = word[: -len(suffix)] - r1 = r1[: -len(suffix)] - break - - # STEP 8: Remove plural owner suffixes - for suffix in self.__step8_suffixes: - if word.endswith(suffix): - if r1.endswith(suffix): - if suffix in ( - "\xE1im", - "\xE1id", - "\xE1i", - "\xE1ink", - "\xE1itok", - "\xE1ik", - ): - word = suffix_replace(word, suffix, "a") - r1 = suffix_replace(r1, suffix, "a") - - elif suffix in ( - "\xE9im", - "\xE9id", - "\xE9i", - "\xE9ink", - "\xE9itek", - "\xE9ik", - ): - word = suffix_replace(word, suffix, "e") - r1 = suffix_replace(r1, suffix, "e") - else: - word = word[: -len(suffix)] - r1 = r1[: -len(suffix)] - break - - # STEP 9: Remove plural suffixes - for suffix in self.__step9_suffixes: - if word.endswith(suffix): - if r1.endswith(suffix): - if 
suffix == "\xE1k": - word = suffix_replace(word, suffix, "a") - elif suffix == "\xE9k": - word = suffix_replace(word, suffix, "e") - else: - word = word[: -len(suffix)] - break - - return word - - def __r1_hungarian(self, word, vowels, digraphs): - """ - Return the region R1 that is used by the Hungarian stemmer. - - If the word begins with a vowel, R1 is defined as the region - after the first consonant or digraph (= two letters stand for - one phoneme) in the word. If the word begins with a consonant, - it is defined as the region after the first vowel in the word. - If the word does not contain both a vowel and consonant, R1 - is the null region at the end of the word. - - :param word: The Hungarian word whose region R1 is determined. - :type word: str or unicode - :param vowels: The Hungarian vowels that are used to determine - the region R1. - :type vowels: unicode - :param digraphs: The digraphs that are used to determine the - region R1. - :type digraphs: tuple - :return: the region R1 for the respective word. - :rtype: unicode - :note: This helper method is invoked by the stem method of the subclass - HungarianStemmer. It is not to be invoked directly! - - """ - r1 = "" - if word[0] in vowels: - for digraph in digraphs: - if digraph in word[1:]: - r1 = word[word.index(digraph[-1]) + 1 :] - return r1 - - for i in range(1, len(word)): - if word[i] not in vowels: - r1 = word[i + 1 :] - break - else: - for i in range(1, len(word)): - if word[i] in vowels: - r1 = word[i + 1 :] - break - - return r1 - - -class ItalianStemmer(_StandardStemmer): - - """ - The Italian Snowball stemmer. - - :cvar __vowels: The Italian vowels. - :type __vowels: unicode - :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm. - :type __step0_suffixes: tuple - :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. - :type __step1_suffixes: tuple - :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. 
- :type __step2_suffixes: tuple - :note: A detailed description of the Italian - stemming algorithm can be found under - http://snowball.tartarus.org/algorithms/italian/stemmer.html - - """ - - __vowels = "aeiou\xE0\xE8\xEC\xF2\xF9" - __step0_suffixes = ( - "gliela", - "gliele", - "glieli", - "glielo", - "gliene", - "sene", - "mela", - "mele", - "meli", - "melo", - "mene", - "tela", - "tele", - "teli", - "telo", - "tene", - "cela", - "cele", - "celi", - "celo", - "cene", - "vela", - "vele", - "veli", - "velo", - "vene", - "gli", - "ci", - "la", - "le", - "li", - "lo", - "mi", - "ne", - "si", - "ti", - "vi", - ) - __step1_suffixes = ( - "atrice", - "atrici", - "azione", - "azioni", - "uzione", - "uzioni", - "usione", - "usioni", - "amento", - "amenti", - "imento", - "imenti", - "amente", - "abile", - "abili", - "ibile", - "ibili", - "mente", - "atore", - "atori", - "logia", - "logie", - "anza", - "anze", - "iche", - "ichi", - "ismo", - "ismi", - "ista", - "iste", - "isti", - "ist\xE0", - "ist\xE8", - "ist\xEC", - "ante", - "anti", - "enza", - "enze", - "ico", - "ici", - "ica", - "ice", - "oso", - "osi", - "osa", - "ose", - "it\xE0", - "ivo", - "ivi", - "iva", - "ive", - ) - __step2_suffixes = ( - "erebbero", - "irebbero", - "assero", - "assimo", - "eranno", - "erebbe", - "eremmo", - "ereste", - "eresti", - "essero", - "iranno", - "irebbe", - "iremmo", - "ireste", - "iresti", - "iscano", - "iscono", - "issero", - "arono", - "avamo", - "avano", - "avate", - "eremo", - "erete", - "erono", - "evamo", - "evano", - "evate", - "iremo", - "irete", - "irono", - "ivamo", - "ivano", - "ivate", - "ammo", - "ando", - "asse", - "assi", - "emmo", - "enda", - "ende", - "endi", - "endo", - "erai", - "erei", - "Yamo", - "iamo", - "immo", - "irai", - "irei", - "isca", - "isce", - "isci", - "isco", - "ano", - "are", - "ata", - "ate", - "ati", - "ato", - "ava", - "avi", - "avo", - "er\xE0", - "ere", - "er\xF2", - "ete", - "eva", - "evi", - "evo", - "ir\xE0", - "ire", - "ir\xF2", - "ita", - "ite", - "iti", - "ito", - "iva", - "ivi", - "ivo", - "ono", - "uta", - "ute", - "uti", - "uto", - "ar", - "ir", - ) - - def stem(self, word): - """ - Stem an Italian word and return the stemmed form. - - :param word: The word that is stemmed. - :type word: str or unicode - :return: The stemmed form. - :rtype: unicode - - """ - word = word.lower() - - if word in self.stopwords: - return word - - step1_success = False - - # All acute accents are replaced by grave accents. - word = ( - word.replace("\xE1", "\xE0") - .replace("\xE9", "\xE8") - .replace("\xED", "\xEC") - .replace("\xF3", "\xF2") - .replace("\xFA", "\xF9") - ) - - # Every occurrence of 'u' after 'q' - # is put into upper case. - for i in range(1, len(word)): - if word[i - 1] == "q" and word[i] == "u": - word = "".join((word[:i], "U", word[i + 1 :])) - - # Every occurrence of 'u' and 'i' - # between vowels is put into upper case. 
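# For example, in "paio" the "i" sits between the vowels "a" and "o", so the
# loop below marks it as "I" ("paIo"); the mark is undone at the end of stem()
# by word.replace("I", "i").replace("U", "u").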
- for i in range(1, len(word) - 1): - if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels: - if word[i] == "u": - word = "".join((word[:i], "U", word[i + 1 :])) - - elif word[i] == "i": - word = "".join((word[:i], "I", word[i + 1 :])) - - r1, r2 = self._r1r2_standard(word, self.__vowels) - rv = self._rv_standard(word, self.__vowels) - - # STEP 0: Attached pronoun - for suffix in self.__step0_suffixes: - if rv.endswith(suffix): - if rv[-len(suffix) - 4 : -len(suffix)] in ("ando", "endo"): - word = word[: -len(suffix)] - r1 = r1[: -len(suffix)] - r2 = r2[: -len(suffix)] - rv = rv[: -len(suffix)] - - elif rv[-len(suffix) - 2 : -len(suffix)] in ("ar", "er", "ir"): - word = suffix_replace(word, suffix, "e") - r1 = suffix_replace(r1, suffix, "e") - r2 = suffix_replace(r2, suffix, "e") - rv = suffix_replace(rv, suffix, "e") - break - - # STEP 1: Standard suffix removal - for suffix in self.__step1_suffixes: - if word.endswith(suffix): - if suffix == "amente" and r1.endswith(suffix): - step1_success = True - word = word[:-6] - r2 = r2[:-6] - rv = rv[:-6] - - if r2.endswith("iv"): - word = word[:-2] - r2 = r2[:-2] - rv = rv[:-2] - - if r2.endswith("at"): - word = word[:-2] - rv = rv[:-2] - - elif r2.endswith(("os", "ic")): - word = word[:-2] - rv = rv[:-2] - - elif r2.endswith("abil"): - word = word[:-4] - rv = rv[:-4] - - elif suffix in ("amento", "amenti", "imento", "imenti") and rv.endswith( - suffix - ): - step1_success = True - word = word[:-6] - rv = rv[:-6] - - elif r2.endswith(suffix): - step1_success = True - if suffix in ("azione", "azioni", "atore", "atori"): - word = word[: -len(suffix)] - r2 = r2[: -len(suffix)] - rv = rv[: -len(suffix)] - - if r2.endswith("ic"): - word = word[:-2] - rv = rv[:-2] - - elif suffix in ("logia", "logie"): - word = word[:-2] - rv = word[:-2] - - elif suffix in ("uzione", "uzioni", "usione", "usioni"): - word = word[:-5] - rv = rv[:-5] - - elif suffix in ("enza", "enze"): - word = suffix_replace(word, suffix, "te") - rv = suffix_replace(rv, suffix, "te") - - elif suffix == "it\xE0": - word = word[:-3] - r2 = r2[:-3] - rv = rv[:-3] - - if r2.endswith(("ic", "iv")): - word = word[:-2] - rv = rv[:-2] - - elif r2.endswith("abil"): - word = word[:-4] - rv = rv[:-4] - - elif suffix in ("ivo", "ivi", "iva", "ive"): - word = word[:-3] - r2 = r2[:-3] - rv = rv[:-3] - - if r2.endswith("at"): - word = word[:-2] - r2 = r2[:-2] - rv = rv[:-2] - - if r2.endswith("ic"): - word = word[:-2] - rv = rv[:-2] - else: - word = word[: -len(suffix)] - rv = rv[: -len(suffix)] - break - - # STEP 2: Verb suffixes - if not step1_success: - for suffix in self.__step2_suffixes: - if rv.endswith(suffix): - word = word[: -len(suffix)] - rv = rv[: -len(suffix)] - break - - # STEP 3a - if rv.endswith(("a", "e", "i", "o", "\xE0", "\xE8", "\xEC", "\xF2")): - word = word[:-1] - rv = rv[:-1] - - if rv.endswith("i"): - word = word[:-1] - rv = rv[:-1] - - # STEP 3b - if rv.endswith(("ch", "gh")): - word = word[:-1] - - word = word.replace("I", "i").replace("U", "u") - - return word - - -class NorwegianStemmer(_ScandinavianStemmer): - - """ - The Norwegian Snowball stemmer. - - :cvar __vowels: The Norwegian vowels. - :type __vowels: unicode - :cvar __s_ending: Letters that may directly appear before a word final 's'. - :type __s_ending: unicode - :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. - :type __step1_suffixes: tuple - :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. 
- :type __step2_suffixes: tuple - :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. - :type __step3_suffixes: tuple - :note: A detailed description of the Norwegian - stemming algorithm can be found under - http://snowball.tartarus.org/algorithms/norwegian/stemmer.html - - """ - - __vowels = "aeiouy\xE6\xE5\xF8" - __s_ending = "bcdfghjlmnoprtvyz" - __step1_suffixes = ( - "hetenes", - "hetene", - "hetens", - "heter", - "heten", - "endes", - "ande", - "ende", - "edes", - "enes", - "erte", - "ede", - "ane", - "ene", - "ens", - "ers", - "ets", - "het", - "ast", - "ert", - "en", - "ar", - "er", - "as", - "es", - "et", - "a", - "e", - "s", - ) - - __step2_suffixes = ("dt", "vt") - - __step3_suffixes = ( - "hetslov", - "eleg", - "elig", - "elov", - "slov", - "leg", - "eig", - "lig", - "els", - "lov", - "ig", - ) - - def stem(self, word): - """ - Stem a Norwegian word and return the stemmed form. - - :param word: The word that is stemmed. - :type word: str or unicode - :return: The stemmed form. - :rtype: unicode - - """ - word = word.lower() - - if word in self.stopwords: - return word - - r1 = self._r1_scandinavian(word, self.__vowels) - - # STEP 1 - for suffix in self.__step1_suffixes: - if r1.endswith(suffix): - if suffix in ("erte", "ert"): - word = suffix_replace(word, suffix, "er") - r1 = suffix_replace(r1, suffix, "er") - - elif suffix == "s": - if word[-2] in self.__s_ending or ( - word[-2] == "k" and word[-3] not in self.__vowels - ): - word = word[:-1] - r1 = r1[:-1] - else: - word = word[: -len(suffix)] - r1 = r1[: -len(suffix)] - break - - # STEP 2 - for suffix in self.__step2_suffixes: - if r1.endswith(suffix): - word = word[:-1] - r1 = r1[:-1] - break - - # STEP 3 - for suffix in self.__step3_suffixes: - if r1.endswith(suffix): - word = word[: -len(suffix)] - break - - return word - - -class PortugueseStemmer(_StandardStemmer): - - """ - The Portuguese Snowball stemmer. - - :cvar __vowels: The Portuguese vowels. - :type __vowels: unicode - :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. - :type __step1_suffixes: tuple - :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. - :type __step2_suffixes: tuple - :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. 
- :type __step4_suffixes: tuple - :note: A detailed description of the Portuguese - stemming algorithm can be found under - http://snowball.tartarus.org/algorithms/portuguese/stemmer.html - - """ - - __vowels = "aeiou\xE1\xE9\xED\xF3\xFA\xE2\xEA\xF4" - __step1_suffixes = ( - "amentos", - "imentos", - "uço~es", - "amento", - "imento", - "adoras", - "adores", - "a\xE7o~es", - "logias", - "\xEAncias", - "amente", - "idades", - "an\xE7as", - "ismos", - "istas", - "adora", - "a\xE7a~o", - "antes", - "\xE2ncia", - "logia", - "uça~o", - "\xEAncia", - "mente", - "idade", - "an\xE7a", - "ezas", - "icos", - "icas", - "ismo", - "\xE1vel", - "\xEDvel", - "ista", - "osos", - "osas", - "ador", - "ante", - "ivas", - "ivos", - "iras", - "eza", - "ico", - "ica", - "oso", - "osa", - "iva", - "ivo", - "ira", - ) - __step2_suffixes = ( - "ar\xEDamos", - "er\xEDamos", - "ir\xEDamos", - "\xE1ssemos", - "\xEAssemos", - "\xEDssemos", - "ar\xEDeis", - "er\xEDeis", - "ir\xEDeis", - "\xE1sseis", - "\xE9sseis", - "\xEDsseis", - "\xE1ramos", - "\xE9ramos", - "\xEDramos", - "\xE1vamos", - "aremos", - "eremos", - "iremos", - "ariam", - "eriam", - "iriam", - "assem", - "essem", - "issem", - "ara~o", - "era~o", - "ira~o", - "arias", - "erias", - "irias", - "ardes", - "erdes", - "irdes", - "asses", - "esses", - "isses", - "astes", - "estes", - "istes", - "\xE1reis", - "areis", - "\xE9reis", - "ereis", - "\xEDreis", - "ireis", - "\xE1veis", - "\xEDamos", - "armos", - "ermos", - "irmos", - "aria", - "eria", - "iria", - "asse", - "esse", - "isse", - "aste", - "este", - "iste", - "arei", - "erei", - "irei", - "aram", - "eram", - "iram", - "avam", - "arem", - "erem", - "irem", - "ando", - "endo", - "indo", - "adas", - "idas", - "ar\xE1s", - "aras", - "er\xE1s", - "eras", - "ir\xE1s", - "avas", - "ares", - "eres", - "ires", - "\xEDeis", - "ados", - "idos", - "\xE1mos", - "amos", - "emos", - "imos", - "iras", - "ada", - "ida", - "ar\xE1", - "ara", - "er\xE1", - "era", - "ir\xE1", - "ava", - "iam", - "ado", - "ido", - "ias", - "ais", - "eis", - "ira", - "ia", - "ei", - "am", - "em", - "ar", - "er", - "ir", - "as", - "es", - "is", - "eu", - "iu", - "ou", - ) - __step4_suffixes = ("os", "a", "i", "o", "\xE1", "\xED", "\xF3") - - def stem(self, word): - """ - Stem a Portuguese word and return the stemmed form. - - :param word: The word that is stemmed. - :type word: str or unicode - :return: The stemmed form. 
- :rtype: unicode - - """ - word = word.lower() - - if word in self.stopwords: - return word - - step1_success = False - step2_success = False - - word = ( - word.replace("\xE3", "a~") - .replace("\xF5", "o~") - .replace("q\xFC", "qu") - .replace("g\xFC", "gu") - ) - - r1, r2 = self._r1r2_standard(word, self.__vowels) - rv = self._rv_standard(word, self.__vowels) - - # STEP 1: Standard suffix removal - for suffix in self.__step1_suffixes: - if word.endswith(suffix): - if suffix == "amente" and r1.endswith(suffix): - step1_success = True - - word = word[:-6] - r2 = r2[:-6] - rv = rv[:-6] - - if r2.endswith("iv"): - word = word[:-2] - r2 = r2[:-2] - rv = rv[:-2] - - if r2.endswith("at"): - word = word[:-2] - rv = rv[:-2] - - elif r2.endswith(("os", "ic", "ad")): - word = word[:-2] - rv = rv[:-2] - - elif ( - suffix in ("ira", "iras") - and rv.endswith(suffix) - and word[-len(suffix) - 1 : -len(suffix)] == "e" - ): - step1_success = True - - word = suffix_replace(word, suffix, "ir") - rv = suffix_replace(rv, suffix, "ir") - - elif r2.endswith(suffix): - step1_success = True - - if suffix in ("logia", "logias"): - word = suffix_replace(word, suffix, "log") - rv = suffix_replace(rv, suffix, "log") - - elif suffix in ("uça~o", "uço~es"): - word = suffix_replace(word, suffix, "u") - rv = suffix_replace(rv, suffix, "u") - - elif suffix in ("\xEAncia", "\xEAncias"): - word = suffix_replace(word, suffix, "ente") - rv = suffix_replace(rv, suffix, "ente") - - elif suffix == "mente": - word = word[:-5] - r2 = r2[:-5] - rv = rv[:-5] - - if r2.endswith(("ante", "avel", "ivel")): - word = word[:-4] - rv = rv[:-4] - - elif suffix in ("idade", "idades"): - word = word[: -len(suffix)] - r2 = r2[: -len(suffix)] - rv = rv[: -len(suffix)] - - if r2.endswith(("ic", "iv")): - word = word[:-2] - rv = rv[:-2] - - elif r2.endswith("abil"): - word = word[:-4] - rv = rv[:-4] - - elif suffix in ("iva", "ivo", "ivas", "ivos"): - word = word[: -len(suffix)] - r2 = r2[: -len(suffix)] - rv = rv[: -len(suffix)] - - if r2.endswith("at"): - word = word[:-2] - rv = rv[:-2] - else: - word = word[: -len(suffix)] - rv = rv[: -len(suffix)] - break - - # STEP 2: Verb suffixes - if not step1_success: - for suffix in self.__step2_suffixes: - if rv.endswith(suffix): - step2_success = True - - word = word[: -len(suffix)] - rv = rv[: -len(suffix)] - break - - # STEP 3 - if step1_success or step2_success: - if rv.endswith("i") and word[-2] == "c": - word = word[:-1] - rv = rv[:-1] - - ### STEP 4: Residual suffix - if not step1_success and not step2_success: - for suffix in self.__step4_suffixes: - if rv.endswith(suffix): - word = word[: -len(suffix)] - rv = rv[: -len(suffix)] - break - - # STEP 5 - if rv.endswith(("e", "\xE9", "\xEA")): - word = word[:-1] - rv = rv[:-1] - - if (word.endswith("gu") and rv.endswith("u")) or ( - word.endswith("ci") and rv.endswith("i") - ): - word = word[:-1] - - elif word.endswith("\xE7"): - word = suffix_replace(word, "\xE7", "c") - - word = word.replace("a~", "\xE3").replace("o~", "\xF5") - - return word - - -class RomanianStemmer(_StandardStemmer): - - """ - The Romanian Snowball stemmer. - - :cvar __vowels: The Romanian vowels. - :type __vowels: unicode - :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm. - :type __step0_suffixes: tuple - :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. - :type __step1_suffixes: tuple - :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. 
- :type __step2_suffixes: tuple - :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. - :type __step3_suffixes: tuple - :note: A detailed description of the Romanian - stemming algorithm can be found under - http://snowball.tartarus.org/algorithms/romanian/stemmer.html - - """ - - __vowels = "aeiou\u0103\xE2\xEE" - __step0_suffixes = ( - "iilor", - "ului", - "elor", - "iile", - "ilor", - "atei", - "a\u0163ie", - "a\u0163ia", - "aua", - "ele", - "iua", - "iei", - "ile", - "ul", - "ea", - "ii", - ) - __step1_suffixes = ( - "abilitate", - "abilitati", - "abilit\u0103\u0163i", - "ibilitate", - "abilit\u0103i", - "ivitate", - "ivitati", - "ivit\u0103\u0163i", - "icitate", - "icitati", - "icit\u0103\u0163i", - "icatori", - "ivit\u0103i", - "icit\u0103i", - "icator", - "a\u0163iune", - "atoare", - "\u0103toare", - "i\u0163iune", - "itoare", - "iciva", - "icive", - "icivi", - "iciv\u0103", - "icala", - "icale", - "icali", - "ical\u0103", - "ativa", - "ative", - "ativi", - "ativ\u0103", - "atori", - "\u0103tori", - "itiva", - "itive", - "itivi", - "itiv\u0103", - "itori", - "iciv", - "ical", - "ativ", - "ator", - "\u0103tor", - "itiv", - "itor", - ) - __step2_suffixes = ( - "abila", - "abile", - "abili", - "abil\u0103", - "ibila", - "ibile", - "ibili", - "ibil\u0103", - "atori", - "itate", - "itati", - "it\u0103\u0163i", - "abil", - "ibil", - "oasa", - "oas\u0103", - "oase", - "anta", - "ante", - "anti", - "ant\u0103", - "ator", - "it\u0103i", - "iune", - "iuni", - "isme", - "ista", - "iste", - "isti", - "ist\u0103", - "i\u015Fti", - "ata", - "at\u0103", - "ati", - "ate", - "uta", - "ut\u0103", - "uti", - "ute", - "ita", - "it\u0103", - "iti", - "ite", - "ica", - "ice", - "ici", - "ic\u0103", - "osi", - "o\u015Fi", - "ant", - "iva", - "ive", - "ivi", - "iv\u0103", - "ism", - "ist", - "at", - "ut", - "it", - "ic", - "os", - "iv", - ) - __step3_suffixes = ( - "seser\u0103\u0163i", - "aser\u0103\u0163i", - "iser\u0103\u0163i", - "\xE2ser\u0103\u0163i", - "user\u0103\u0163i", - "seser\u0103m", - "aser\u0103m", - "iser\u0103m", - "\xE2ser\u0103m", - "user\u0103m", - "ser\u0103\u0163i", - "sese\u015Fi", - "seser\u0103", - "easc\u0103", - "ar\u0103\u0163i", - "ur\u0103\u0163i", - "ir\u0103\u0163i", - "\xE2r\u0103\u0163i", - "ase\u015Fi", - "aser\u0103", - "ise\u015Fi", - "iser\u0103", - "\xe2se\u015Fi", - "\xE2ser\u0103", - "use\u015Fi", - "user\u0103", - "ser\u0103m", - "sesem", - "indu", - "\xE2ndu", - "eaz\u0103", - "e\u015Fti", - "e\u015Fte", - "\u0103\u015Fti", - "\u0103\u015Fte", - "ea\u0163i", - "ia\u0163i", - "ar\u0103m", - "ur\u0103m", - "ir\u0103m", - "\xE2r\u0103m", - "asem", - "isem", - "\xE2sem", - "usem", - "se\u015Fi", - "ser\u0103", - "sese", - "are", - "ere", - "ire", - "\xE2re", - "ind", - "\xE2nd", - "eze", - "ezi", - "esc", - "\u0103sc", - "eam", - "eai", - "eau", - "iam", - "iai", - "iau", - "a\u015Fi", - "ar\u0103", - "u\u015Fi", - "ur\u0103", - "i\u015Fi", - "ir\u0103", - "\xE2\u015Fi", - "\xe2r\u0103", - "ase", - "ise", - "\xE2se", - "use", - "a\u0163i", - "e\u0163i", - "i\u0163i", - "\xe2\u0163i", - "sei", - "ez", - "am", - "ai", - "au", - "ea", - "ia", - "ui", - "\xE2i", - "\u0103m", - "em", - "im", - "\xE2m", - "se", - ) - - def stem(self, word): - """ - Stem a Romanian word and return the stemmed form. - - :param word: The word that is stemmed. - :type word: str or unicode - :return: The stemmed form. 
- :rtype: unicode - - """ - word = word.lower() - - if word in self.stopwords: - return word - - step1_success = False - step2_success = False - - for i in range(1, len(word) - 1): - if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels: - if word[i] == "u": - word = "".join((word[:i], "U", word[i + 1 :])) - - elif word[i] == "i": - word = "".join((word[:i], "I", word[i + 1 :])) - - r1, r2 = self._r1r2_standard(word, self.__vowels) - rv = self._rv_standard(word, self.__vowels) - - # STEP 0: Removal of plurals and other simplifications - for suffix in self.__step0_suffixes: - if word.endswith(suffix): - if suffix in r1: - if suffix in ("ul", "ului"): - word = word[: -len(suffix)] - - if suffix in rv: - rv = rv[: -len(suffix)] - else: - rv = "" - - elif ( - suffix == "aua" - or suffix == "atei" - or (suffix == "ile" and word[-5:-3] != "ab") - ): - word = word[:-2] - - elif suffix in ("ea", "ele", "elor"): - word = suffix_replace(word, suffix, "e") - - if suffix in rv: - rv = suffix_replace(rv, suffix, "e") - else: - rv = "" - - elif suffix in ("ii", "iua", "iei", "iile", "iilor", "ilor"): - word = suffix_replace(word, suffix, "i") - - if suffix in rv: - rv = suffix_replace(rv, suffix, "i") - else: - rv = "" - - elif suffix in ("a\u0163ie", "a\u0163ia"): - word = word[:-1] - break - - # STEP 1: Reduction of combining suffixes - while True: - - replacement_done = False - - for suffix in self.__step1_suffixes: - if word.endswith(suffix): - if suffix in r1: - step1_success = True - replacement_done = True - - if suffix in ( - "abilitate", - "abilitati", - "abilit\u0103i", - "abilit\u0103\u0163i", - ): - word = suffix_replace(word, suffix, "abil") - - elif suffix == "ibilitate": - word = word[:-5] - - elif suffix in ( - "ivitate", - "ivitati", - "ivit\u0103i", - "ivit\u0103\u0163i", - ): - word = suffix_replace(word, suffix, "iv") - - elif suffix in ( - "icitate", - "icitati", - "icit\u0103i", - "icit\u0103\u0163i", - "icator", - "icatori", - "iciv", - "iciva", - "icive", - "icivi", - "iciv\u0103", - "ical", - "icala", - "icale", - "icali", - "ical\u0103", - ): - word = suffix_replace(word, suffix, "ic") - - elif suffix in ( - "ativ", - "ativa", - "ative", - "ativi", - "ativ\u0103", - "a\u0163iune", - "atoare", - "ator", - "atori", - "\u0103toare", - "\u0103tor", - "\u0103tori", - ): - word = suffix_replace(word, suffix, "at") - - if suffix in r2: - r2 = suffix_replace(r2, suffix, "at") - - elif suffix in ( - "itiv", - "itiva", - "itive", - "itivi", - "itiv\u0103", - "i\u0163iune", - "itoare", - "itor", - "itori", - ): - word = suffix_replace(word, suffix, "it") - - if suffix in r2: - r2 = suffix_replace(r2, suffix, "it") - else: - step1_success = False - break - - if not replacement_done: - break - - # STEP 2: Removal of standard suffixes - for suffix in self.__step2_suffixes: - if word.endswith(suffix): - if suffix in r2: - step2_success = True - - if suffix in ("iune", "iuni"): - if word[-5] == "\u0163": - word = "".join((word[:-5], "t")) - - elif suffix in ( - "ism", - "isme", - "ist", - "ista", - "iste", - "isti", - "ist\u0103", - "i\u015Fti", - ): - word = suffix_replace(word, suffix, "ist") - - else: - word = word[: -len(suffix)] - break - - # STEP 3: Removal of verb suffixes - if not step1_success and not step2_success: - for suffix in self.__step3_suffixes: - if word.endswith(suffix): - if suffix in rv: - if suffix in ( - "seser\u0103\u0163i", - "seser\u0103m", - "ser\u0103\u0163i", - "sese\u015Fi", - "seser\u0103", - "ser\u0103m", - "sesem", - "se\u015Fi", - "ser\u0103", - 
"sese", - "a\u0163i", - "e\u0163i", - "i\u0163i", - "\xE2\u0163i", - "sei", - "\u0103m", - "em", - "im", - "\xE2m", - "se", - ): - word = word[: -len(suffix)] - rv = rv[: -len(suffix)] - else: - if ( - not rv.startswith(suffix) - and rv[rv.index(suffix) - 1] not in "aeio\u0103\xE2\xEE" - ): - word = word[: -len(suffix)] - break - - # STEP 4: Removal of final vowel - for suffix in ("ie", "a", "e", "i", "\u0103"): - if word.endswith(suffix): - if suffix in rv: - word = word[: -len(suffix)] - break - - word = word.replace("I", "i").replace("U", "u") - - return word - - -class RussianStemmer(_LanguageSpecificStemmer): - - """ - The Russian Snowball stemmer. - - :cvar __perfective_gerund_suffixes: Suffixes to be deleted. - :type __perfective_gerund_suffixes: tuple - :cvar __adjectival_suffixes: Suffixes to be deleted. - :type __adjectival_suffixes: tuple - :cvar __reflexive_suffixes: Suffixes to be deleted. - :type __reflexive_suffixes: tuple - :cvar __verb_suffixes: Suffixes to be deleted. - :type __verb_suffixes: tuple - :cvar __noun_suffixes: Suffixes to be deleted. - :type __noun_suffixes: tuple - :cvar __superlative_suffixes: Suffixes to be deleted. - :type __superlative_suffixes: tuple - :cvar __derivational_suffixes: Suffixes to be deleted. - :type __derivational_suffixes: tuple - :note: A detailed description of the Russian - stemming algorithm can be found under - http://snowball.tartarus.org/algorithms/russian/stemmer.html - - """ - - __perfective_gerund_suffixes = ( - "ivshis'", - "yvshis'", - "vshis'", - "ivshi", - "yvshi", - "vshi", - "iv", - "yv", - "v", - ) - __adjectival_suffixes = ( - "ui^ushchi^ui^u", - "ui^ushchi^ai^a", - "ui^ushchimi", - "ui^ushchymi", - "ui^ushchego", - "ui^ushchogo", - "ui^ushchemu", - "ui^ushchomu", - "ui^ushchikh", - "ui^ushchykh", - "ui^ushchui^u", - "ui^ushchaia", - "ui^ushchoi^u", - "ui^ushchei^u", - "i^ushchi^ui^u", - "i^ushchi^ai^a", - "ui^ushchee", - "ui^ushchie", - "ui^ushchye", - "ui^ushchoe", - "ui^ushchei`", - "ui^ushchii`", - "ui^ushchyi`", - "ui^ushchoi`", - "ui^ushchem", - "ui^ushchim", - "ui^ushchym", - "ui^ushchom", - "i^ushchimi", - "i^ushchymi", - "i^ushchego", - "i^ushchogo", - "i^ushchemu", - "i^ushchomu", - "i^ushchikh", - "i^ushchykh", - "i^ushchui^u", - "i^ushchai^a", - "i^ushchoi^u", - "i^ushchei^u", - "i^ushchee", - "i^ushchie", - "i^ushchye", - "i^ushchoe", - "i^ushchei`", - "i^ushchii`", - "i^ushchyi`", - "i^ushchoi`", - "i^ushchem", - "i^ushchim", - "i^ushchym", - "i^ushchom", - "shchi^ui^u", - "shchi^ai^a", - "ivshi^ui^u", - "ivshi^ai^a", - "yvshi^ui^u", - "yvshi^ai^a", - "shchimi", - "shchymi", - "shchego", - "shchogo", - "shchemu", - "shchomu", - "shchikh", - "shchykh", - "shchui^u", - "shchai^a", - "shchoi^u", - "shchei^u", - "ivshimi", - "ivshymi", - "ivshego", - "ivshogo", - "ivshemu", - "ivshomu", - "ivshikh", - "ivshykh", - "ivshui^u", - "ivshai^a", - "ivshoi^u", - "ivshei^u", - "yvshimi", - "yvshymi", - "yvshego", - "yvshogo", - "yvshemu", - "yvshomu", - "yvshikh", - "yvshykh", - "yvshui^u", - "yvshai^a", - "yvshoi^u", - "yvshei^u", - "vshi^ui^u", - "vshi^ai^a", - "shchee", - "shchie", - "shchye", - "shchoe", - "shchei`", - "shchii`", - "shchyi`", - "shchoi`", - "shchem", - "shchim", - "shchym", - "shchom", - "ivshee", - "ivshie", - "ivshye", - "ivshoe", - "ivshei`", - "ivshii`", - "ivshyi`", - "ivshoi`", - "ivshem", - "ivshim", - "ivshym", - "ivshom", - "yvshee", - "yvshie", - "yvshye", - "yvshoe", - "yvshei`", - "yvshii`", - "yvshyi`", - "yvshoi`", - "yvshem", - "yvshim", - "yvshym", - "yvshom", - "vshimi", - 
"vshymi", - "vshego", - "vshogo", - "vshemu", - "vshomu", - "vshikh", - "vshykh", - "vshui^u", - "vshai^a", - "vshoi^u", - "vshei^u", - "emi^ui^u", - "emi^ai^a", - "nni^ui^u", - "nni^ai^a", - "vshee", - "vshie", - "vshye", - "vshoe", - "vshei`", - "vshii`", - "vshyi`", - "vshoi`", - "vshem", - "vshim", - "vshym", - "vshom", - "emimi", - "emymi", - "emego", - "emogo", - "ememu", - "emomu", - "emikh", - "emykh", - "emui^u", - "emai^a", - "emoi^u", - "emei^u", - "nnimi", - "nnymi", - "nnego", - "nnogo", - "nnemu", - "nnomu", - "nnikh", - "nnykh", - "nnui^u", - "nnai^a", - "nnoi^u", - "nnei^u", - "emee", - "emie", - "emye", - "emoe", - "emei`", - "emii`", - "emyi`", - "emoi`", - "emem", - "emim", - "emym", - "emom", - "nnee", - "nnie", - "nnye", - "nnoe", - "nnei`", - "nnii`", - "nnyi`", - "nnoi`", - "nnem", - "nnim", - "nnym", - "nnom", - "i^ui^u", - "i^ai^a", - "imi", - "ymi", - "ego", - "ogo", - "emu", - "omu", - "ikh", - "ykh", - "ui^u", - "ai^a", - "oi^u", - "ei^u", - "ee", - "ie", - "ye", - "oe", - "ei`", - "ii`", - "yi`", - "oi`", - "em", - "im", - "ym", - "om", - ) - __reflexive_suffixes = ("si^a", "s'") - __verb_suffixes = ( - "esh'", - "ei`te", - "ui`te", - "ui^ut", - "ish'", - "ete", - "i`te", - "i^ut", - "nno", - "ila", - "yla", - "ena", - "ite", - "ili", - "yli", - "ilo", - "ylo", - "eno", - "i^at", - "uet", - "eny", - "it'", - "yt'", - "ui^u", - "la", - "na", - "li", - "em", - "lo", - "no", - "et", - "ny", - "t'", - "ei`", - "ui`", - "il", - "yl", - "im", - "ym", - "en", - "it", - "yt", - "i^u", - "i`", - "l", - "n", - ) - __noun_suffixes = ( - "ii^ami", - "ii^akh", - "i^ami", - "ii^am", - "i^akh", - "ami", - "iei`", - "i^am", - "iem", - "akh", - "ii^u", - "'i^u", - "ii^a", - "'i^a", - "ev", - "ov", - "ie", - "'e", - "ei", - "ii", - "ei`", - "oi`", - "ii`", - "em", - "am", - "om", - "i^u", - "i^a", - "a", - "e", - "i", - "i`", - "o", - "u", - "y", - "'", - ) - __superlative_suffixes = ("ei`she", "ei`sh") - __derivational_suffixes = ("ost'", "ost") - - def stem(self, word): - """ - Stem a Russian word and return the stemmed form. - - :param word: The word that is stemmed. - :type word: str or unicode - :return: The stemmed form. 
- :rtype: unicode - - """ - if word in self.stopwords: - return word - - chr_exceeded = False - for i in range(len(word)): - if ord(word[i]) > 255: - chr_exceeded = True - break - - if not chr_exceeded: - return word - - word = self.__cyrillic_to_roman(word) - - step1_success = False - adjectival_removed = False - verb_removed = False - undouble_success = False - superlative_removed = False - - rv, r2 = self.__regions_russian(word) - - # Step 1 - for suffix in self.__perfective_gerund_suffixes: - if rv.endswith(suffix): - if suffix in ("v", "vshi", "vshis'"): - if ( - rv[-len(suffix) - 3 : -len(suffix)] == "i^a" - or rv[-len(suffix) - 1 : -len(suffix)] == "a" - ): - word = word[: -len(suffix)] - r2 = r2[: -len(suffix)] - rv = rv[: -len(suffix)] - step1_success = True - break - else: - word = word[: -len(suffix)] - r2 = r2[: -len(suffix)] - rv = rv[: -len(suffix)] - step1_success = True - break - - if not step1_success: - for suffix in self.__reflexive_suffixes: - if rv.endswith(suffix): - word = word[: -len(suffix)] - r2 = r2[: -len(suffix)] - rv = rv[: -len(suffix)] - break - - for suffix in self.__adjectival_suffixes: - if rv.endswith(suffix): - if suffix in ( - "i^ushchi^ui^u", - "i^ushchi^ai^a", - "i^ushchui^u", - "i^ushchai^a", - "i^ushchoi^u", - "i^ushchei^u", - "i^ushchimi", - "i^ushchymi", - "i^ushchego", - "i^ushchogo", - "i^ushchemu", - "i^ushchomu", - "i^ushchikh", - "i^ushchykh", - "shchi^ui^u", - "shchi^ai^a", - "i^ushchee", - "i^ushchie", - "i^ushchye", - "i^ushchoe", - "i^ushchei`", - "i^ushchii`", - "i^ushchyi`", - "i^ushchoi`", - "i^ushchem", - "i^ushchim", - "i^ushchym", - "i^ushchom", - "vshi^ui^u", - "vshi^ai^a", - "shchui^u", - "shchai^a", - "shchoi^u", - "shchei^u", - "emi^ui^u", - "emi^ai^a", - "nni^ui^u", - "nni^ai^a", - "shchimi", - "shchymi", - "shchego", - "shchogo", - "shchemu", - "shchomu", - "shchikh", - "shchykh", - "vshui^u", - "vshai^a", - "vshoi^u", - "vshei^u", - "shchee", - "shchie", - "shchye", - "shchoe", - "shchei`", - "shchii`", - "shchyi`", - "shchoi`", - "shchem", - "shchim", - "shchym", - "shchom", - "vshimi", - "vshymi", - "vshego", - "vshogo", - "vshemu", - "vshomu", - "vshikh", - "vshykh", - "emui^u", - "emai^a", - "emoi^u", - "emei^u", - "nnui^u", - "nnai^a", - "nnoi^u", - "nnei^u", - "vshee", - "vshie", - "vshye", - "vshoe", - "vshei`", - "vshii`", - "vshyi`", - "vshoi`", - "vshem", - "vshim", - "vshym", - "vshom", - "emimi", - "emymi", - "emego", - "emogo", - "ememu", - "emomu", - "emikh", - "emykh", - "nnimi", - "nnymi", - "nnego", - "nnogo", - "nnemu", - "nnomu", - "nnikh", - "nnykh", - "emee", - "emie", - "emye", - "emoe", - "emei`", - "emii`", - "emyi`", - "emoi`", - "emem", - "emim", - "emym", - "emom", - "nnee", - "nnie", - "nnye", - "nnoe", - "nnei`", - "nnii`", - "nnyi`", - "nnoi`", - "nnem", - "nnim", - "nnym", - "nnom", - ): - if ( - rv[-len(suffix) - 3 : -len(suffix)] == "i^a" - or rv[-len(suffix) - 1 : -len(suffix)] == "a" - ): - word = word[: -len(suffix)] - r2 = r2[: -len(suffix)] - rv = rv[: -len(suffix)] - adjectival_removed = True - break - else: - word = word[: -len(suffix)] - r2 = r2[: -len(suffix)] - rv = rv[: -len(suffix)] - adjectival_removed = True - break - - if not adjectival_removed: - for suffix in self.__verb_suffixes: - if rv.endswith(suffix): - if suffix in ( - "la", - "na", - "ete", - "i`te", - "li", - "i`", - "l", - "em", - "n", - "lo", - "no", - "et", - "i^ut", - "ny", - "t'", - "esh'", - "nno", - ): - if ( - rv[-len(suffix) - 3 : -len(suffix)] == "i^a" - or rv[-len(suffix) - 1 : -len(suffix)] == "a" - ): - 
word = word[: -len(suffix)] - r2 = r2[: -len(suffix)] - rv = rv[: -len(suffix)] - verb_removed = True - break - else: - word = word[: -len(suffix)] - r2 = r2[: -len(suffix)] - rv = rv[: -len(suffix)] - verb_removed = True - break - - if not adjectival_removed and not verb_removed: - for suffix in self.__noun_suffixes: - if rv.endswith(suffix): - word = word[: -len(suffix)] - r2 = r2[: -len(suffix)] - rv = rv[: -len(suffix)] - break - - # Step 2 - if rv.endswith("i"): - word = word[:-1] - r2 = r2[:-1] - - # Step 3 - for suffix in self.__derivational_suffixes: - if r2.endswith(suffix): - word = word[: -len(suffix)] - break - - # Step 4 - if word.endswith("nn"): - word = word[:-1] - undouble_success = True - - if not undouble_success: - for suffix in self.__superlative_suffixes: - if word.endswith(suffix): - word = word[: -len(suffix)] - superlative_removed = True - break - if word.endswith("nn"): - word = word[:-1] - - if not undouble_success and not superlative_removed: - if word.endswith("'"): - word = word[:-1] - - word = self.__roman_to_cyrillic(word) - - return word - - def __regions_russian(self, word): - """ - Return the regions RV and R2 which are used by the Russian stemmer. - - In any word, RV is the region after the first vowel, - or the end of the word if it contains no vowel. - - R2 is the region after the first non-vowel following - a vowel in R1, or the end of the word if there is no such non-vowel. - - R1 is the region after the first non-vowel following a vowel, - or the end of the word if there is no such non-vowel. - - :param word: The Russian word whose regions RV and R2 are determined. - :type word: str or unicode - :return: the regions RV and R2 for the respective Russian word. - :rtype: tuple - :note: This helper method is invoked by the stem method of the subclass - RussianStemmer. It is not to be invoked directly! - - """ - r1 = "" - r2 = "" - rv = "" - - vowels = ("A", "U", "E", "a", "e", "i", "o", "u", "y") - word = word.replace("i^a", "A").replace("i^u", "U").replace("e`", "E") - - for i in range(1, len(word)): - if word[i] not in vowels and word[i - 1] in vowels: - r1 = word[i + 1 :] - break - - for i in range(1, len(r1)): - if r1[i] not in vowels and r1[i - 1] in vowels: - r2 = r1[i + 1 :] - break - - for i in range(len(word)): - if word[i] in vowels: - rv = word[i + 1 :] - break - - r2 = r2.replace("A", "i^a").replace("U", "i^u").replace("E", "e`") - rv = rv.replace("A", "i^a").replace("U", "i^u").replace("E", "e`") - - return (rv, r2) - - def __cyrillic_to_roman(self, word): - """ - Transliterate a Russian word into the Roman alphabet. - - A Russian word whose letters consist of the Cyrillic - alphabet are transliterated into the Roman alphabet - in order to ease the forthcoming stemming process. - - :param word: The word that is transliterated. - :type word: unicode - :return: the transliterated word. - :rtype: unicode - :note: This helper method is invoked by the stem method of the subclass - RussianStemmer. It is not to be invoked directly! 
- - """ - word = ( - word.replace("\u0410", "a") - .replace("\u0430", "a") - .replace("\u0411", "b") - .replace("\u0431", "b") - .replace("\u0412", "v") - .replace("\u0432", "v") - .replace("\u0413", "g") - .replace("\u0433", "g") - .replace("\u0414", "d") - .replace("\u0434", "d") - .replace("\u0415", "e") - .replace("\u0435", "e") - .replace("\u0401", "e") - .replace("\u0451", "e") - .replace("\u0416", "zh") - .replace("\u0436", "zh") - .replace("\u0417", "z") - .replace("\u0437", "z") - .replace("\u0418", "i") - .replace("\u0438", "i") - .replace("\u0419", "i`") - .replace("\u0439", "i`") - .replace("\u041A", "k") - .replace("\u043A", "k") - .replace("\u041B", "l") - .replace("\u043B", "l") - .replace("\u041C", "m") - .replace("\u043C", "m") - .replace("\u041D", "n") - .replace("\u043D", "n") - .replace("\u041E", "o") - .replace("\u043E", "o") - .replace("\u041F", "p") - .replace("\u043F", "p") - .replace("\u0420", "r") - .replace("\u0440", "r") - .replace("\u0421", "s") - .replace("\u0441", "s") - .replace("\u0422", "t") - .replace("\u0442", "t") - .replace("\u0423", "u") - .replace("\u0443", "u") - .replace("\u0424", "f") - .replace("\u0444", "f") - .replace("\u0425", "kh") - .replace("\u0445", "kh") - .replace("\u0426", "t^s") - .replace("\u0446", "t^s") - .replace("\u0427", "ch") - .replace("\u0447", "ch") - .replace("\u0428", "sh") - .replace("\u0448", "sh") - .replace("\u0429", "shch") - .replace("\u0449", "shch") - .replace("\u042A", "''") - .replace("\u044A", "''") - .replace("\u042B", "y") - .replace("\u044B", "y") - .replace("\u042C", "'") - .replace("\u044C", "'") - .replace("\u042D", "e`") - .replace("\u044D", "e`") - .replace("\u042E", "i^u") - .replace("\u044E", "i^u") - .replace("\u042F", "i^a") - .replace("\u044F", "i^a") - ) - - return word - - def __roman_to_cyrillic(self, word): - """ - Transliterate a Russian word back into the Cyrillic alphabet. - - A Russian word formerly transliterated into the Roman alphabet - in order to ease the stemming process, is transliterated back - into the Cyrillic alphabet, its original form. - - :param word: The word that is transliterated. - :type word: str or unicode - :return: word, the transliterated word. - :rtype: unicode - :note: This helper method is invoked by the stem method of the subclass - RussianStemmer. It is not to be invoked directly! - - """ - word = ( - word.replace("i^u", "\u044E") - .replace("i^a", "\u044F") - .replace("shch", "\u0449") - .replace("kh", "\u0445") - .replace("t^s", "\u0446") - .replace("ch", "\u0447") - .replace("e`", "\u044D") - .replace("i`", "\u0439") - .replace("sh", "\u0448") - .replace("k", "\u043A") - .replace("e", "\u0435") - .replace("zh", "\u0436") - .replace("a", "\u0430") - .replace("b", "\u0431") - .replace("v", "\u0432") - .replace("g", "\u0433") - .replace("d", "\u0434") - .replace("e", "\u0435") - .replace("z", "\u0437") - .replace("i", "\u0438") - .replace("l", "\u043B") - .replace("m", "\u043C") - .replace("n", "\u043D") - .replace("o", "\u043E") - .replace("p", "\u043F") - .replace("r", "\u0440") - .replace("s", "\u0441") - .replace("t", "\u0442") - .replace("u", "\u0443") - .replace("f", "\u0444") - .replace("''", "\u044A") - .replace("y", "\u044B") - .replace("'", "\u044C") - ) - - return word - - -class SpanishStemmer(_StandardStemmer): - - """ - The Spanish Snowball stemmer. - - :cvar __vowels: The Spanish vowels. - :type __vowels: unicode - :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm. 
- :type __step0_suffixes: tuple - :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. - :type __step1_suffixes: tuple - :cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm. - :type __step2a_suffixes: tuple - :cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm. - :type __step2b_suffixes: tuple - :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. - :type __step3_suffixes: tuple - :note: A detailed description of the Spanish - stemming algorithm can be found under - http://snowball.tartarus.org/algorithms/spanish/stemmer.html - - """ - - __vowels = "aeiou\xE1\xE9\xED\xF3\xFA\xFC" - __step0_suffixes = ( - "selas", - "selos", - "sela", - "selo", - "las", - "les", - "los", - "nos", - "me", - "se", - "la", - "le", - "lo", - ) - __step1_suffixes = ( - "amientos", - "imientos", - "amiento", - "imiento", - "acion", - "aciones", - "uciones", - "adoras", - "adores", - "ancias", - "log\xEDas", - "encias", - "amente", - "idades", - "anzas", - "ismos", - "ables", - "ibles", - "istas", - "adora", - "aci\xF3n", - "antes", - "ancia", - "log\xEDa", - "uci\xf3n", - "encia", - "mente", - "anza", - "icos", - "icas", - "ismo", - "able", - "ible", - "ista", - "osos", - "osas", - "ador", - "ante", - "idad", - "ivas", - "ivos", - "ico", - "ica", - "oso", - "osa", - "iva", - "ivo", - ) - __step2a_suffixes = ( - "yeron", - "yendo", - "yamos", - "yais", - "yan", - "yen", - "yas", - "yes", - "ya", - "ye", - "yo", - "y\xF3", - ) - __step2b_suffixes = ( - "ar\xEDamos", - "er\xEDamos", - "ir\xEDamos", - "i\xE9ramos", - "i\xE9semos", - "ar\xEDais", - "aremos", - "er\xEDais", - "eremos", - "ir\xEDais", - "iremos", - "ierais", - "ieseis", - "asteis", - "isteis", - "\xE1bamos", - "\xE1ramos", - "\xE1semos", - "ar\xEDan", - "ar\xEDas", - "ar\xE9is", - "er\xEDan", - "er\xEDas", - "er\xE9is", - "ir\xEDan", - "ir\xEDas", - "ir\xE9is", - "ieran", - "iesen", - "ieron", - "iendo", - "ieras", - "ieses", - "abais", - "arais", - "aseis", - "\xE9amos", - "ar\xE1n", - "ar\xE1s", - "ar\xEDa", - "er\xE1n", - "er\xE1s", - "er\xEDa", - "ir\xE1n", - "ir\xE1s", - "ir\xEDa", - "iera", - "iese", - "aste", - "iste", - "aban", - "aran", - "asen", - "aron", - "ando", - "abas", - "adas", - "idas", - "aras", - "ases", - "\xEDais", - "ados", - "idos", - "amos", - "imos", - "emos", - "ar\xE1", - "ar\xE9", - "er\xE1", - "er\xE9", - "ir\xE1", - "ir\xE9", - "aba", - "ada", - "ida", - "ara", - "ase", - "\xEDan", - "ado", - "ido", - "\xEDas", - "\xE1is", - "\xE9is", - "\xEDa", - "ad", - "ed", - "id", - "an", - "i\xF3", - "ar", - "er", - "ir", - "as", - "\xEDs", - "en", - "es", - ) - __step3_suffixes = ("os", "a", "e", "o", "\xE1", "\xE9", "\xED", "\xF3") - - def stem(self, word): - """ - Stem a Spanish word and return the stemmed form. - - :param word: The word that is stemmed. - :type word: str or unicode - :return: The stemmed form. 
- :rtype: unicode - - """ - word = word.lower() - - if word in self.stopwords: - return word - - step1_success = False - - r1, r2 = self._r1r2_standard(word, self.__vowels) - rv = self._rv_standard(word, self.__vowels) - - # STEP 0: Attached pronoun - for suffix in self.__step0_suffixes: - if not (word.endswith(suffix) and rv.endswith(suffix)): - continue - - if ( - rv[: -len(suffix)].endswith( - ( - "ando", - "\xE1ndo", - "ar", - "\xE1r", - "er", - "\xE9r", - "iendo", - "i\xE9ndo", - "ir", - "\xEDr", - ) - ) - ) or ( - rv[: -len(suffix)].endswith("yendo") - and word[: -len(suffix)].endswith("uyendo") - ): - - word = self.__replace_accented(word[: -len(suffix)]) - r1 = self.__replace_accented(r1[: -len(suffix)]) - r2 = self.__replace_accented(r2[: -len(suffix)]) - rv = self.__replace_accented(rv[: -len(suffix)]) - break - - # STEP 1: Standard suffix removal - for suffix in self.__step1_suffixes: - if not word.endswith(suffix): - continue - - if suffix == "amente" and r1.endswith(suffix): - step1_success = True - word = word[:-6] - r2 = r2[:-6] - rv = rv[:-6] - - if r2.endswith("iv"): - word = word[:-2] - r2 = r2[:-2] - rv = rv[:-2] - - if r2.endswith("at"): - word = word[:-2] - rv = rv[:-2] - - elif r2.endswith(("os", "ic", "ad")): - word = word[:-2] - rv = rv[:-2] - - elif r2.endswith(suffix): - step1_success = True - if suffix in ( - "adora", - "ador", - "aci\xF3n", - "adoras", - "adores", - "acion", - "aciones", - "ante", - "antes", - "ancia", - "ancias", - ): - word = word[: -len(suffix)] - r2 = r2[: -len(suffix)] - rv = rv[: -len(suffix)] - - if r2.endswith("ic"): - word = word[:-2] - rv = rv[:-2] - - elif suffix in ("log\xEDa", "log\xEDas"): - word = suffix_replace(word, suffix, "log") - rv = suffix_replace(rv, suffix, "log") - - elif suffix in ("uci\xF3n", "uciones"): - word = suffix_replace(word, suffix, "u") - rv = suffix_replace(rv, suffix, "u") - - elif suffix in ("encia", "encias"): - word = suffix_replace(word, suffix, "ente") - rv = suffix_replace(rv, suffix, "ente") - - elif suffix == "mente": - word = word[: -len(suffix)] - r2 = r2[: -len(suffix)] - rv = rv[: -len(suffix)] - - if r2.endswith(("ante", "able", "ible")): - word = word[:-4] - rv = rv[:-4] - - elif suffix in ("idad", "idades"): - word = word[: -len(suffix)] - r2 = r2[: -len(suffix)] - rv = rv[: -len(suffix)] - - for pre_suff in ("abil", "ic", "iv"): - if r2.endswith(pre_suff): - word = word[: -len(pre_suff)] - rv = rv[: -len(pre_suff)] - - elif suffix in ("ivo", "iva", "ivos", "ivas"): - word = word[: -len(suffix)] - r2 = r2[: -len(suffix)] - rv = rv[: -len(suffix)] - if r2.endswith("at"): - word = word[:-2] - rv = rv[:-2] - else: - word = word[: -len(suffix)] - rv = rv[: -len(suffix)] - break - - # STEP 2a: Verb suffixes beginning 'y' - if not step1_success: - for suffix in self.__step2a_suffixes: - if rv.endswith(suffix) and word[-len(suffix) - 1 : -len(suffix)] == "u": - word = word[: -len(suffix)] - rv = rv[: -len(suffix)] - break - - # STEP 2b: Other verb suffixes - for suffix in self.__step2b_suffixes: - if rv.endswith(suffix): - word = word[: -len(suffix)] - rv = rv[: -len(suffix)] - if suffix in ("en", "es", "\xE9is", "emos"): - if word.endswith("gu"): - word = word[:-1] - - if rv.endswith("gu"): - rv = rv[:-1] - break - - # STEP 3: Residual suffix - for suffix in self.__step3_suffixes: - if rv.endswith(suffix): - word = word[: -len(suffix)] - if suffix in ("e", "\xE9"): - rv = rv[: -len(suffix)] - - if word[-2:] == "gu" and rv.endswith("u"): - word = word[:-1] - break - - word = 
self.__replace_accented(word) - - return word - - def __replace_accented(self, word): - """ - Replaces all accented letters on a word with their non-accented - counterparts. - - :param word: A spanish word, with or without accents - :type word: str or unicode - :return: a word with the accented letters (á, é, í, ó, ú) replaced with - their non-accented counterparts (a, e, i, o, u) - :rtype: str or unicode - """ - return ( - word.replace("\xE1", "a") - .replace("\xE9", "e") - .replace("\xED", "i") - .replace("\xF3", "o") - .replace("\xFA", "u") - ) - - -class SwedishStemmer(_ScandinavianStemmer): - - """ - The Swedish Snowball stemmer. - - :cvar __vowels: The Swedish vowels. - :type __vowels: unicode - :cvar __s_ending: Letters that may directly appear before a word final 's'. - :type __s_ending: unicode - :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. - :type __step1_suffixes: tuple - :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. - :type __step2_suffixes: tuple - :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. - :type __step3_suffixes: tuple - :note: A detailed description of the Swedish - stemming algorithm can be found under - http://snowball.tartarus.org/algorithms/swedish/stemmer.html - - """ - - __vowels = "aeiouy\xE4\xE5\xF6" - __s_ending = "bcdfghjklmnoprtvy" - __step1_suffixes = ( - "heterna", - "hetens", - "heter", - "heten", - "anden", - "arnas", - "ernas", - "ornas", - "andes", - "andet", - "arens", - "arna", - "erna", - "orna", - "ande", - "arne", - "aste", - "aren", - "ades", - "erns", - "ade", - "are", - "ern", - "ens", - "het", - "ast", - "ad", - "en", - "ar", - "er", - "or", - "as", - "es", - "at", - "a", - "e", - "s", - ) - __step2_suffixes = ("dd", "gd", "nn", "dt", "gt", "kt", "tt") - __step3_suffixes = ("fullt", "l\xF6st", "els", "lig", "ig") - - def stem(self, word): - """ - Stem a Swedish word and return the stemmed form. - - :param word: The word that is stemmed. - :type word: str or unicode - :return: The stemmed form. - :rtype: unicode - - """ - word = word.lower() - - if word in self.stopwords: - return word - - r1 = self._r1_scandinavian(word, self.__vowels) - - # STEP 1 - for suffix in self.__step1_suffixes: - if r1.endswith(suffix): - if suffix == "s": - if word[-2] in self.__s_ending: - word = word[:-1] - r1 = r1[:-1] - else: - word = word[: -len(suffix)] - r1 = r1[: -len(suffix)] - break - - # STEP 2 - for suffix in self.__step2_suffixes: - if r1.endswith(suffix): - word = word[:-1] - r1 = r1[:-1] - break - - # STEP 3 - for suffix in self.__step3_suffixes: - if r1.endswith(suffix): - if suffix in ("els", "lig", "ig"): - word = word[: -len(suffix)] - elif suffix in ("fullt", "l\xF6st"): - word = word[:-1] - break - - return word - - -def demo(): - """ - This function provides a demonstration of the Snowball stemmers. - - After invoking this function and specifying a language, - it stems an excerpt of the Universal Declaration of Human Rights - (which is a part of the NLTK corpus collection) and then prints - out the original and the stemmed text. 
- - """ - - from nltk.corpus import udhr - - udhr_corpus = { - "arabic": "Arabic_Alarabia-Arabic", - "danish": "Danish_Dansk-Latin1", - "dutch": "Dutch_Nederlands-Latin1", - "english": "English-Latin1", - "finnish": "Finnish_Suomi-Latin1", - "french": "French_Francais-Latin1", - "german": "German_Deutsch-Latin1", - "hungarian": "Hungarian_Magyar-UTF8", - "italian": "Italian_Italiano-Latin1", - "norwegian": "Norwegian-Latin1", - "porter": "English-Latin1", - "portuguese": "Portuguese_Portugues-Latin1", - "romanian": "Romanian_Romana-Latin2", - "russian": "Russian-UTF8", - "spanish": "Spanish-Latin1", - "swedish": "Swedish_Svenska-Latin1", - } - - print("\n") - print("******************************") - print("Demo for the Snowball stemmers") - print("******************************") - - while True: - - language = input( - "Please enter the name of the language " - + "to be demonstrated\n" - + "/".join(SnowballStemmer.languages) - + "\n" - + "(enter 'exit' in order to leave): " - ) - - if language == "exit": - break - - if language not in SnowballStemmer.languages: - print( - "\nOops, there is no stemmer for this language. " - + "Please try again.\n" - ) - continue - - stemmer = SnowballStemmer(language) - excerpt = udhr.words(udhr_corpus[language])[:300] - - stemmed = " ".join(stemmer.stem(word) for word in excerpt) - stemmed = re.sub(r"(.{,70})\s", r"\1\n", stemmed + " ").rstrip() - excerpt = " ".join(excerpt) - excerpt = re.sub(r"(.{,70})\s", r"\1\n", excerpt + " ").rstrip() - - print("\n") - print("-" * 70) - print("ORIGINAL".center(70)) - print(excerpt) - print("\n\n") - print("STEMMED RESULTS".center(70)) - print(stemmed) - print("-" * 70) - print("\n") diff --git a/pipeline/nltk/stem/util.py b/pipeline/nltk/stem/util.py deleted file mode 100644 index 71d3a661e582e5dd60ef0a2bdcb2674ef29d472c..0000000000000000000000000000000000000000 --- a/pipeline/nltk/stem/util.py +++ /dev/null @@ -1,25 +0,0 @@ -# Natural Language Toolkit: Stemmer Utilities -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Helder -# URL: -# For license information, see LICENSE.TXT - - -def suffix_replace(original, old, new): - """ - Replaces the old suffix of the original string by a new suffix - """ - return original[: -len(old)] + new - - -def prefix_replace(original, old, new): - """ - Replaces the old prefix of the original string by a new suffix - - :param original: string - :param old: string - :param new: string - :return: string - """ - return new + original[len(old) :] diff --git a/pipeline/nltk/stem/wordnet.py b/pipeline/nltk/stem/wordnet.py deleted file mode 100644 index 0ccb056f7e2172f46d03236d76942c3080d2f107..0000000000000000000000000000000000000000 --- a/pipeline/nltk/stem/wordnet.py +++ /dev/null @@ -1,49 +0,0 @@ -# Natural Language Toolkit: WordNet stemmer interface -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# Edward Loper -# URL: -# For license information, see LICENSE.TXT - -from nltk.corpus import wordnet as wn - - -class WordNetLemmatizer: - """ - WordNet Lemmatizer - - Lemmatize using WordNet's built-in morphy function. - Returns the input word unchanged if it cannot be found in WordNet. 
- - >>> from nltk.stem import WordNetLemmatizer - >>> wnl = WordNetLemmatizer() - >>> print(wnl.lemmatize('dogs')) - dog - >>> print(wnl.lemmatize('churches')) - church - >>> print(wnl.lemmatize('aardwolves')) - aardwolf - >>> print(wnl.lemmatize('abaci')) - abacus - >>> print(wnl.lemmatize('hardrock')) - hardrock - """ - - def lemmatize(self, word: str, pos: str = "n") -> str: - """Lemmatize `word` using WordNet's built-in morphy function. - Returns the input word unchanged if it cannot be found in WordNet. - - :param word: The input word to lemmatize. - :type word: str - :param pos: The Part Of Speech tag. Valid options are `"n"` for nouns, - `"v"` for verbs, `"a"` for adjectives, `"r"` for adverbs and `"s"` - for satellite adjectives. - :param pos: str - :return: The lemma of `word`, for the given `pos`. - """ - lemmas = wn._morphy(word, pos) - return min(lemmas, key=len) if lemmas else word - - def __repr__(self): - return "" diff --git a/pipeline/nltk/tag/__init__.py b/pipeline/nltk/tag/__init__.py deleted file mode 100644 index 3f537dd6c7a9badc43313a8d2b4c5efed9b1b6ce..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tag/__init__.py +++ /dev/null @@ -1,184 +0,0 @@ -# Natural Language Toolkit: Taggers -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird (minor additions) -# URL: -# For license information, see LICENSE.TXT -""" -NLTK Taggers - -This package contains classes and interfaces for part-of-speech -tagging, or simply "tagging". - -A "tag" is a case-sensitive string that specifies some property of a token, -such as its part of speech. Tagged tokens are encoded as tuples -``(tag, token)``. For example, the following tagged token combines -the word ``'fly'`` with a noun part of speech tag (``'NN'``): - - >>> tagged_tok = ('fly', 'NN') - -An off-the-shelf tagger is available for English. It uses the Penn Treebank tagset: - - >>> from nltk import pos_tag, word_tokenize - >>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) # doctest: +NORMALIZE_WHITESPACE - [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'), - ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')] - -A Russian tagger is also available if you specify lang="rus". It uses -the Russian National Corpus tagset: - - >>> pos_tag(word_tokenize("Илья оторопел и дважды перечитал бумажку."), lang='rus') # doctest: +SKIP - [('Илья', 'S'), ('оторопел', 'V'), ('и', 'CONJ'), ('дважды', 'ADV'), ('перечитал', 'V'), - ('бумажку', 'S'), ('.', 'NONLEX')] - -This package defines several taggers, which take a list of tokens, -assign a tag to each one, and return the resulting list of tagged tokens. -Most of the taggers are built automatically based on a training corpus. -For example, the unigram tagger tags each word *w* by checking what -the most frequent tag for *w* was in a training corpus: - - >>> from nltk.corpus import brown - >>> from nltk.tag import UnigramTagger - >>> tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500]) - >>> sent = ['Mitchell', 'decried', 'the', 'high', 'rate', 'of', 'unemployment'] - >>> for word, tag in tagger.tag(sent): - ... print(word, '->', tag) - Mitchell -> NP - decried -> None - the -> AT - high -> JJ - rate -> NN - of -> IN - unemployment -> None - -Note that words that the tagger has not seen during training receive a tag -of ``None``. 
- -We evaluate a tagger on data that was not seen during training: - - >>> round(tagger.accuracy(brown.tagged_sents(categories='news')[500:600]), 3) - 0.735 - -For more information, please consult chapter 5 of the NLTK Book. - -isort:skip_file -""" - -from nltk.tag.api import TaggerI -from nltk.tag.util import str2tuple, tuple2str, untag -from nltk.tag.sequential import ( - SequentialBackoffTagger, - ContextTagger, - DefaultTagger, - NgramTagger, - UnigramTagger, - BigramTagger, - TrigramTagger, - AffixTagger, - RegexpTagger, - ClassifierBasedTagger, - ClassifierBasedPOSTagger, -) -from nltk.tag.brill import BrillTagger -from nltk.tag.brill_trainer import BrillTaggerTrainer -from nltk.tag.tnt import TnT -from nltk.tag.hunpos import HunposTagger -from nltk.tag.stanford import StanfordTagger, StanfordPOSTagger, StanfordNERTagger -from nltk.tag.hmm import HiddenMarkovModelTagger, HiddenMarkovModelTrainer -from nltk.tag.senna import SennaTagger, SennaChunkTagger, SennaNERTagger -from nltk.tag.mapping import tagset_mapping, map_tag -from nltk.tag.crf import CRFTagger -from nltk.tag.perceptron import PerceptronTagger - -from nltk.data import load, find - -RUS_PICKLE = ( - "taggers/averaged_perceptron_tagger_ru/averaged_perceptron_tagger_ru.pickle" -) - - -def _get_tagger(lang=None): - if lang == "rus": - tagger = PerceptronTagger(False) - ap_russian_model_loc = "file:" + str(find(RUS_PICKLE)) - tagger.load(ap_russian_model_loc) - else: - tagger = PerceptronTagger() - return tagger - - -def _pos_tag(tokens, tagset=None, tagger=None, lang=None): - # Currently only supports English and Russian. - if lang not in ["eng", "rus"]: - raise NotImplementedError( - "Currently, NLTK pos_tag only supports English and Russian " - "(i.e. lang='eng' or lang='rus')" - ) - # Throws Error if tokens is of string type - elif isinstance(tokens, str): - raise TypeError("tokens: expected a list of strings, got a string") - - else: - tagged_tokens = tagger.tag(tokens) - if tagset: # Maps to the specified tagset. - if lang == "eng": - tagged_tokens = [ - (token, map_tag("en-ptb", tagset, tag)) - for (token, tag) in tagged_tokens - ] - elif lang == "rus": - # Note that the new Russian pos tags from the model contains suffixes, - # see https://github.com/nltk/nltk/issues/2151#issuecomment-430709018 - tagged_tokens = [ - (token, map_tag("ru-rnc-new", tagset, tag.partition("=")[0])) - for (token, tag) in tagged_tokens - ] - return tagged_tokens - - -def pos_tag(tokens, tagset=None, lang="eng"): - """ - Use NLTK's currently recommended part of speech tagger to - tag the given list of tokens. - - >>> from nltk.tag import pos_tag - >>> from nltk.tokenize import word_tokenize - >>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) # doctest: +NORMALIZE_WHITESPACE - [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'), - ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')] - >>> pos_tag(word_tokenize("John's big idea isn't all that bad."), tagset='universal') # doctest: +NORMALIZE_WHITESPACE - [('John', 'NOUN'), ("'s", 'PRT'), ('big', 'ADJ'), ('idea', 'NOUN'), ('is', 'VERB'), - ("n't", 'ADV'), ('all', 'DET'), ('that', 'DET'), ('bad', 'ADJ'), ('.', '.')] - - NB. Use `pos_tag_sents()` for efficient tagging of more than one sentence. - - :param tokens: Sequence of tokens to be tagged - :type tokens: list(str) - :param tagset: the tagset to be used, e.g. universal, wsj, brown - :type tagset: str - :param lang: the ISO 639 code of the language, e.g. 
'eng' for English, 'rus' for Russian - :type lang: str - :return: The tagged tokens - :rtype: list(tuple(str, str)) - """ - tagger = _get_tagger(lang) - return _pos_tag(tokens, tagset, tagger, lang) - - -def pos_tag_sents(sentences, tagset=None, lang="eng"): - """ - Use NLTK's currently recommended part of speech tagger to tag the - given list of sentences, each consisting of a list of tokens. - - :param sentences: List of sentences to be tagged - :type sentences: list(list(str)) - :param tagset: the tagset to be used, e.g. universal, wsj, brown - :type tagset: str - :param lang: the ISO 639 code of the language, e.g. 'eng' for English, 'rus' for Russian - :type lang: str - :return: The list of tagged sentences - :rtype: list(list(tuple(str, str))) - """ - tagger = _get_tagger(lang) - return [_pos_tag(sent, tagset, tagger, lang) for sent in sentences] diff --git a/pipeline/nltk/tag/__pycache__/__init__.cpython-39.pyc b/pipeline/nltk/tag/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index 7f90153c5e5a555953a027bec8eaff24f725add9..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tag/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tag/__pycache__/api.cpython-39.pyc b/pipeline/nltk/tag/__pycache__/api.cpython-39.pyc deleted file mode 100644 index b13242e1292d3ed0d9a4a8eb61f35a53f7c4484b..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tag/__pycache__/api.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tag/__pycache__/brill.cpython-39.pyc b/pipeline/nltk/tag/__pycache__/brill.cpython-39.pyc deleted file mode 100644 index ae9cbbcc0eb3bea396dde0123ca6fc202d2af37e..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tag/__pycache__/brill.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tag/__pycache__/brill_trainer.cpython-39.pyc b/pipeline/nltk/tag/__pycache__/brill_trainer.cpython-39.pyc deleted file mode 100644 index 995ed3ffc5892babeed6beffb35838cc34d2c7bc..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tag/__pycache__/brill_trainer.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tag/__pycache__/crf.cpython-39.pyc b/pipeline/nltk/tag/__pycache__/crf.cpython-39.pyc deleted file mode 100644 index d9500956dddbe3ea8af08d01a41e986aeaadc2c0..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tag/__pycache__/crf.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tag/__pycache__/hmm.cpython-39.pyc b/pipeline/nltk/tag/__pycache__/hmm.cpython-39.pyc deleted file mode 100644 index 912ff04e59eaa4905c3037d3f5b96afcf73af36d..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tag/__pycache__/hmm.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tag/__pycache__/hunpos.cpython-39.pyc b/pipeline/nltk/tag/__pycache__/hunpos.cpython-39.pyc deleted file mode 100644 index d0e87f55669e541244fc1f09483eb67464778911..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tag/__pycache__/hunpos.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tag/__pycache__/mapping.cpython-39.pyc b/pipeline/nltk/tag/__pycache__/mapping.cpython-39.pyc deleted file mode 100644 index 5b84fac6cfef52d908dc47e2f3ac41f58daefe1c..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tag/__pycache__/mapping.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tag/__pycache__/perceptron.cpython-39.pyc 
b/pipeline/nltk/tag/__pycache__/perceptron.cpython-39.pyc deleted file mode 100644 index 1ea753a5374449a0981c1030bab1238e78c93e84..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tag/__pycache__/perceptron.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tag/__pycache__/senna.cpython-39.pyc b/pipeline/nltk/tag/__pycache__/senna.cpython-39.pyc deleted file mode 100644 index 1d5c1945afb23606affdb0339c24b9b96a8e54c0..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tag/__pycache__/senna.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tag/__pycache__/sequential.cpython-39.pyc b/pipeline/nltk/tag/__pycache__/sequential.cpython-39.pyc deleted file mode 100644 index 7989aede8bce519b305731f3b6dcc99e8d055ca2..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tag/__pycache__/sequential.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tag/__pycache__/stanford.cpython-39.pyc b/pipeline/nltk/tag/__pycache__/stanford.cpython-39.pyc deleted file mode 100644 index b53c2548548475a26ed74258275c88f750e01528..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tag/__pycache__/stanford.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tag/__pycache__/tnt.cpython-39.pyc b/pipeline/nltk/tag/__pycache__/tnt.cpython-39.pyc deleted file mode 100644 index 57b49c96f91c4bc269061c19bb0bdcf460a35030..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tag/__pycache__/tnt.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tag/__pycache__/util.cpython-39.pyc b/pipeline/nltk/tag/__pycache__/util.cpython-39.pyc deleted file mode 100644 index 7419cab399b2e8f38ede30e5c983f2b410de180b..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tag/__pycache__/util.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tag/api.py b/pipeline/nltk/tag/api.py deleted file mode 100644 index 27e45026cabe6d747f4b4a7dc108b7c3cec1c6f9..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tag/api.py +++ /dev/null @@ -1,296 +0,0 @@ -# Natural Language Toolkit: Tagger Interface -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird (minor additions) -# Tom Aarsen <> -# URL: -# For license information, see LICENSE.TXT - -""" -Interface for tagging each token in a sentence with supplementary -information, such as its part of speech. -""" -from abc import ABCMeta, abstractmethod -from functools import lru_cache -from itertools import chain -from typing import Dict - -from nltk.internals import deprecated, overridden -from nltk.metrics import ConfusionMatrix, accuracy -from nltk.tag.util import untag - - -class TaggerI(metaclass=ABCMeta): - """ - A processing interface for assigning a tag to each token in a list. - Tags are case sensitive strings that identify some property of each - token, such as its part of speech or its sense. - - Some taggers require specific types for their tokens. This is - generally indicated by the use of a sub-interface to ``TaggerI``. - For example, featureset taggers, which are subclassed from - ``FeaturesetTagger``, require that each token be a ``featureset``. - - Subclasses must define: - - either ``tag()`` or ``tag_sents()`` (or both) - """ - - @abstractmethod - def tag(self, tokens): - """ - Determine the most appropriate tag sequence for the given - token sequence, and return a corresponding list of tagged - tokens. A tagged token is encoded as a tuple ``(token, tag)``. 
- - :rtype: list(tuple(str, str)) - """ - if overridden(self.tag_sents): - return self.tag_sents([tokens])[0] - - def tag_sents(self, sentences): - """ - Apply ``self.tag()`` to each element of *sentences*. I.e.:: - - return [self.tag(sent) for sent in sentences] - """ - return [self.tag(sent) for sent in sentences] - - @deprecated("Use accuracy(gold) instead.") - def evaluate(self, gold): - return self.accuracy(gold) - - def accuracy(self, gold): - """ - Score the accuracy of the tagger against the gold standard. - Strip the tags from the gold standard text, retag it using - the tagger, then compute the accuracy score. - - :param gold: The list of tagged sentences to score the tagger on. - :type gold: list(list(tuple(str, str))) - :rtype: float - """ - - tagged_sents = self.tag_sents(untag(sent) for sent in gold) - gold_tokens = list(chain.from_iterable(gold)) - test_tokens = list(chain.from_iterable(tagged_sents)) - return accuracy(gold_tokens, test_tokens) - - @lru_cache(maxsize=1) - def _confusion_cached(self, gold): - """ - Inner function used after ``gold`` is converted to a - ``tuple(tuple(tuple(str, str)))``. That way, we can use caching on - creating a ConfusionMatrix. - - :param gold: The list of tagged sentences to run the tagger with, - also used as the reference values in the generated confusion matrix. - :type gold: tuple(tuple(tuple(str, str))) - :rtype: ConfusionMatrix - """ - - tagged_sents = self.tag_sents(untag(sent) for sent in gold) - gold_tokens = [token for _word, token in chain.from_iterable(gold)] - test_tokens = [token for _word, token in chain.from_iterable(tagged_sents)] - return ConfusionMatrix(gold_tokens, test_tokens) - - def confusion(self, gold): - """ - Return a ConfusionMatrix with the tags from ``gold`` as the reference - values, with the predictions from ``tag_sents`` as the predicted values. - - >>> from nltk.tag import PerceptronTagger - >>> from nltk.corpus import treebank - >>> tagger = PerceptronTagger() - >>> gold_data = treebank.tagged_sents()[:10] - >>> print(tagger.confusion(gold_data)) - | - | - | N | - | O P | - | N J J N N P P R R V V V V V W | - | ' E C C D E I J J J M N N N O R P R B R T V B B B B B D ` | - | ' , - . C D T X N J R S D N P S S P $ B R P O B D G N P Z T ` | - -------+----------------------------------------------------------------------------------------------+ - '' | <1> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | - , | .<15> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | - -NONE- | . . <.> . . 2 . . . 2 . . . 5 1 . . . . 2 . . . . . . . . . . . | - . | . . .<10> . . . . . . . . . . . . . . . . . . . . . . . . . . . | - CC | . . . . <1> . . . . . . . . . . . . . . . . . . . . . . . . . . | - CD | . . . . . <5> . . . . . . . . . . . . . . . . . . . . . . . . . | - DT | . . . . . .<20> . . . . . . . . . . . . . . . . . . . . . . . . | - EX | . . . . . . . <1> . . . . . . . . . . . . . . . . . . . . . . . | - IN | . . . . . . . .<22> . . . . . . . . . . 3 . . . . . . . . . . . | - JJ | . . . . . . . . .<16> . . . . 1 . . . . 1 . . . . . . . . . . . | - JJR | . . . . . . . . . . <.> . . . . . . . . . . . . . . . . . . . . | - JJS | . . . . . . . . . . . <1> . . . . . . . . . . . . . . . . . . . | - MD | . . . . . . . . . . . . <1> . . . . . . . . . . . . . . . . . . | - NN | . . . . . . . . . . . . .<28> 1 1 . . . . . . . . . . . . . . . | - NNP | . . . . . . . . . . . . . .<25> . . . . . . . . . . . . . . . . | - NNS | . . . . . . . . . . . . . . .<19> . . . . . . . . . . . . . . . | - POS | . . 
. . . . . . . . . . . . . . <1> . . . . . . . . . . . . . . | - PRP | . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . . . . | - PRP$ | . . . . . . . . . . . . . . . . . . <2> . . . . . . . . . . . . | - RB | . . . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . . | - RBR | . . . . . . . . . . 1 . . . . . . . . . <1> . . . . . . . . . . | - RP | . . . . . . . . . . . . . . . . . . . . . <1> . . . . . . . . . | - TO | . . . . . . . . . . . . . . . . . . . . . . <5> . . . . . . . . | - VB | . . . . . . . . . . . . . . . . . . . . . . . <3> . . . . . . . | - VBD | . . . . . . . . . . . . . 1 . . . . . . . . . . <6> . . . . . . | - VBG | . . . . . . . . . . . . . 1 . . . . . . . . . . . <4> . . . . . | - VBN | . . . . . . . . . . . . . . . . . . . . . . . . 1 . <4> . . . . | - VBP | . . . . . . . . . . . . . . . . . . . . . . . . . . . <3> . . . | - VBZ | . . . . . . . . . . . . . . . . . . . . . . . . . . . . <7> . . | - WDT | . . . . . . . . 2 . . . . . . . . . . . . . . . . . . . . <.> . | - `` | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <1>| - -------+----------------------------------------------------------------------------------------------+ - (row = reference; col = test) - - - :param gold: The list of tagged sentences to run the tagger with, - also used as the reference values in the generated confusion matrix. - :type gold: list(list(tuple(str, str))) - :rtype: ConfusionMatrix - """ - - return self._confusion_cached(tuple(tuple(sent) for sent in gold)) - - def recall(self, gold) -> Dict[str, float]: - """ - Compute the recall for each tag from ``gold`` or from running ``tag`` - on the tokenized sentences from ``gold``. Then, return the dictionary - with mappings from tag to recall. The recall is defined as: - - - *r* = true positive / (true positive + false positive) - - :param gold: The list of tagged sentences to score the tagger on. - :type gold: list(list(tuple(str, str))) - :return: A mapping from tags to recall - :rtype: Dict[str, float] - """ - - cm = self.confusion(gold) - return {tag: cm.recall(tag) for tag in cm._values} - - def precision(self, gold): - """ - Compute the precision for each tag from ``gold`` or from running ``tag`` - on the tokenized sentences from ``gold``. Then, return the dictionary - with mappings from tag to precision. The precision is defined as: - - - *p* = true positive / (true positive + false negative) - - :param gold: The list of tagged sentences to score the tagger on. - :type gold: list(list(tuple(str, str))) - :return: A mapping from tags to precision - :rtype: Dict[str, float] - """ - - cm = self.confusion(gold) - return {tag: cm.precision(tag) for tag in cm._values} - - def f_measure(self, gold, alpha=0.5): - """ - Compute the f-measure for each tag from ``gold`` or from running ``tag`` - on the tokenized sentences from ``gold``. Then, return the dictionary - with mappings from tag to f-measure. The f-measure is the harmonic mean - of the ``precision`` and ``recall``, weighted by ``alpha``. - In particular, given the precision *p* and recall *r* defined by: - - - *p* = true positive / (true positive + false negative) - - *r* = true positive / (true positive + false positive) - - The f-measure is: - - - *1/(alpha/p + (1-alpha)/r)* - - With ``alpha = 0.5``, this reduces to: - - - *2pr / (p + r)* - - :param gold: The list of tagged sentences to score the tagger on. - :type gold: list(list(tuple(str, str))) - :param alpha: Ratio of the cost of false negative compared to false - positives. 
Defaults to 0.5, where the costs are equal. - :type alpha: float - :return: A mapping from tags to precision - :rtype: Dict[str, float] - """ - cm = self.confusion(gold) - return {tag: cm.f_measure(tag, alpha) for tag in cm._values} - - def evaluate_per_tag(self, gold, alpha=0.5, truncate=None, sort_by_count=False): - """Tabulate the **recall**, **precision** and **f-measure** - for each tag from ``gold`` or from running ``tag`` on the tokenized - sentences from ``gold``. - - >>> from nltk.tag import PerceptronTagger - >>> from nltk.corpus import treebank - >>> tagger = PerceptronTagger() - >>> gold_data = treebank.tagged_sents()[:10] - >>> print(tagger.evaluate_per_tag(gold_data)) - Tag | Prec. | Recall | F-measure - -------+--------+--------+----------- - '' | 1.0000 | 1.0000 | 1.0000 - , | 1.0000 | 1.0000 | 1.0000 - -NONE- | 0.0000 | 0.0000 | 0.0000 - . | 1.0000 | 1.0000 | 1.0000 - CC | 1.0000 | 1.0000 | 1.0000 - CD | 0.7143 | 1.0000 | 0.8333 - DT | 1.0000 | 1.0000 | 1.0000 - EX | 1.0000 | 1.0000 | 1.0000 - IN | 0.9167 | 0.8800 | 0.8980 - JJ | 0.8889 | 0.8889 | 0.8889 - JJR | 0.0000 | 0.0000 | 0.0000 - JJS | 1.0000 | 1.0000 | 1.0000 - MD | 1.0000 | 1.0000 | 1.0000 - NN | 0.8000 | 0.9333 | 0.8615 - NNP | 0.8929 | 1.0000 | 0.9434 - NNS | 0.9500 | 1.0000 | 0.9744 - POS | 1.0000 | 1.0000 | 1.0000 - PRP | 1.0000 | 1.0000 | 1.0000 - PRP$ | 1.0000 | 1.0000 | 1.0000 - RB | 0.4000 | 1.0000 | 0.5714 - RBR | 1.0000 | 0.5000 | 0.6667 - RP | 1.0000 | 1.0000 | 1.0000 - TO | 1.0000 | 1.0000 | 1.0000 - VB | 1.0000 | 1.0000 | 1.0000 - VBD | 0.8571 | 0.8571 | 0.8571 - VBG | 1.0000 | 0.8000 | 0.8889 - VBN | 1.0000 | 0.8000 | 0.8889 - VBP | 1.0000 | 1.0000 | 1.0000 - VBZ | 1.0000 | 1.0000 | 1.0000 - WDT | 0.0000 | 0.0000 | 0.0000 - `` | 1.0000 | 1.0000 | 1.0000 - - - :param gold: The list of tagged sentences to score the tagger on. - :type gold: list(list(tuple(str, str))) - :param alpha: Ratio of the cost of false negative compared to false - positives, as used in the f-measure computation. Defaults to 0.5, - where the costs are equal. - :type alpha: float - :param truncate: If specified, then only show the specified - number of values. Any sorting (e.g., sort_by_count) - will be performed before truncation. Defaults to None - :type truncate: int, optional - :param sort_by_count: Whether to sort the outputs on number of - occurrences of that tag in the ``gold`` data, defaults to False - :type sort_by_count: bool, optional - :return: A tabulated recall, precision and f-measure string - :rtype: str - """ - cm = self.confusion(gold) - return cm.evaluate(alpha=alpha, truncate=truncate, sort_by_count=sort_by_count) - - def _check_params(self, train, model): - if (train and model) or (not train and not model): - raise ValueError("Must specify either training data or trained model.") - - -class FeaturesetTaggerI(TaggerI): - """ - A tagger that requires tokens to be ``featuresets``. A featureset - is a dictionary that maps from feature names to feature - values. See ``nltk.classify`` for more information about features - and featuresets. 
- """ diff --git a/pipeline/nltk/tag/brill.py b/pipeline/nltk/tag/brill.py deleted file mode 100644 index d3bd1cd3b6cb10c4b62b7d23910e2a8ba9568cd2..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tag/brill.py +++ /dev/null @@ -1,449 +0,0 @@ -# Natural Language Toolkit: Transformation-based learning -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Marcus Uneson -# based on previous (nltk2) version by -# Christopher Maloof, Edward Loper, Steven Bird -# URL: -# For license information, see LICENSE.TXT - -from collections import Counter, defaultdict - -from nltk import jsontags -from nltk.tag import TaggerI -from nltk.tbl import Feature, Template - -###################################################################### -# Brill Templates -###################################################################### - - -@jsontags.register_tag -class Word(Feature): - """ - Feature which examines the text (word) of nearby tokens. - """ - - json_tag = "nltk.tag.brill.Word" - - @staticmethod - def extract_property(tokens, index): - """@return: The given token's text.""" - return tokens[index][0] - - -@jsontags.register_tag -class Pos(Feature): - """ - Feature which examines the tags of nearby tokens. - """ - - json_tag = "nltk.tag.brill.Pos" - - @staticmethod - def extract_property(tokens, index): - """@return: The given token's tag.""" - return tokens[index][1] - - -def nltkdemo18(): - """ - Return 18 templates, from the original nltk demo, in multi-feature syntax - """ - return [ - Template(Pos([-1])), - Template(Pos([1])), - Template(Pos([-2])), - Template(Pos([2])), - Template(Pos([-2, -1])), - Template(Pos([1, 2])), - Template(Pos([-3, -2, -1])), - Template(Pos([1, 2, 3])), - Template(Pos([-1]), Pos([1])), - Template(Word([-1])), - Template(Word([1])), - Template(Word([-2])), - Template(Word([2])), - Template(Word([-2, -1])), - Template(Word([1, 2])), - Template(Word([-3, -2, -1])), - Template(Word([1, 2, 3])), - Template(Word([-1]), Word([1])), - ] - - -def nltkdemo18plus(): - """ - Return 18 templates, from the original nltk demo, and additionally a few - multi-feature ones (the motivation is easy comparison with nltkdemo18) - """ - return nltkdemo18() + [ - Template(Word([-1]), Pos([1])), - Template(Pos([-1]), Word([1])), - Template(Word([-1]), Word([0]), Pos([1])), - Template(Pos([-1]), Word([0]), Word([1])), - Template(Pos([-1]), Word([0]), Pos([1])), - ] - - -def fntbl37(): - """ - Return 37 templates taken from the postagging task of the - fntbl distribution https://www.cs.jhu.edu/~rflorian/fntbl/ - (37 is after excluding a handful which do not condition on Pos[0]; - fntbl can do that but the current nltk implementation cannot.) 
- """ - return [ - Template(Word([0]), Word([1]), Word([2])), - Template(Word([-1]), Word([0]), Word([1])), - Template(Word([0]), Word([-1])), - Template(Word([0]), Word([1])), - Template(Word([0]), Word([2])), - Template(Word([0]), Word([-2])), - Template(Word([1, 2])), - Template(Word([-2, -1])), - Template(Word([1, 2, 3])), - Template(Word([-3, -2, -1])), - Template(Word([0]), Pos([2])), - Template(Word([0]), Pos([-2])), - Template(Word([0]), Pos([1])), - Template(Word([0]), Pos([-1])), - Template(Word([0])), - Template(Word([-2])), - Template(Word([2])), - Template(Word([1])), - Template(Word([-1])), - Template(Pos([-1]), Pos([1])), - Template(Pos([1]), Pos([2])), - Template(Pos([-1]), Pos([-2])), - Template(Pos([1])), - Template(Pos([-1])), - Template(Pos([-2])), - Template(Pos([2])), - Template(Pos([1, 2, 3])), - Template(Pos([1, 2])), - Template(Pos([-3, -2, -1])), - Template(Pos([-2, -1])), - Template(Pos([1]), Word([0]), Word([1])), - Template(Pos([1]), Word([0]), Word([-1])), - Template(Pos([-1]), Word([-1]), Word([0])), - Template(Pos([-1]), Word([0]), Word([1])), - Template(Pos([-2]), Pos([-1])), - Template(Pos([1]), Pos([2])), - Template(Pos([1]), Pos([2]), Word([1])), - ] - - -def brill24(): - """ - Return 24 templates of the seminal TBL paper, Brill (1995) - """ - return [ - Template(Pos([-1])), - Template(Pos([1])), - Template(Pos([-2])), - Template(Pos([2])), - Template(Pos([-2, -1])), - Template(Pos([1, 2])), - Template(Pos([-3, -2, -1])), - Template(Pos([1, 2, 3])), - Template(Pos([-1]), Pos([1])), - Template(Pos([-2]), Pos([-1])), - Template(Pos([1]), Pos([2])), - Template(Word([-1])), - Template(Word([1])), - Template(Word([-2])), - Template(Word([2])), - Template(Word([-2, -1])), - Template(Word([1, 2])), - Template(Word([-1, 0])), - Template(Word([0, 1])), - Template(Word([0])), - Template(Word([-1]), Pos([-1])), - Template(Word([1]), Pos([1])), - Template(Word([0]), Word([-1]), Pos([-1])), - Template(Word([0]), Word([1]), Pos([1])), - ] - - -def describe_template_sets(): - """ - Print the available template sets in this demo, with a short description" - """ - import inspect - import sys - - # a bit of magic to get all functions in this module - templatesets = inspect.getmembers(sys.modules[__name__], inspect.isfunction) - for (name, obj) in templatesets: - if name == "describe_template_sets": - continue - print(name, obj.__doc__, "\n") - - -###################################################################### -# The Brill Tagger -###################################################################### - - -@jsontags.register_tag -class BrillTagger(TaggerI): - """ - Brill's transformational rule-based tagger. Brill taggers use an - initial tagger (such as ``tag.DefaultTagger``) to assign an initial - tag sequence to a text; and then apply an ordered list of - transformational rules to correct the tags of individual tokens. - These transformation rules are specified by the ``TagRule`` - interface. - - Brill taggers can be created directly, from an initial tagger and - a list of transformational rules; but more often, Brill taggers - are created by learning rules from a training corpus, using one - of the TaggerTrainers available. - """ - - json_tag = "nltk.tag.BrillTagger" - - def __init__(self, initial_tagger, rules, training_stats=None): - """ - :param initial_tagger: The initial tagger - :type initial_tagger: TaggerI - - :param rules: An ordered list of transformation rules that - should be used to correct the initial tagging. 
- :type rules: list(TagRule) - - :param training_stats: A dictionary of statistics collected - during training, for possible later use - :type training_stats: dict - - """ - self._initial_tagger = initial_tagger - self._rules = tuple(rules) - self._training_stats = training_stats - - def encode_json_obj(self): - return self._initial_tagger, self._rules, self._training_stats - - @classmethod - def decode_json_obj(cls, obj): - _initial_tagger, _rules, _training_stats = obj - return cls(_initial_tagger, _rules, _training_stats) - - def rules(self): - """ - Return the ordered list of transformation rules that this tagger has learnt - - :return: the ordered list of transformation rules that correct the initial tagging - :rtype: list of Rules - """ - return self._rules - - def train_stats(self, statistic=None): - """ - Return a named statistic collected during training, or a dictionary of all - available statistics if no name given - - :param statistic: name of statistic - :type statistic: str - :return: some statistic collected during training of this tagger - :rtype: any (but usually a number) - """ - if statistic is None: - return self._training_stats - else: - return self._training_stats.get(statistic) - - def tag(self, tokens): - # Inherit documentation from TaggerI - - # Run the initial tagger. - tagged_tokens = self._initial_tagger.tag(tokens) - - # Create a dictionary that maps each tag to a list of the - # indices of tokens that have that tag. - tag_to_positions = defaultdict(set) - for i, (token, tag) in enumerate(tagged_tokens): - tag_to_positions[tag].add(i) - - # Apply each rule, in order. Only try to apply rules at - # positions that have the desired original tag. - for rule in self._rules: - # Find the positions where it might apply - positions = tag_to_positions.get(rule.original_tag, []) - # Apply the rule at those positions. - changed = rule.apply(tagged_tokens, positions) - # Update tag_to_positions with the positions of tags that - # were modified. - for i in changed: - tag_to_positions[rule.original_tag].remove(i) - tag_to_positions[rule.replacement_tag].add(i) - - return tagged_tokens - - def print_template_statistics(self, test_stats=None, printunused=True): - """ - Print a list of all templates, ranked according to efficiency. - - If test_stats is available, the templates are ranked according to their - relative contribution (summed for all rules created from a given template, - weighted by score) to the performance on the test set. If no test_stats, then - statistics collected during training are used instead. There is also - an unweighted measure (just counting the rules). This is less informative, - though, as many low-score rules will appear towards end of training. 
- - :param test_stats: dictionary of statistics collected during testing - :type test_stats: dict of str -> any (but usually numbers) - :param printunused: if True, print a list of all unused templates - :type printunused: bool - :return: None - :rtype: None - """ - tids = [r.templateid for r in self._rules] - train_stats = self.train_stats() - - trainscores = train_stats["rulescores"] - assert len(trainscores) == len( - tids - ), "corrupt statistics: " "{} train scores for {} rules".format( - trainscores, tids - ) - template_counts = Counter(tids) - weighted_traincounts = Counter() - for (tid, score) in zip(tids, trainscores): - weighted_traincounts[tid] += score - tottrainscores = sum(trainscores) - - # det_tplsort() is for deterministic sorting; - # the otherwise convenient Counter.most_common() unfortunately - # does not break ties deterministically - # between python versions and will break cross-version tests - def det_tplsort(tpl_value): - return (tpl_value[1], repr(tpl_value[0])) - - def print_train_stats(): - print( - "TEMPLATE STATISTICS (TRAIN) {} templates, {} rules)".format( - len(template_counts), len(tids) - ) - ) - print( - "TRAIN ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} " - "final: {finalerrors:5d} {finalacc:.4f}".format(**train_stats) - ) - head = "#ID | Score (train) | #Rules | Template" - print(head, "\n", "-" * len(head), sep="") - train_tplscores = sorted( - weighted_traincounts.items(), key=det_tplsort, reverse=True - ) - for (tid, trainscore) in train_tplscores: - s = "{} | {:5d} {:5.3f} |{:4d} {:.3f} | {}".format( - tid, - trainscore, - trainscore / tottrainscores, - template_counts[tid], - template_counts[tid] / len(tids), - Template.ALLTEMPLATES[int(tid)], - ) - print(s) - - def print_testtrain_stats(): - testscores = test_stats["rulescores"] - print( - "TEMPLATE STATISTICS (TEST AND TRAIN) ({} templates, {} rules)".format( - len(template_counts), len(tids) - ) - ) - print( - "TEST ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} " - "final: {finalerrors:5d} {finalacc:.4f} ".format(**test_stats) - ) - print( - "TRAIN ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} " - "final: {finalerrors:5d} {finalacc:.4f} ".format(**train_stats) - ) - weighted_testcounts = Counter() - for (tid, score) in zip(tids, testscores): - weighted_testcounts[tid] += score - tottestscores = sum(testscores) - head = "#ID | Score (test) | Score (train) | #Rules | Template" - print(head, "\n", "-" * len(head), sep="") - test_tplscores = sorted( - weighted_testcounts.items(), key=det_tplsort, reverse=True - ) - for (tid, testscore) in test_tplscores: - s = "{:s} |{:5d} {:6.3f} | {:4d} {:.3f} |{:4d} {:.3f} | {:s}".format( - tid, - testscore, - testscore / tottestscores, - weighted_traincounts[tid], - weighted_traincounts[tid] / tottrainscores, - template_counts[tid], - template_counts[tid] / len(tids), - Template.ALLTEMPLATES[int(tid)], - ) - print(s) - - def print_unused_templates(): - usedtpls = {int(tid) for tid in tids} - unused = [ - (tid, tpl) - for (tid, tpl) in enumerate(Template.ALLTEMPLATES) - if tid not in usedtpls - ] - print(f"UNUSED TEMPLATES ({len(unused)})") - - for (tid, tpl) in unused: - print(f"{tid:03d} {str(tpl):s}") - - if test_stats is None: - print_train_stats() - else: - print_testtrain_stats() - print() - if printunused: - print_unused_templates() - print() - - def batch_tag_incremental(self, sequences, gold): - """ - Tags by applying each rule to the entire corpus (rather than all rules to a - 
single sequence). The point is to collect statistics on the test set for - individual rules. - - NOTE: This is inefficient (does not build any index, so will traverse the entire - corpus N times for N rules) -- usually you would not care about statistics for - individual rules and thus use batch_tag() instead - - :param sequences: lists of token sequences (sentences, in some applications) to be tagged - :type sequences: list of list of strings - :param gold: the gold standard - :type gold: list of list of strings - :returns: tuple of (tagged_sequences, ordered list of rule scores (one for each rule)) - """ - - def counterrors(xs): - return sum(t[1] != g[1] for pair in zip(xs, gold) for (t, g) in zip(*pair)) - - testing_stats = {} - testing_stats["tokencount"] = sum(len(t) for t in sequences) - testing_stats["sequencecount"] = len(sequences) - tagged_tokenses = [self._initial_tagger.tag(tokens) for tokens in sequences] - testing_stats["initialerrors"] = counterrors(tagged_tokenses) - testing_stats["initialacc"] = ( - 1 - testing_stats["initialerrors"] / testing_stats["tokencount"] - ) - # Apply each rule to the entire corpus, in order - errors = [testing_stats["initialerrors"]] - for rule in self._rules: - for tagged_tokens in tagged_tokenses: - rule.apply(tagged_tokens) - errors.append(counterrors(tagged_tokenses)) - testing_stats["rulescores"] = [ - err0 - err1 for (err0, err1) in zip(errors, errors[1:]) - ] - testing_stats["finalerrors"] = errors[-1] - testing_stats["finalacc"] = ( - 1 - testing_stats["finalerrors"] / testing_stats["tokencount"] - ) - return (tagged_tokenses, testing_stats) diff --git a/pipeline/nltk/tag/brill_trainer.py b/pipeline/nltk/tag/brill_trainer.py deleted file mode 100644 index 236fd9858e755b501f3a8f384b68a383b6902f99..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tag/brill_trainer.py +++ /dev/null @@ -1,629 +0,0 @@ -# Natural Language Toolkit: Transformation-based learning -# -# Copyright (C) 2001-2013 NLTK Project -# Author: Marcus Uneson -# based on previous (nltk2) version by -# Christopher Maloof, Edward Loper, Steven Bird -# URL: -# For license information, see LICENSE.TXT - -import bisect -import textwrap -from collections import defaultdict - -from nltk.tag import BrillTagger, untag - -###################################################################### -# Brill Tagger Trainer -###################################################################### - - -class BrillTaggerTrainer: - """ - A trainer for tbl taggers. 
- """ - - def __init__( - self, initial_tagger, templates, trace=0, deterministic=None, ruleformat="str" - ): - """ - Construct a Brill tagger from a baseline tagger and a - set of templates - - :param initial_tagger: the baseline tagger - :type initial_tagger: Tagger - :param templates: templates to be used in training - :type templates: list of Templates - :param trace: verbosity level - :type trace: int - :param deterministic: if True, adjudicate ties deterministically - :type deterministic: bool - :param ruleformat: format of reported Rules - :type ruleformat: str - :return: An untrained BrillTagger - :rtype: BrillTagger - """ - - if deterministic is None: - deterministic = trace > 0 - self._initial_tagger = initial_tagger - self._templates = templates - self._trace = trace - self._deterministic = deterministic - self._ruleformat = ruleformat - - self._tag_positions = None - """Mapping from tags to lists of positions that use that tag.""" - - self._rules_by_position = None - """Mapping from positions to the set of rules that are known - to occur at that position. Position is (sentnum, wordnum). - Initially, this will only contain positions where each rule - applies in a helpful way; but when we examine a rule, we'll - extend this list to also include positions where each rule - applies in a harmful or neutral way.""" - - self._positions_by_rule = None - """Mapping from rule to position to effect, specifying the - effect that each rule has on the overall score, at each - position. Position is (sentnum, wordnum); and effect is - -1, 0, or 1. As with _rules_by_position, this mapping starts - out only containing rules with positive effects; but when - we examine a rule, we'll extend this mapping to include - the positions where the rule is harmful or neutral.""" - - self._rules_by_score = None - """Mapping from scores to the set of rules whose effect on the - overall score is upper bounded by that score. Invariant: - rulesByScore[s] will contain r iff the sum of - _positions_by_rule[r] is s.""" - - self._rule_scores = None - """Mapping from rules to upper bounds on their effects on the - overall score. This is the inverse mapping to _rules_by_score. - Invariant: ruleScores[r] = sum(_positions_by_rule[r])""" - - self._first_unknown_position = None - """Mapping from rules to the first position where we're unsure - if the rule applies. This records the next position we - need to check to see if the rule messed anything up.""" - - # Training - - def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): - r""" - Trains the Brill tagger on the corpus *train_sents*, - producing at most *max_rules* transformations, each of which - reduces the net number of errors in the corpus by at least - *min_score*, and each of which has accuracy not lower than - *min_acc*. - - >>> # Relevant imports - >>> from nltk.tbl.template import Template - >>> from nltk.tag.brill import Pos, Word - >>> from nltk.tag import untag, RegexpTagger, BrillTaggerTrainer - - >>> # Load some data - >>> from nltk.corpus import treebank - >>> training_data = treebank.tagged_sents()[:100] - >>> baseline_data = treebank.tagged_sents()[100:200] - >>> gold_data = treebank.tagged_sents()[200:300] - >>> testing_data = [untag(s) for s in gold_data] - - >>> backoff = RegexpTagger([ - ... (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers - ... (r'(The|the|A|a|An|an)$', 'AT'), # articles - ... (r'.*able$', 'JJ'), # adjectives - ... (r'.*ness$', 'NN'), # nouns formed from adjectives - ... (r'.*ly$', 'RB'), # adverbs - ... 
(r'.*s$', 'NNS'), # plural nouns - ... (r'.*ing$', 'VBG'), # gerunds - ... (r'.*ed$', 'VBD'), # past tense verbs - ... (r'.*', 'NN') # nouns (default) - ... ]) - - >>> baseline = backoff #see NOTE1 - >>> baseline.accuracy(gold_data) #doctest: +ELLIPSIS - 0.243... - - >>> # Set up templates - >>> Template._cleartemplates() #clear any templates created in earlier tests - >>> templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))] - - >>> # Construct a BrillTaggerTrainer - >>> tt = BrillTaggerTrainer(baseline, templates, trace=3) - - >>> tagger1 = tt.train(training_data, max_rules=10) - TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None) - Finding initial useful rules... - Found 847 useful rules. - - B | - S F r O | Score = Fixed - Broken - c i o t | R Fixed = num tags changed incorrect -> correct - o x k h | u Broken = num tags changed correct -> incorrect - r e e e | l Other = num tags changed incorrect -> incorrect - e d n r | e - ------------------+------------------------------------------------------- - 132 132 0 0 | AT->DT if Pos:NN@[-1] - 85 85 0 0 | NN->, if Pos:NN@[-1] & Word:,@[0] - 69 69 0 0 | NN->. if Pos:NN@[-1] & Word:.@[0] - 51 51 0 0 | NN->IN if Pos:NN@[-1] & Word:of@[0] - 47 63 16 162 | NN->IN if Pos:NNS@[-1] - 33 33 0 0 | NN->TO if Pos:NN@[-1] & Word:to@[0] - 26 26 0 0 | IN->. if Pos:NNS@[-1] & Word:.@[0] - 24 24 0 0 | IN->, if Pos:NNS@[-1] & Word:,@[0] - 22 27 5 24 | NN->-NONE- if Pos:VBD@[-1] - 17 17 0 0 | NN->CC if Pos:NN@[-1] & Word:and@[0] - - >>> tagger1.rules()[1:3] - (Rule('001', 'NN', ',', [(Pos([-1]),'NN'), (Word([0]),',')]), Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')])) - - >>> train_stats = tagger1.train_stats() - >>> [train_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']] - [1776, 1270, [132, 85, 69, 51, 47, 33, 26, 24, 22, 17]] - - >>> tagger1.print_template_statistics(printunused=False) - TEMPLATE STATISTICS (TRAIN) 2 templates, 10 rules) - TRAIN ( 2417 tokens) initial 1776 0.2652 final: 1270 0.4746 - #ID | Score (train) | #Rules | Template - -------------------------------------------- - 001 | 305 0.603 | 7 0.700 | Template(Pos([-1]),Word([0])) - 000 | 201 0.397 | 3 0.300 | Template(Pos([-1])) - - - - >>> round(tagger1.accuracy(gold_data),5) - 0.43834 - - >>> tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data) - - >>> tagged[33][12:] == [('foreign', 'IN'), ('debt', 'NN'), ('of', 'IN'), ('$', 'NN'), ('64', 'CD'), - ... ('billion', 'NN'), ('*U*', 'NN'), ('--', 'NN'), ('the', 'DT'), ('third-highest', 'NN'), ('in', 'NN'), - ... ('the', 'DT'), ('developing', 'VBG'), ('world', 'NN'), ('.', '.')] - True - - >>> [test_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']] - [1859, 1380, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]] - - >>> # A high-accuracy tagger - >>> tagger2 = tt.train(training_data, max_rules=10, min_acc=0.99) - TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: 0.99) - Finding initial useful rules... - Found 847 useful rules. - - B | - S F r O | Score = Fixed - Broken - c i o t | R Fixed = num tags changed incorrect -> correct - o x k h | u Broken = num tags changed correct -> incorrect - r e e e | l Other = num tags changed incorrect -> incorrect - e d n r | e - ------------------+------------------------------------------------------- - 132 132 0 0 | AT->DT if Pos:NN@[-1] - 85 85 0 0 | NN->, if Pos:NN@[-1] & Word:,@[0] - 69 69 0 0 | NN->. 
if Pos:NN@[-1] & Word:.@[0] - 51 51 0 0 | NN->IN if Pos:NN@[-1] & Word:of@[0] - 36 36 0 0 | NN->TO if Pos:NN@[-1] & Word:to@[0] - 26 26 0 0 | NN->. if Pos:NNS@[-1] & Word:.@[0] - 24 24 0 0 | NN->, if Pos:NNS@[-1] & Word:,@[0] - 19 19 0 6 | NN->VB if Pos:TO@[-1] - 18 18 0 0 | CD->-NONE- if Pos:NN@[-1] & Word:0@[0] - 18 18 0 0 | NN->CC if Pos:NN@[-1] & Word:and@[0] - - >>> round(tagger2.accuracy(gold_data), 8) - 0.43996744 - - >>> tagger2.rules()[2:4] - (Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]), Rule('001', 'NN', 'IN', [(Pos([-1]),'NN'), (Word([0]),'of')])) - - # NOTE1: (!!FIXME) A far better baseline uses nltk.tag.UnigramTagger, - # with a RegexpTagger only as backoff. For instance, - # >>> baseline = UnigramTagger(baseline_data, backoff=backoff) - # However, as of Nov 2013, nltk.tag.UnigramTagger does not yield consistent results - # between python versions. The simplistic backoff above is a workaround to make doctests - # get consistent input. - - :param train_sents: training data - :type train_sents: list(list(tuple)) - :param max_rules: output at most max_rules rules - :type max_rules: int - :param min_score: stop training when no rules better than min_score can be found - :type min_score: int - :param min_acc: discard any rule with lower accuracy than min_acc - :type min_acc: float or None - :return: the learned tagger - :rtype: BrillTagger - """ - # FIXME: several tests are a bit too dependent on tracing format - # FIXME: tests in trainer.fast and trainer.brillorig are exact duplicates - - # Basic idea: Keep track of the rules that apply at each position. - # And keep track of the positions to which each rule applies. - - # Create a new copy of the training corpus, and run the - # initial tagger on it. We will progressively update this - # test corpus to look more like the training corpus. - test_sents = [ - list(self._initial_tagger.tag(untag(sent))) for sent in train_sents - ] - - # Collect some statistics on the training process - trainstats = {} - trainstats["min_acc"] = min_acc - trainstats["min_score"] = min_score - trainstats["tokencount"] = sum(len(t) for t in test_sents) - trainstats["sequencecount"] = len(test_sents) - trainstats["templatecount"] = len(self._templates) - trainstats["rulescores"] = [] - trainstats["initialerrors"] = sum( - tag[1] != truth[1] - for paired in zip(test_sents, train_sents) - for (tag, truth) in zip(*paired) - ) - trainstats["initialacc"] = ( - 1 - trainstats["initialerrors"] / trainstats["tokencount"] - ) - if self._trace > 0: - print( - "TBL train (fast) (seqs: {sequencecount}; tokens: {tokencount}; " - "tpls: {templatecount}; min score: {min_score}; min acc: {min_acc})".format( - **trainstats - ) - ) - - # Initialize our mappings. This will find any errors made - # by the initial tagger, and use those to generate repair - # rules, which are added to the rule mappings. - if self._trace: - print("Finding initial useful rules...") - self._init_mappings(test_sents, train_sents) - if self._trace: - print(f" Found {len(self._rule_scores)} useful rules.") - - # Let the user know what we're up to. - if self._trace > 2: - self._trace_header() - elif self._trace == 1: - print("Selecting rules...") - - # Repeatedly select the best rule, and add it to `rules`. - rules = [] - try: - while len(rules) < max_rules: - # Find the best rule, and add it to our rule list. 
- rule = self._best_rule(train_sents, test_sents, min_score, min_acc) - if rule: - rules.append(rule) - score = self._rule_scores[rule] - trainstats["rulescores"].append(score) - else: - break # No more good rules left! - - # Report the rule that we found. - if self._trace > 1: - self._trace_rule(rule) - - # Apply the new rule at the relevant sites - self._apply_rule(rule, test_sents) - - # Update _tag_positions[rule.original_tag] and - # _tag_positions[rule.replacement_tag] for the affected - # positions (i.e., self._positions_by_rule[rule]). - self._update_tag_positions(rule) - - # Update rules that were affected by the change. - self._update_rules(rule, train_sents, test_sents) - - # The user can cancel training manually: - except KeyboardInterrupt: - print(f"Training stopped manually -- {len(rules)} rules found") - - # Discard our tag position mapping & rule mappings. - self._clean() - trainstats["finalerrors"] = trainstats["initialerrors"] - sum( - trainstats["rulescores"] - ) - trainstats["finalacc"] = ( - 1 - trainstats["finalerrors"] / trainstats["tokencount"] - ) - # Create and return a tagger from the rules we found. - return BrillTagger(self._initial_tagger, rules, trainstats) - - def _init_mappings(self, test_sents, train_sents): - """ - Initialize the tag position mapping & the rule related - mappings. For each error in test_sents, find new rules that - would correct them, and add them to the rule mappings. - """ - self._tag_positions = defaultdict(list) - self._rules_by_position = defaultdict(set) - self._positions_by_rule = defaultdict(dict) - self._rules_by_score = defaultdict(set) - self._rule_scores = defaultdict(int) - self._first_unknown_position = defaultdict(int) - # Scan through the corpus, initializing the tag_positions - # mapping and all the rule-related mappings. - for sentnum, sent in enumerate(test_sents): - for wordnum, (word, tag) in enumerate(sent): - - # Initialize tag_positions - self._tag_positions[tag].append((sentnum, wordnum)) - - # If it's an error token, update the rule-related mappings. - correct_tag = train_sents[sentnum][wordnum][1] - if tag != correct_tag: - for rule in self._find_rules(sent, wordnum, correct_tag): - self._update_rule_applies(rule, sentnum, wordnum, train_sents) - - def _clean(self): - self._tag_positions = None - self._rules_by_position = None - self._positions_by_rule = None - self._rules_by_score = None - self._rule_scores = None - self._first_unknown_position = None - - def _find_rules(self, sent, wordnum, new_tag): - """ - Use the templates to find rules that apply at index *wordnum* - in the sentence *sent* and generate the tag *new_tag*. - """ - for template in self._templates: - yield from template.applicable_rules(sent, wordnum, new_tag) - - def _update_rule_applies(self, rule, sentnum, wordnum, train_sents): - """ - Update the rule data tables to reflect the fact that - *rule* applies at the position *(sentnum, wordnum)*. - """ - pos = sentnum, wordnum - - # If the rule is already known to apply here, ignore. - # (This only happens if the position's tag hasn't changed.) - if pos in self._positions_by_rule[rule]: - return - - # Update self._positions_by_rule. 
- correct_tag = train_sents[sentnum][wordnum][1] - if rule.replacement_tag == correct_tag: - self._positions_by_rule[rule][pos] = 1 - elif rule.original_tag == correct_tag: - self._positions_by_rule[rule][pos] = -1 - else: # was wrong, remains wrong - self._positions_by_rule[rule][pos] = 0 - - # Update _rules_by_position - self._rules_by_position[pos].add(rule) - - # Update _rule_scores. - old_score = self._rule_scores[rule] - self._rule_scores[rule] += self._positions_by_rule[rule][pos] - - # Update _rules_by_score. - self._rules_by_score[old_score].discard(rule) - self._rules_by_score[self._rule_scores[rule]].add(rule) - - def _update_rule_not_applies(self, rule, sentnum, wordnum): - """ - Update the rule data tables to reflect the fact that *rule* - does not apply at the position *(sentnum, wordnum)*. - """ - pos = sentnum, wordnum - - # Update _rule_scores. - old_score = self._rule_scores[rule] - self._rule_scores[rule] -= self._positions_by_rule[rule][pos] - - # Update _rules_by_score. - self._rules_by_score[old_score].discard(rule) - self._rules_by_score[self._rule_scores[rule]].add(rule) - - # Update _positions_by_rule - del self._positions_by_rule[rule][pos] - self._rules_by_position[pos].remove(rule) - - # Optional addition: if the rule now applies nowhere, delete - # all its dictionary entries. - - def _best_rule(self, train_sents, test_sents, min_score, min_acc): - """ - Find the next best rule. This is done by repeatedly taking a - rule with the highest score and stepping through the corpus to - see where it applies. When it makes an error (decreasing its - score) it's bumped down, and we try a new rule with the - highest score. When we find a rule which has the highest - score *and* which has been tested against the entire corpus, we - can conclude that it's the next best rule. - """ - for max_score in sorted(self._rules_by_score.keys(), reverse=True): - if len(self._rules_by_score) == 0: - return None - if max_score < min_score or max_score <= 0: - return None - best_rules = list(self._rules_by_score[max_score]) - if self._deterministic: - best_rules.sort(key=repr) - for rule in best_rules: - positions = self._tag_positions[rule.original_tag] - - unk = self._first_unknown_position.get(rule, (0, -1)) - start = bisect.bisect_left(positions, unk) - - for i in range(start, len(positions)): - sentnum, wordnum = positions[i] - if rule.applies(test_sents[sentnum], wordnum): - self._update_rule_applies(rule, sentnum, wordnum, train_sents) - if self._rule_scores[rule] < max_score: - self._first_unknown_position[rule] = (sentnum, wordnum + 1) - break # The update demoted the rule. - - if self._rule_scores[rule] == max_score: - self._first_unknown_position[rule] = (len(train_sents) + 1, 0) - # optimization: if no min_acc threshold given, don't bother computing accuracy - if min_acc is None: - return rule - else: - changes = self._positions_by_rule[rule].values() - num_fixed = len([c for c in changes if c == 1]) - num_broken = len([c for c in changes if c == -1]) - # acc here is fixed/(fixed+broken); could also be - # fixed/(fixed+broken+other) == num_fixed/len(changes) - acc = num_fixed / (num_fixed + num_broken) - if acc >= min_acc: - return rule - # else: rule too inaccurate, discard and try next - - # We demoted (or skipped due to < min_acc, if that was given) - # all the rules with score==max_score. 
- - assert min_acc is not None or not self._rules_by_score[max_score] - if not self._rules_by_score[max_score]: - del self._rules_by_score[max_score] - - def _apply_rule(self, rule, test_sents): - """ - Update *test_sents* by applying *rule* everywhere where its - conditions are met. - """ - update_positions = set(self._positions_by_rule[rule]) - new_tag = rule.replacement_tag - - if self._trace > 3: - self._trace_apply(len(update_positions)) - - # Update test_sents. - for (sentnum, wordnum) in update_positions: - text = test_sents[sentnum][wordnum][0] - test_sents[sentnum][wordnum] = (text, new_tag) - - def _update_tag_positions(self, rule): - """ - Update _tag_positions to reflect the changes to tags that are - made by *rule*. - """ - # Update the tag index. - for pos in self._positions_by_rule[rule]: - # Delete the old tag. - old_tag_positions = self._tag_positions[rule.original_tag] - old_index = bisect.bisect_left(old_tag_positions, pos) - del old_tag_positions[old_index] - # Insert the new tag. - new_tag_positions = self._tag_positions[rule.replacement_tag] - bisect.insort_left(new_tag_positions, pos) - - def _update_rules(self, rule, train_sents, test_sents): - """ - Check if we should add or remove any rules from consideration, - given the changes made by *rule*. - """ - # Collect a list of all positions that might be affected. - neighbors = set() - for sentnum, wordnum in self._positions_by_rule[rule]: - for template in self._templates: - n = template.get_neighborhood(test_sents[sentnum], wordnum) - neighbors.update([(sentnum, i) for i in n]) - - # Update the rules at each position. - num_obsolete = num_new = num_unseen = 0 - for sentnum, wordnum in neighbors: - test_sent = test_sents[sentnum] - correct_tag = train_sents[sentnum][wordnum][1] - - # Check if the change causes any rule at this position to - # stop matching; if so, then update our rule mappings - # accordingly. - old_rules = set(self._rules_by_position[sentnum, wordnum]) - for old_rule in old_rules: - if not old_rule.applies(test_sent, wordnum): - num_obsolete += 1 - self._update_rule_not_applies(old_rule, sentnum, wordnum) - - # Check if the change causes our templates to propose any - # new rules for this position. - for template in self._templates: - for new_rule in template.applicable_rules( - test_sent, wordnum, correct_tag - ): - if new_rule not in old_rules: - num_new += 1 - if new_rule not in self._rule_scores: - num_unseen += 1 - old_rules.add(new_rule) - self._update_rule_applies( - new_rule, sentnum, wordnum, train_sents - ) - - # We may have caused other rules to match here, that are - # not proposed by our templates -- in particular, rules - # that are harmful or neutral. We therefore need to - # update any rule whose first_unknown_position is past - # this rule. 
- for new_rule, pos in self._first_unknown_position.items(): - if pos > (sentnum, wordnum): - if new_rule not in old_rules: - num_new += 1 - if new_rule.applies(test_sent, wordnum): - self._update_rule_applies( - new_rule, sentnum, wordnum, train_sents - ) - - if self._trace > 3: - self._trace_update_rules(num_obsolete, num_new, num_unseen) - - # Tracing - - def _trace_header(self): - print( - """ - B | - S F r O | Score = Fixed - Broken - c i o t | R Fixed = num tags changed incorrect -> correct - o x k h | u Broken = num tags changed correct -> incorrect - r e e e | l Other = num tags changed incorrect -> incorrect - e d n r | e -------------------+------------------------------------------------------- - """.rstrip() - ) - - def _trace_rule(self, rule): - assert self._rule_scores[rule] == sum(self._positions_by_rule[rule].values()) - - changes = self._positions_by_rule[rule].values() - num_fixed = len([c for c in changes if c == 1]) - num_broken = len([c for c in changes if c == -1]) - num_other = len([c for c in changes if c == 0]) - score = self._rule_scores[rule] - - rulestr = rule.format(self._ruleformat) - if self._trace > 2: - print( - "{:4d}{:4d}{:4d}{:4d} |".format( - score, num_fixed, num_broken, num_other - ), - end=" ", - ) - print( - textwrap.fill( - rulestr, - initial_indent=" " * 20, - width=79, - subsequent_indent=" " * 18 + "| ", - ).strip() - ) - else: - print(rulestr) - - def _trace_apply(self, num_updates): - prefix = " " * 18 + "|" - print(prefix) - print(prefix, f"Applying rule to {num_updates} positions.") - - def _trace_update_rules(self, num_obsolete, num_new, num_unseen): - prefix = " " * 18 + "|" - print(prefix, "Updated rule tables:") - print(prefix, (f" - {num_obsolete} rule applications removed")) - print( - prefix, - (f" - {num_new} rule applications added ({num_unseen} novel)"), - ) - print(prefix) diff --git a/pipeline/nltk/tag/crf.py b/pipeline/nltk/tag/crf.py deleted file mode 100644 index dfc728c8d55c5eecadd7dc214f756f5224b7f017..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tag/crf.py +++ /dev/null @@ -1,207 +0,0 @@ -# Natural Language Toolkit: Interface to the CRFSuite Tagger -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Long Duong -# URL: -# For license information, see LICENSE.TXT - -""" -A module for POS tagging using CRFSuite -""" - -import re -import unicodedata - -from nltk.tag.api import TaggerI - -try: - import pycrfsuite -except ImportError: - pass - - -class CRFTagger(TaggerI): - """ - A module for POS tagging using CRFSuite https://pypi.python.org/pypi/python-crfsuite - - >>> from nltk.tag import CRFTagger - >>> ct = CRFTagger() # doctest: +SKIP - - >>> train_data = [[('University','Noun'), ('is','Verb'), ('a','Det'), ('good','Adj'), ('place','Noun')], - ... 
[('dog','Noun'),('eat','Verb'),('meat','Noun')]] - - >>> ct.train(train_data,'model.crf.tagger') # doctest: +SKIP - >>> ct.tag_sents([['dog','is','good'], ['Cat','eat','meat']]) # doctest: +SKIP - [[('dog', 'Noun'), ('is', 'Verb'), ('good', 'Adj')], [('Cat', 'Noun'), ('eat', 'Verb'), ('meat', 'Noun')]] - - >>> gold_sentences = [[('dog','Noun'),('is','Verb'),('good','Adj')] , [('Cat','Noun'),('eat','Verb'), ('meat','Noun')]] - >>> ct.accuracy(gold_sentences) # doctest: +SKIP - 1.0 - - Setting learned model file - >>> ct = CRFTagger() # doctest: +SKIP - >>> ct.set_model_file('model.crf.tagger') # doctest: +SKIP - >>> ct.accuracy(gold_sentences) # doctest: +SKIP - 1.0 - """ - - def __init__(self, feature_func=None, verbose=False, training_opt={}): - """ - Initialize the CRFSuite tagger - - :param feature_func: The function that extracts features for each token of a sentence. This function should take - 2 parameters: tokens and index which extract features at index position from tokens list. See the build in - _get_features function for more detail. - :param verbose: output the debugging messages during training. - :type verbose: boolean - :param training_opt: python-crfsuite training options - :type training_opt: dictionary - - Set of possible training options (using LBFGS training algorithm). - :'feature.minfreq': The minimum frequency of features. - :'feature.possible_states': Force to generate possible state features. - :'feature.possible_transitions': Force to generate possible transition features. - :'c1': Coefficient for L1 regularization. - :'c2': Coefficient for L2 regularization. - :'max_iterations': The maximum number of iterations for L-BFGS optimization. - :'num_memories': The number of limited memories for approximating the inverse hessian matrix. - :'epsilon': Epsilon for testing the convergence of the objective. - :'period': The duration of iterations to test the stopping criterion. - :'delta': The threshold for the stopping criterion; an L-BFGS iteration stops when the - improvement of the log likelihood over the last ${period} iterations is no greater than this threshold. - :'linesearch': The line search algorithm used in L-BFGS updates: - - - 'MoreThuente': More and Thuente's method, - - 'Backtracking': Backtracking method with regular Wolfe condition, - - 'StrongBacktracking': Backtracking method with strong Wolfe condition - :'max_linesearch': The maximum number of trials for the line search algorithm. - """ - - self._model_file = "" - self._tagger = pycrfsuite.Tagger() - - if feature_func is None: - self._feature_func = self._get_features - else: - self._feature_func = feature_func - - self._verbose = verbose - self._training_options = training_opt - self._pattern = re.compile(r"\d") - - def set_model_file(self, model_file): - self._model_file = model_file - self._tagger.open(self._model_file) - - def _get_features(self, tokens, idx): - """ - Extract basic features about this word including - - Current word - - is it capitalized? - - Does it have punctuation? - - Does it have a number? - - Suffixes up to length 3 - - Note that : we might include feature over previous word, next word etc. 
- - :return: a list which contains the features - :rtype: list(str) - """ - token = tokens[idx] - - feature_list = [] - - if not token: - return feature_list - - # Capitalization - if token[0].isupper(): - feature_list.append("CAPITALIZATION") - - # Number - if re.search(self._pattern, token) is not None: - feature_list.append("HAS_NUM") - - # Punctuation - punc_cat = {"Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"} - if all(unicodedata.category(x) in punc_cat for x in token): - feature_list.append("PUNCTUATION") - - # Suffix up to length 3 - if len(token) > 1: - feature_list.append("SUF_" + token[-1:]) - if len(token) > 2: - feature_list.append("SUF_" + token[-2:]) - if len(token) > 3: - feature_list.append("SUF_" + token[-3:]) - - feature_list.append("WORD_" + token) - - return feature_list - - def tag_sents(self, sents): - """ - Tag a list of sentences. NB before using this function, user should specify the mode_file either by - - - Train a new model using ``train`` function - - Use the pre-trained model which is set via ``set_model_file`` function - - :params sentences: list of sentences needed to tag. - :type sentences: list(list(str)) - :return: list of tagged sentences. - :rtype: list(list(tuple(str,str))) - """ - if self._model_file == "": - raise Exception( - " No model file is found !! Please use train or set_model_file function" - ) - - # We need the list of sentences instead of the list generator for matching the input and output - result = [] - for tokens in sents: - features = [self._feature_func(tokens, i) for i in range(len(tokens))] - labels = self._tagger.tag(features) - - if len(labels) != len(tokens): - raise Exception(" Predicted Length Not Matched, Expect Errors !") - - tagged_sent = list(zip(tokens, labels)) - result.append(tagged_sent) - - return result - - def train(self, train_data, model_file): - """ - Train the CRF tagger using CRFSuite - :params train_data : is the list of annotated sentences. - :type train_data : list (list(tuple(str,str))) - :params model_file : the model will be saved to this file. - - """ - trainer = pycrfsuite.Trainer(verbose=self._verbose) - trainer.set_params(self._training_options) - - for sent in train_data: - tokens, labels = zip(*sent) - features = [self._feature_func(tokens, i) for i in range(len(tokens))] - trainer.append(features, labels) - - # Now train the model, the output should be model_file - trainer.train(model_file) - # Save the model file - self.set_model_file(model_file) - - def tag(self, tokens): - """ - Tag a sentence using Python CRFSuite Tagger. NB before using this function, user should specify the mode_file either by - - - Train a new model using ``train`` function - - Use the pre-trained model which is set via ``set_model_file`` function - - :params tokens: list of tokens needed to tag. - :type tokens: list(str) - :return: list of tagged tokens. 
- :rtype: list(tuple(str,str)) - """ - - return self.tag_sents([tokens])[0] diff --git a/pipeline/nltk/tag/hmm.py b/pipeline/nltk/tag/hmm.py deleted file mode 100644 index 6577789b883828ce01e84c0864de57eead81f12b..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tag/hmm.py +++ /dev/null @@ -1,1329 +0,0 @@ -# Natural Language Toolkit: Hidden Markov Model -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Trevor Cohn -# Philip Blunsom -# Tiago Tresoldi (fixes) -# Steven Bird (fixes) -# Joseph Frazee (fixes) -# Steven Xu (fixes) -# URL: -# For license information, see LICENSE.TXT - -""" -Hidden Markov Models (HMMs) largely used to assign the correct label sequence -to sequential data or assess the probability of a given label and data -sequence. These models are finite state machines characterised by a number of -states, transitions between these states, and output symbols emitted while in -each state. The HMM is an extension to the Markov chain, where each state -corresponds deterministically to a given event. In the HMM the observation is -a probabilistic function of the state. HMMs share the Markov chain's -assumption, being that the probability of transition from one state to another -only depends on the current state - i.e. the series of states that led to the -current state are not used. They are also time invariant. - -The HMM is a directed graph, with probability weighted edges (representing the -probability of a transition between the source and sink states) where each -vertex emits an output symbol when entered. The symbol (or observation) is -non-deterministically generated. For this reason, knowing that a sequence of -output observations was generated by a given HMM does not mean that the -corresponding sequence of states (and what the current state is) is known. -This is the 'hidden' in the hidden markov model. - -Formally, a HMM can be characterised by: - -- the output observation alphabet. This is the set of symbols which may be - observed as output of the system. -- the set of states. -- the transition probabilities *a_{ij} = P(s_t = j | s_{t-1} = i)*. These - represent the probability of transition to each state from a given state. -- the output probability matrix *b_i(k) = P(X_t = o_k | s_t = i)*. These - represent the probability of observing each symbol in a given state. -- the initial state distribution. This gives the probability of starting - in each state. - -To ground this discussion, take a common NLP application, part-of-speech (POS) -tagging. An HMM is desirable for this task as the highest probability tag -sequence can be calculated for a given sequence of word forms. This differs -from other tagging techniques which often tag each word individually, seeking -to optimise each individual tagging greedily without regard to the optimal -combination of tags for a larger unit, such as a sentence. The HMM does this -with the Viterbi algorithm, which efficiently computes the optimal path -through the graph given the sequence of words forms. - -In POS tagging the states usually have a 1:1 correspondence with the tag -alphabet - i.e. each state represents a single tag. The output observation -alphabet is the set of word forms (the lexicon), and the remaining three -parameters are derived by a training regime. With this information the -probability of a given sentence can be easily derived, by simply summing the -probability of each distinct path through the model. 
Similarly, the highest -probability tagging sequence can be derived with the Viterbi algorithm, -yielding a state sequence which can be mapped into a tag sequence. - -This discussion assumes that the HMM has been trained. This is probably the -most difficult task with the model, and requires either MLE estimates of the -parameters or unsupervised learning using the Baum-Welch algorithm, a variant -of EM. - -For more information, please consult the source code for this module, -which includes extensive demonstration code. -""" - -import itertools -import re - -try: - import numpy as np -except ImportError: - pass - -from nltk.metrics import accuracy -from nltk.probability import ( - ConditionalFreqDist, - ConditionalProbDist, - DictionaryConditionalProbDist, - DictionaryProbDist, - FreqDist, - LidstoneProbDist, - MLEProbDist, - MutableProbDist, - RandomProbDist, -) -from nltk.tag.api import TaggerI -from nltk.util import LazyMap, unique_list - -_TEXT = 0 # index of text in a tuple -_TAG = 1 # index of tag in a tuple - - -def _identity(labeled_symbols): - return labeled_symbols - - -class HiddenMarkovModelTagger(TaggerI): - """ - Hidden Markov model class, a generative model for labelling sequence data. - These models define the joint probability of a sequence of symbols and - their labels (state transitions) as the product of the starting state - probability, the probability of each state transition, and the probability - of each observation being generated from each state. This is described in - more detail in the module documentation. - - This implementation is based on the HMM description in Chapter 8, Huang, - Acero and Hon, Spoken Language Processing and includes an extension for - training shallow HMM parsers or specialized HMMs as in Molina et. - al, 2002. A specialized HMM modifies training data by applying a - specialization function to create a new training set that is more - appropriate for sequential tagging with an HMM. A typical use case is - chunking. - - :param symbols: the set of output symbols (alphabet) - :type symbols: seq of any - :param states: a set of states representing state space - :type states: seq of any - :param transitions: transition probabilities; Pr(s_i | s_j) is the - probability of transition from state i given the model is in - state_j - :type transitions: ConditionalProbDistI - :param outputs: output probabilities; Pr(o_k | s_i) is the probability - of emitting symbol k when entering state i - :type outputs: ConditionalProbDistI - :param priors: initial state distribution; Pr(s_i) is the probability - of starting in state i - :type priors: ProbDistI - :param transform: an optional function for transforming training - instances, defaults to the identity function. 
- :type transform: callable - """ - - def __init__( - self, symbols, states, transitions, outputs, priors, transform=_identity - ): - self._symbols = unique_list(symbols) - self._states = unique_list(states) - self._transitions = transitions - self._outputs = outputs - self._priors = priors - self._cache = None - self._transform = transform - - @classmethod - def _train( - cls, - labeled_sequence, - test_sequence=None, - unlabeled_sequence=None, - transform=_identity, - estimator=None, - **kwargs, - ): - - if estimator is None: - - def estimator(fd, bins): - return LidstoneProbDist(fd, 0.1, bins) - - labeled_sequence = LazyMap(transform, labeled_sequence) - symbols = unique_list(word for sent in labeled_sequence for word, tag in sent) - tag_set = unique_list(tag for sent in labeled_sequence for word, tag in sent) - - trainer = HiddenMarkovModelTrainer(tag_set, symbols) - hmm = trainer.train_supervised(labeled_sequence, estimator=estimator) - hmm = cls( - hmm._symbols, - hmm._states, - hmm._transitions, - hmm._outputs, - hmm._priors, - transform=transform, - ) - - if test_sequence: - hmm.test(test_sequence, verbose=kwargs.get("verbose", False)) - - if unlabeled_sequence: - max_iterations = kwargs.get("max_iterations", 5) - hmm = trainer.train_unsupervised( - unlabeled_sequence, model=hmm, max_iterations=max_iterations - ) - if test_sequence: - hmm.test(test_sequence, verbose=kwargs.get("verbose", False)) - - return hmm - - @classmethod - def train( - cls, labeled_sequence, test_sequence=None, unlabeled_sequence=None, **kwargs - ): - """ - Train a new HiddenMarkovModelTagger using the given labeled and - unlabeled training instances. Testing will be performed if test - instances are provided. - - :return: a hidden markov model tagger - :rtype: HiddenMarkovModelTagger - :param labeled_sequence: a sequence of labeled training instances, - i.e. a list of sentences represented as tuples - :type labeled_sequence: list(list) - :param test_sequence: a sequence of labeled test instances - :type test_sequence: list(list) - :param unlabeled_sequence: a sequence of unlabeled training instances, - i.e. a list of sentences represented as words - :type unlabeled_sequence: list(list) - :param transform: an optional function for transforming training - instances, defaults to the identity function, see ``transform()`` - :type transform: function - :param estimator: an optional function or class that maps a - condition's frequency distribution to its probability - distribution, defaults to a Lidstone distribution with gamma = 0.1 - :type estimator: class or function - :param verbose: boolean flag indicating whether training should be - verbose or include printed output - :type verbose: bool - :param max_iterations: number of Baum-Welch iterations to perform - :type max_iterations: int - """ - return cls._train(labeled_sequence, test_sequence, unlabeled_sequence, **kwargs) - - def probability(self, sequence): - """ - Returns the probability of the given symbol sequence. If the sequence - is labelled, then returns the joint probability of the symbol, state - sequence. Otherwise, uses the forward algorithm to find the - probability over all label sequences. 
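[Editor's note, not part of the removed file: the forward algorithm mentioned just above is easiest to see on a toy model. The two-state "weather" HMM below, its probability tables, and the observation sequence are all invented for illustration; the deleted `_forward_probability` performs the same recursion in log2 space via `logsumexp2` to avoid underflow.]

```python
# Illustrative sketch (assumed toy numbers, not from the deleted module):
# the forward algorithm on a hypothetical two-state HMM.
import numpy as np

states = ["Rainy", "Sunny"]          # hidden states
symbols = ["walk", "shop", "clean"]  # observable symbols

priors = np.array([0.6, 0.4])                 # P(s_0)
trans = np.array([[0.7, 0.3],                 # P(s_t | s_{t-1})
                  [0.4, 0.6]])
emit = np.array([[0.1, 0.4, 0.5],             # P(o_t | s_t)
                 [0.6, 0.3, 0.1]])

obs = [symbols.index(o) for o in ["walk", "shop", "clean"]]

# alpha[t, i] = P(o_0..o_t, s_t = i): sums over every state path ending in i.
alpha = np.zeros((len(obs), len(states)))
alpha[0] = priors * emit[:, obs[0]]
for t in range(1, len(obs)):
    alpha[t] = (alpha[t - 1] @ trans) * emit[:, obs[t]]

print("P(observations) =", alpha[-1].sum())   # marginal over all label sequences
```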
- - :return: the probability of the sequence - :rtype: float - :param sequence: the sequence of symbols which must contain the TEXT - property, and optionally the TAG property - :type sequence: Token - """ - return 2 ** (self.log_probability(self._transform(sequence))) - - def log_probability(self, sequence): - """ - Returns the log-probability of the given symbol sequence. If the - sequence is labelled, then returns the joint log-probability of the - symbol, state sequence. Otherwise, uses the forward algorithm to find - the log-probability over all label sequences. - - :return: the log-probability of the sequence - :rtype: float - :param sequence: the sequence of symbols which must contain the TEXT - property, and optionally the TAG property - :type sequence: Token - """ - sequence = self._transform(sequence) - - T = len(sequence) - - if T > 0 and sequence[0][_TAG]: - last_state = sequence[0][_TAG] - p = self._priors.logprob(last_state) + self._output_logprob( - last_state, sequence[0][_TEXT] - ) - for t in range(1, T): - state = sequence[t][_TAG] - p += self._transitions[last_state].logprob( - state - ) + self._output_logprob(state, sequence[t][_TEXT]) - last_state = state - return p - else: - alpha = self._forward_probability(sequence) - p = logsumexp2(alpha[T - 1]) - return p - - def tag(self, unlabeled_sequence): - """ - Tags the sequence with the highest probability state sequence. This - uses the best_path method to find the Viterbi path. - - :return: a labelled sequence of symbols - :rtype: list - :param unlabeled_sequence: the sequence of unlabeled symbols - :type unlabeled_sequence: list - """ - unlabeled_sequence = self._transform(unlabeled_sequence) - return self._tag(unlabeled_sequence) - - def _tag(self, unlabeled_sequence): - path = self._best_path(unlabeled_sequence) - return list(zip(unlabeled_sequence, path)) - - def _output_logprob(self, state, symbol): - """ - :return: the log probability of the symbol being observed in the given - state - :rtype: float - """ - return self._outputs[state].logprob(symbol) - - def _create_cache(self): - """ - The cache is a tuple (P, O, X, S) where: - - - S maps symbols to integers. 
I.e., it is the inverse - mapping from self._symbols; for each symbol s in - self._symbols, the following is true:: - - self._symbols[S[s]] == s - - - O is the log output probabilities:: - - O[i,k] = log( P(token[t]=sym[k]|tag[t]=state[i]) ) - - - X is the log transition probabilities:: - - X[i,j] = log( P(tag[t]=state[j]|tag[t-1]=state[i]) ) - - - P is the log prior probabilities:: - - P[i] = log( P(tag[0]=state[i]) ) - """ - if not self._cache: - N = len(self._states) - M = len(self._symbols) - P = np.zeros(N, np.float32) - X = np.zeros((N, N), np.float32) - O = np.zeros((N, M), np.float32) - for i in range(N): - si = self._states[i] - P[i] = self._priors.logprob(si) - for j in range(N): - X[i, j] = self._transitions[si].logprob(self._states[j]) - for k in range(M): - O[i, k] = self._output_logprob(si, self._symbols[k]) - S = {} - for k in range(M): - S[self._symbols[k]] = k - self._cache = (P, O, X, S) - - def _update_cache(self, symbols): - # add new symbols to the symbol table and repopulate the output - # probabilities and symbol table mapping - if symbols: - self._create_cache() - P, O, X, S = self._cache - for symbol in symbols: - if symbol not in self._symbols: - self._cache = None - self._symbols.append(symbol) - # don't bother with the work if there aren't any new symbols - if not self._cache: - N = len(self._states) - M = len(self._symbols) - Q = O.shape[1] - # add new columns to the output probability table without - # destroying the old probabilities - O = np.hstack([O, np.zeros((N, M - Q), np.float32)]) - for i in range(N): - si = self._states[i] - # only calculate probabilities for new symbols - for k in range(Q, M): - O[i, k] = self._output_logprob(si, self._symbols[k]) - # only create symbol mappings for new symbols - for k in range(Q, M): - S[self._symbols[k]] = k - self._cache = (P, O, X, S) - - def reset_cache(self): - self._cache = None - - def best_path(self, unlabeled_sequence): - """ - Returns the state sequence of the optimal (most probable) path through - the HMM. Uses the Viterbi algorithm to calculate this part by dynamic - programming. - - :return: the state sequence - :rtype: sequence of any - :param unlabeled_sequence: the sequence of unlabeled symbols - :type unlabeled_sequence: list - """ - unlabeled_sequence = self._transform(unlabeled_sequence) - return self._best_path(unlabeled_sequence) - - def _best_path(self, unlabeled_sequence): - T = len(unlabeled_sequence) - N = len(self._states) - self._create_cache() - self._update_cache(unlabeled_sequence) - P, O, X, S = self._cache - - V = np.zeros((T, N), np.float32) - B = -np.ones((T, N), int) - - V[0] = P + O[:, S[unlabeled_sequence[0]]] - for t in range(1, T): - for j in range(N): - vs = V[t - 1, :] + X[:, j] - best = np.argmax(vs) - V[t, j] = vs[best] + O[j, S[unlabeled_sequence[t]]] - B[t, j] = best - - current = np.argmax(V[T - 1, :]) - sequence = [current] - for t in range(T - 1, 0, -1): - last = B[t, current] - sequence.append(last) - current = last - - sequence.reverse() - return list(map(self._states.__getitem__, sequence)) - - def best_path_simple(self, unlabeled_sequence): - """ - Returns the state sequence of the optimal (most probable) path through - the HMM. Uses the Viterbi algorithm to calculate this part by dynamic - programming. This uses a simple, direct method, and is included for - teaching purposes. 
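[Editor's note, not part of the removed file: `best_path`/`best_path_simple` above implement Viterbi decoding. A minimal sketch of the same dynamic program, reusing the invented two-state tables from the forward-algorithm sketch above:]

```python
# Illustrative sketch (assumed toy numbers, not from the deleted module):
# Viterbi decoding on the same hypothetical two-state HMM.
import numpy as np

states = ["Rainy", "Sunny"]
symbols = ["walk", "shop", "clean"]
priors = np.array([0.6, 0.4])
trans = np.array([[0.7, 0.3], [0.4, 0.6]])
emit = np.array([[0.1, 0.4, 0.5], [0.6, 0.3, 0.1]])
obs = [symbols.index(o) for o in ["walk", "shop", "clean"]]

T, N = len(obs), len(states)
V = np.zeros((T, N))             # V[t, j] = best score of any path ending in j at t
B = np.zeros((T, N), dtype=int)  # back-pointers to the best predecessor state

V[0] = priors * emit[:, obs[0]]
for t in range(1, T):
    scores = V[t - 1][:, None] * trans       # scores[i, j] = V[t-1, i] * P(j | i)
    B[t] = scores.argmax(axis=0)
    V[t] = scores.max(axis=0) * emit[:, obs[t]]

# Trace the back-pointers from the best final state.
path = [int(V[-1].argmax())]
for t in range(T - 1, 0, -1):
    path.append(int(B[t, path[-1]]))
print([states[i] for i in reversed(path)])   # e.g. ['Sunny', 'Rainy', 'Rainy']
```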
- - :return: the state sequence - :rtype: sequence of any - :param unlabeled_sequence: the sequence of unlabeled symbols - :type unlabeled_sequence: list - """ - unlabeled_sequence = self._transform(unlabeled_sequence) - return self._best_path_simple(unlabeled_sequence) - - def _best_path_simple(self, unlabeled_sequence): - T = len(unlabeled_sequence) - N = len(self._states) - V = np.zeros((T, N), np.float64) - B = {} - - # find the starting log probabilities for each state - symbol = unlabeled_sequence[0] - for i, state in enumerate(self._states): - V[0, i] = self._priors.logprob(state) + self._output_logprob(state, symbol) - B[0, state] = None - - # find the maximum log probabilities for reaching each state at time t - for t in range(1, T): - symbol = unlabeled_sequence[t] - for j in range(N): - sj = self._states[j] - best = None - for i in range(N): - si = self._states[i] - va = V[t - 1, i] + self._transitions[si].logprob(sj) - if not best or va > best[0]: - best = (va, si) - V[t, j] = best[0] + self._output_logprob(sj, symbol) - B[t, sj] = best[1] - - # find the highest probability final state - best = None - for i in range(N): - val = V[T - 1, i] - if not best or val > best[0]: - best = (val, self._states[i]) - - # traverse the back-pointers B to find the state sequence - current = best[1] - sequence = [current] - for t in range(T - 1, 0, -1): - last = B[t, current] - sequence.append(last) - current = last - - sequence.reverse() - return sequence - - def random_sample(self, rng, length): - """ - Randomly sample the HMM to generate a sentence of a given length. This - samples the prior distribution then the observation distribution and - transition distribution for each subsequent observation and state. - This will mostly generate unintelligible garbage, but can provide some - amusement. - - :return: the randomly created state/observation sequence, - generated according to the HMM's probability - distributions. The SUBTOKENS have TEXT and TAG - properties containing the observation and state - respectively. - :rtype: list - :param rng: random number generator - :type rng: Random (or any object with a random() method) - :param length: desired output length - :type length: int - """ - - # sample the starting state and symbol prob dists - tokens = [] - state = self._sample_probdist(self._priors, rng.random(), self._states) - symbol = self._sample_probdist( - self._outputs[state], rng.random(), self._symbols - ) - tokens.append((symbol, state)) - - for i in range(1, length): - # sample the state transition and symbol prob dists - state = self._sample_probdist( - self._transitions[state], rng.random(), self._states - ) - symbol = self._sample_probdist( - self._outputs[state], rng.random(), self._symbols - ) - tokens.append((symbol, state)) - - return tokens - - def _sample_probdist(self, probdist, p, samples): - cum_p = 0 - for sample in samples: - add_p = probdist.prob(sample) - if cum_p <= p <= cum_p + add_p: - return sample - cum_p += add_p - raise Exception("Invalid probability distribution - " "does not sum to one") - - def entropy(self, unlabeled_sequence): - """ - Returns the entropy over labellings of the given sequence. This is - given by:: - - H(O) = - sum_S Pr(S | O) log Pr(S | O) - - where the summation ranges over all state sequences, S. Let - *Z = Pr(O) = sum_S Pr(S, O)}* where the summation ranges over all state - sequences and O is the observation sequence. 
As such the entropy can - be re-expressed as:: - - H = - sum_S Pr(S | O) log [ Pr(S, O) / Z ] - = log Z - sum_S Pr(S | O) log Pr(S, 0) - = log Z - sum_S Pr(S | O) [ log Pr(S_0) + sum_t Pr(S_t | S_{t-1}) + sum_t Pr(O_t | S_t) ] - - The order of summation for the log terms can be flipped, allowing - dynamic programming to be used to calculate the entropy. Specifically, - we use the forward and backward probabilities (alpha, beta) giving:: - - H = log Z - sum_s0 alpha_0(s0) beta_0(s0) / Z * log Pr(s0) - + sum_t,si,sj alpha_t(si) Pr(sj | si) Pr(O_t+1 | sj) beta_t(sj) / Z * log Pr(sj | si) - + sum_t,st alpha_t(st) beta_t(st) / Z * log Pr(O_t | st) - - This simply uses alpha and beta to find the probabilities of partial - sequences, constrained to include the given state(s) at some point in - time. - """ - unlabeled_sequence = self._transform(unlabeled_sequence) - - T = len(unlabeled_sequence) - N = len(self._states) - - alpha = self._forward_probability(unlabeled_sequence) - beta = self._backward_probability(unlabeled_sequence) - normalisation = logsumexp2(alpha[T - 1]) - - entropy = normalisation - - # starting state, t = 0 - for i, state in enumerate(self._states): - p = 2 ** (alpha[0, i] + beta[0, i] - normalisation) - entropy -= p * self._priors.logprob(state) - # print('p(s_0 = %s) =' % state, p) - - # state transitions - for t0 in range(T - 1): - t1 = t0 + 1 - for i0, s0 in enumerate(self._states): - for i1, s1 in enumerate(self._states): - p = 2 ** ( - alpha[t0, i0] - + self._transitions[s0].logprob(s1) - + self._outputs[s1].logprob(unlabeled_sequence[t1][_TEXT]) - + beta[t1, i1] - - normalisation - ) - entropy -= p * self._transitions[s0].logprob(s1) - # print('p(s_%d = %s, s_%d = %s) =' % (t0, s0, t1, s1), p) - - # symbol emissions - for t in range(T): - for i, state in enumerate(self._states): - p = 2 ** (alpha[t, i] + beta[t, i] - normalisation) - entropy -= p * self._outputs[state].logprob( - unlabeled_sequence[t][_TEXT] - ) - # print('p(s_%d = %s) =' % (t, state), p) - - return entropy - - def point_entropy(self, unlabeled_sequence): - """ - Returns the pointwise entropy over the possible states at each - position in the chain, given the observation sequence. 
- """ - unlabeled_sequence = self._transform(unlabeled_sequence) - - T = len(unlabeled_sequence) - N = len(self._states) - - alpha = self._forward_probability(unlabeled_sequence) - beta = self._backward_probability(unlabeled_sequence) - normalisation = logsumexp2(alpha[T - 1]) - - entropies = np.zeros(T, np.float64) - probs = np.zeros(N, np.float64) - for t in range(T): - for s in range(N): - probs[s] = alpha[t, s] + beta[t, s] - normalisation - - for s in range(N): - entropies[t] -= 2 ** (probs[s]) * probs[s] - - return entropies - - def _exhaustive_entropy(self, unlabeled_sequence): - unlabeled_sequence = self._transform(unlabeled_sequence) - - T = len(unlabeled_sequence) - N = len(self._states) - - labellings = [[state] for state in self._states] - for t in range(T - 1): - current = labellings - labellings = [] - for labelling in current: - for state in self._states: - labellings.append(labelling + [state]) - - log_probs = [] - for labelling in labellings: - labeled_sequence = unlabeled_sequence[:] - for t, label in enumerate(labelling): - labeled_sequence[t] = (labeled_sequence[t][_TEXT], label) - lp = self.log_probability(labeled_sequence) - log_probs.append(lp) - normalisation = _log_add(*log_probs) - - entropy = 0 - for lp in log_probs: - lp -= normalisation - entropy -= 2 ** (lp) * lp - - return entropy - - def _exhaustive_point_entropy(self, unlabeled_sequence): - unlabeled_sequence = self._transform(unlabeled_sequence) - - T = len(unlabeled_sequence) - N = len(self._states) - - labellings = [[state] for state in self._states] - for t in range(T - 1): - current = labellings - labellings = [] - for labelling in current: - for state in self._states: - labellings.append(labelling + [state]) - - log_probs = [] - for labelling in labellings: - labelled_sequence = unlabeled_sequence[:] - for t, label in enumerate(labelling): - labelled_sequence[t] = (labelled_sequence[t][_TEXT], label) - lp = self.log_probability(labelled_sequence) - log_probs.append(lp) - - normalisation = _log_add(*log_probs) - - probabilities = _ninf_array((T, N)) - - for labelling, lp in zip(labellings, log_probs): - lp -= normalisation - for t, label in enumerate(labelling): - index = self._states.index(label) - probabilities[t, index] = _log_add(probabilities[t, index], lp) - - entropies = np.zeros(T, np.float64) - for t in range(T): - for s in range(N): - entropies[t] -= 2 ** (probabilities[t, s]) * probabilities[t, s] - - return entropies - - def _transitions_matrix(self): - """Return a matrix of transition log probabilities.""" - trans_iter = ( - self._transitions[sj].logprob(si) - for sj in self._states - for si in self._states - ) - - transitions_logprob = np.fromiter(trans_iter, dtype=np.float64) - N = len(self._states) - return transitions_logprob.reshape((N, N)).T - - def _outputs_vector(self, symbol): - """ - Return a vector with log probabilities of emitting a symbol - when entering states. - """ - out_iter = (self._output_logprob(sj, symbol) for sj in self._states) - return np.fromiter(out_iter, dtype=np.float64) - - def _forward_probability(self, unlabeled_sequence): - """ - Return the forward probability matrix, a T by N array of - log-probabilities, where T is the length of the sequence and N is the - number of states. Each entry (t, s) gives the probability of being in - state s at time t after observing the partial symbol sequence up to - and including t. 
- - :param unlabeled_sequence: the sequence of unlabeled symbols - :type unlabeled_sequence: list - :return: the forward log probability matrix - :rtype: array - """ - T = len(unlabeled_sequence) - N = len(self._states) - alpha = _ninf_array((T, N)) - - transitions_logprob = self._transitions_matrix() - - # Initialization - symbol = unlabeled_sequence[0][_TEXT] - for i, state in enumerate(self._states): - alpha[0, i] = self._priors.logprob(state) + self._output_logprob( - state, symbol - ) - - # Induction - for t in range(1, T): - symbol = unlabeled_sequence[t][_TEXT] - output_logprob = self._outputs_vector(symbol) - - for i in range(N): - summand = alpha[t - 1] + transitions_logprob[i] - alpha[t, i] = logsumexp2(summand) + output_logprob[i] - - return alpha - - def _backward_probability(self, unlabeled_sequence): - """ - Return the backward probability matrix, a T by N array of - log-probabilities, where T is the length of the sequence and N is the - number of states. Each entry (t, s) gives the probability of being in - state s at time t after observing the partial symbol sequence from t - .. T. - - :return: the backward log probability matrix - :rtype: array - :param unlabeled_sequence: the sequence of unlabeled symbols - :type unlabeled_sequence: list - """ - T = len(unlabeled_sequence) - N = len(self._states) - beta = _ninf_array((T, N)) - - transitions_logprob = self._transitions_matrix().T - - # initialise the backward values; - # "1" is an arbitrarily chosen value from Rabiner tutorial - beta[T - 1, :] = np.log2(1) - - # inductively calculate remaining backward values - for t in range(T - 2, -1, -1): - symbol = unlabeled_sequence[t + 1][_TEXT] - outputs = self._outputs_vector(symbol) - - for i in range(N): - summand = transitions_logprob[i] + beta[t + 1] + outputs - beta[t, i] = logsumexp2(summand) - - return beta - - def test(self, test_sequence, verbose=False, **kwargs): - """ - Tests the HiddenMarkovModelTagger instance. - - :param test_sequence: a sequence of labeled test instances - :type test_sequence: list(list) - :param verbose: boolean flag indicating whether training should be - verbose or include printed output - :type verbose: bool - """ - - def words(sent): - return [word for (word, tag) in sent] - - def tags(sent): - return [tag for (word, tag) in sent] - - def flatten(seq): - return list(itertools.chain(*seq)) - - test_sequence = self._transform(test_sequence) - predicted_sequence = list(map(self._tag, map(words, test_sequence))) - - if verbose: - for test_sent, predicted_sent in zip(test_sequence, predicted_sequence): - print( - "Test:", - " ".join(f"{token}/{tag}" for (token, tag) in test_sent), - ) - print() - print("Untagged:", " ".join("%s" % token for (token, tag) in test_sent)) - print() - print( - "HMM-tagged:", - " ".join(f"{token}/{tag}" for (token, tag) in predicted_sent), - ) - print() - print( - "Entropy:", - self.entropy([(token, None) for (token, tag) in predicted_sent]), - ) - print() - print("-" * 60) - - test_tags = flatten(map(tags, test_sequence)) - predicted_tags = flatten(map(tags, predicted_sequence)) - - acc = accuracy(test_tags, predicted_tags) - count = sum(len(sent) for sent in test_sequence) - print("accuracy over %d tokens: %.2f" % (count, acc * 100)) - - def __repr__(self): - return "" % ( - len(self._states), - len(self._symbols), - ) - - -class HiddenMarkovModelTrainer: - """ - Algorithms for learning HMM parameters from training data. These include - both supervised learning (MLE) and unsupervised learning (Baum-Welch). 
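[Editor's note, not part of the removed file: this trainer mirrors the copy that ships with NLTK itself, so equivalent behaviour remains available from upstream `nltk.tag.hmm`. A minimal supervised-training sketch, assuming NLTK is installed and the Brown corpus has been downloaded:]

```python
# Illustrative sketch (not part of the diff): supervised HMM training with the
# upstream nltk.tag.hmm module. Assumes `pip install nltk` and
# nltk.download('brown') have been run.
from nltk.corpus import brown
from nltk.tag.hmm import HiddenMarkovModelTrainer

train_sents = brown.tagged_sents(categories="news")[:500]  # small slice for speed
trainer = HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(train_sents)             # MLE estimates by default

print(tagger.tag("the jury said an investigation took place".split()))
```

In practice a smoothed estimator handles unseen words better than plain MLE; the demo code further down in this same deleted file passes a `LidstoneProbDist` estimator for exactly that reason.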
- - Creates an HMM trainer to induce an HMM with the given states and - output symbol alphabet. A supervised and unsupervised training - method may be used. If either of the states or symbols are not given, - these may be derived from supervised training. - - :param states: the set of state labels - :type states: sequence of any - :param symbols: the set of observation symbols - :type symbols: sequence of any - """ - - def __init__(self, states=None, symbols=None): - self._states = states if states else [] - self._symbols = symbols if symbols else [] - - def train(self, labeled_sequences=None, unlabeled_sequences=None, **kwargs): - """ - Trains the HMM using both (or either of) supervised and unsupervised - techniques. - - :return: the trained model - :rtype: HiddenMarkovModelTagger - :param labelled_sequences: the supervised training data, a set of - labelled sequences of observations - ex: [ (word_1, tag_1),...,(word_n,tag_n) ] - :type labelled_sequences: list - :param unlabeled_sequences: the unsupervised training data, a set of - sequences of observations - ex: [ word_1, ..., word_n ] - :type unlabeled_sequences: list - :param kwargs: additional arguments to pass to the training methods - """ - assert labeled_sequences or unlabeled_sequences - model = None - if labeled_sequences: - model = self.train_supervised(labeled_sequences, **kwargs) - if unlabeled_sequences: - if model: - kwargs["model"] = model - model = self.train_unsupervised(unlabeled_sequences, **kwargs) - return model - - def _baum_welch_step(self, sequence, model, symbol_to_number): - - N = len(model._states) - M = len(model._symbols) - T = len(sequence) - - # compute forward and backward probabilities - alpha = model._forward_probability(sequence) - beta = model._backward_probability(sequence) - - # find the log probability of the sequence - lpk = logsumexp2(alpha[T - 1]) - - A_numer = _ninf_array((N, N)) - B_numer = _ninf_array((N, M)) - A_denom = _ninf_array(N) - B_denom = _ninf_array(N) - - transitions_logprob = model._transitions_matrix().T - - for t in range(T): - symbol = sequence[t][_TEXT] # not found? FIXME - next_symbol = None - if t < T - 1: - next_symbol = sequence[t + 1][_TEXT] # not found? FIXME - xi = symbol_to_number[symbol] - - next_outputs_logprob = model._outputs_vector(next_symbol) - alpha_plus_beta = alpha[t] + beta[t] - - if t < T - 1: - numer_add = ( - transitions_logprob - + next_outputs_logprob - + beta[t + 1] - + alpha[t].reshape(N, 1) - ) - A_numer = np.logaddexp2(A_numer, numer_add) - A_denom = np.logaddexp2(A_denom, alpha_plus_beta) - else: - B_denom = np.logaddexp2(A_denom, alpha_plus_beta) - - B_numer[:, xi] = np.logaddexp2(B_numer[:, xi], alpha_plus_beta) - - return lpk, A_numer, A_denom, B_numer, B_denom - - def train_unsupervised(self, unlabeled_sequences, update_outputs=True, **kwargs): - """ - Trains the HMM using the Baum-Welch algorithm to maximise the - probability of the data sequence. This is a variant of the EM - algorithm, and is unsupervised in that it doesn't need the state - sequences for the symbols. The code is based on 'A Tutorial on Hidden - Markov Models and Selected Applications in Speech Recognition', - Lawrence Rabiner, IEEE, 1989. 
- - :return: the trained model - :rtype: HiddenMarkovModelTagger - :param unlabeled_sequences: the training data, a set of - sequences of observations - :type unlabeled_sequences: list - - kwargs may include following parameters: - - :param model: a HiddenMarkovModelTagger instance used to begin - the Baum-Welch algorithm - :param max_iterations: the maximum number of EM iterations - :param convergence_logprob: the maximum change in log probability to - allow convergence - """ - - # create a uniform HMM, which will be iteratively refined, unless - # given an existing model - model = kwargs.get("model") - if not model: - priors = RandomProbDist(self._states) - transitions = DictionaryConditionalProbDist( - {state: RandomProbDist(self._states) for state in self._states} - ) - outputs = DictionaryConditionalProbDist( - {state: RandomProbDist(self._symbols) for state in self._states} - ) - model = HiddenMarkovModelTagger( - self._symbols, self._states, transitions, outputs, priors - ) - - self._states = model._states - self._symbols = model._symbols - - N = len(self._states) - M = len(self._symbols) - symbol_numbers = {sym: i for i, sym in enumerate(self._symbols)} - - # update model prob dists so that they can be modified - # model._priors = MutableProbDist(model._priors, self._states) - - model._transitions = DictionaryConditionalProbDist( - { - s: MutableProbDist(model._transitions[s], self._states) - for s in self._states - } - ) - - if update_outputs: - model._outputs = DictionaryConditionalProbDist( - { - s: MutableProbDist(model._outputs[s], self._symbols) - for s in self._states - } - ) - - model.reset_cache() - - # iterate until convergence - converged = False - last_logprob = None - iteration = 0 - max_iterations = kwargs.get("max_iterations", 1000) - epsilon = kwargs.get("convergence_logprob", 1e-6) - - while not converged and iteration < max_iterations: - A_numer = _ninf_array((N, N)) - B_numer = _ninf_array((N, M)) - A_denom = _ninf_array(N) - B_denom = _ninf_array(N) - - logprob = 0 - for sequence in unlabeled_sequences: - sequence = list(sequence) - if not sequence: - continue - - ( - lpk, - seq_A_numer, - seq_A_denom, - seq_B_numer, - seq_B_denom, - ) = self._baum_welch_step(sequence, model, symbol_numbers) - - # add these sums to the global A and B values - for i in range(N): - A_numer[i] = np.logaddexp2(A_numer[i], seq_A_numer[i] - lpk) - B_numer[i] = np.logaddexp2(B_numer[i], seq_B_numer[i] - lpk) - - A_denom = np.logaddexp2(A_denom, seq_A_denom - lpk) - B_denom = np.logaddexp2(B_denom, seq_B_denom - lpk) - - logprob += lpk - - # use the calculated values to update the transition and output - # probability values - for i in range(N): - logprob_Ai = A_numer[i] - A_denom[i] - logprob_Bi = B_numer[i] - B_denom[i] - - # We should normalize all probabilities (see p.391 Huang et al) - # Let sum(P) be K. - # We can divide each Pi by K to make sum(P) == 1. - # Pi' = Pi/K - # log2(Pi') = log2(Pi) - log2(K) - logprob_Ai -= logsumexp2(logprob_Ai) - logprob_Bi -= logsumexp2(logprob_Bi) - - # update output and transition probabilities - si = self._states[i] - - for j in range(N): - sj = self._states[j] - model._transitions[si].update(sj, logprob_Ai[j]) - - if update_outputs: - for k in range(M): - ok = self._symbols[k] - model._outputs[si].update(ok, logprob_Bi[k]) - - # Rabiner says the priors don't need to be updated. I don't - # believe him. 
FIXME - - # test for convergence - if iteration > 0 and abs(logprob - last_logprob) < epsilon: - converged = True - - print("iteration", iteration, "logprob", logprob) - iteration += 1 - last_logprob = logprob - - return model - - def train_supervised(self, labelled_sequences, estimator=None): - """ - Supervised training maximising the joint probability of the symbol and - state sequences. This is done via collecting frequencies of - transitions between states, symbol observations while within each - state and which states start a sentence. These frequency distributions - are then normalised into probability estimates, which can be - smoothed if desired. - - :return: the trained model - :rtype: HiddenMarkovModelTagger - :param labelled_sequences: the training data, a set of - labelled sequences of observations - :type labelled_sequences: list - :param estimator: a function taking - a FreqDist and a number of bins and returning a CProbDistI; - otherwise a MLE estimate is used - """ - - # default to the MLE estimate - if estimator is None: - estimator = lambda fdist, bins: MLEProbDist(fdist) - - # count occurrences of starting states, transitions out of each state - # and output symbols observed in each state - known_symbols = set(self._symbols) - known_states = set(self._states) - - starting = FreqDist() - transitions = ConditionalFreqDist() - outputs = ConditionalFreqDist() - for sequence in labelled_sequences: - lasts = None - for token in sequence: - state = token[_TAG] - symbol = token[_TEXT] - if lasts is None: - starting[state] += 1 - else: - transitions[lasts][state] += 1 - outputs[state][symbol] += 1 - lasts = state - - # update the state and symbol lists - if state not in known_states: - self._states.append(state) - known_states.add(state) - - if symbol not in known_symbols: - self._symbols.append(symbol) - known_symbols.add(symbol) - - # create probability distributions (with smoothing) - N = len(self._states) - pi = estimator(starting, N) - A = ConditionalProbDist(transitions, estimator, N) - B = ConditionalProbDist(outputs, estimator, len(self._symbols)) - - return HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi) - - -def _ninf_array(shape): - res = np.empty(shape, np.float64) - res.fill(-np.inf) - return res - - -def logsumexp2(arr): - max_ = arr.max() - return np.log2(np.sum(2 ** (arr - max_))) + max_ - - -def _log_add(*values): - """ - Adds the logged values, returning the logarithm of the addition. 
- """ - x = max(values) - if x > -np.inf: - sum_diffs = 0 - for value in values: - sum_diffs += 2 ** (value - x) - return x + np.log2(sum_diffs) - else: - return x - - -def _create_hmm_tagger(states, symbols, A, B, pi): - def pd(values, samples): - d = dict(zip(samples, values)) - return DictionaryProbDist(d) - - def cpd(array, conditions, samples): - d = {} - for values, condition in zip(array, conditions): - d[condition] = pd(values, samples) - return DictionaryConditionalProbDist(d) - - A = cpd(A, states, states) - B = cpd(B, states, symbols) - pi = pd(pi, states) - return HiddenMarkovModelTagger( - symbols=symbols, states=states, transitions=A, outputs=B, priors=pi - ) - - -def _market_hmm_example(): - """ - Return an example HMM (described at page 381, Huang et al) - """ - states = ["bull", "bear", "static"] - symbols = ["up", "down", "unchanged"] - A = np.array([[0.6, 0.2, 0.2], [0.5, 0.3, 0.2], [0.4, 0.1, 0.5]], np.float64) - B = np.array([[0.7, 0.1, 0.2], [0.1, 0.6, 0.3], [0.3, 0.3, 0.4]], np.float64) - pi = np.array([0.5, 0.2, 0.3], np.float64) - - model = _create_hmm_tagger(states, symbols, A, B, pi) - return model, states, symbols - - -def demo(): - # demonstrates HMM probability calculation - - print() - print("HMM probability calculation demo") - print() - - model, states, symbols = _market_hmm_example() - - print("Testing", model) - - for test in [ - ["up", "up"], - ["up", "down", "up"], - ["down"] * 5, - ["unchanged"] * 5 + ["up"], - ]: - - sequence = [(t, None) for t in test] - - print("Testing with state sequence", test) - print("probability =", model.probability(sequence)) - print("tagging = ", model.tag([word for (word, tag) in sequence])) - print("p(tagged) = ", model.probability(sequence)) - print("H = ", model.entropy(sequence)) - print("H_exh = ", model._exhaustive_entropy(sequence)) - print("H(point) = ", model.point_entropy(sequence)) - print("H_exh(point)=", model._exhaustive_point_entropy(sequence)) - print() - - -def load_pos(num_sents): - from nltk.corpus import brown - - sentences = brown.tagged_sents(categories="news")[:num_sents] - - tag_re = re.compile(r"[*]|--|[^+*-]+") - tag_set = set() - symbols = set() - - cleaned_sentences = [] - for sentence in sentences: - for i in range(len(sentence)): - word, tag = sentence[i] - word = word.lower() # normalize - symbols.add(word) # log this word - # Clean up the tag. - tag = tag_re.match(tag).group() - tag_set.add(tag) - sentence[i] = (word, tag) # store cleaned-up tagged token - cleaned_sentences += [sentence] - - return cleaned_sentences, list(tag_set), list(symbols) - - -def demo_pos(): - # demonstrates POS tagging using supervised training - - print() - print("HMM POS tagging demo") - print() - - print("Training HMM...") - labelled_sequences, tag_set, symbols = load_pos(20000) - trainer = HiddenMarkovModelTrainer(tag_set, symbols) - hmm = trainer.train_supervised( - labelled_sequences[10:], - estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins), - ) - - print("Testing...") - hmm.test(labelled_sequences[:10], verbose=True) - - -def _untag(sentences): - unlabeled = [] - for sentence in sentences: - unlabeled.append([(token[_TEXT], None) for token in sentence]) - return unlabeled - - -def demo_pos_bw( - test=10, supervised=20, unsupervised=10, verbose=True, max_iterations=5 -): - # demonstrates the Baum-Welch algorithm in POS tagging - - print() - print("Baum-Welch demo for POS tagging") - print() - - print("Training HMM (supervised, %d sentences)..." 
% supervised) - - sentences, tag_set, symbols = load_pos(test + supervised + unsupervised) - - symbols = set() - for sentence in sentences: - for token in sentence: - symbols.add(token[_TEXT]) - - trainer = HiddenMarkovModelTrainer(tag_set, list(symbols)) - hmm = trainer.train_supervised( - sentences[test : test + supervised], - estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins), - ) - - hmm.test(sentences[:test], verbose=verbose) - - print("Training (unsupervised, %d sentences)..." % unsupervised) - # it's rather slow - so only use 10 samples by default - unlabeled = _untag(sentences[test + supervised :]) - hmm = trainer.train_unsupervised( - unlabeled, model=hmm, max_iterations=max_iterations - ) - hmm.test(sentences[:test], verbose=verbose) - - -def demo_bw(): - # demo Baum Welch by generating some sequences and then performing - # unsupervised training on them - - print() - print("Baum-Welch demo for market example") - print() - - model, states, symbols = _market_hmm_example() - - # generate some random sequences - training = [] - import random - - rng = random.Random() - rng.seed(0) - for i in range(10): - item = model.random_sample(rng, 5) - training.append([(i[0], None) for i in item]) - - # train on those examples, starting with the model that generated them - trainer = HiddenMarkovModelTrainer(states, symbols) - hmm = trainer.train_unsupervised(training, model=model, max_iterations=1000) diff --git a/pipeline/nltk/tag/hunpos.py b/pipeline/nltk/tag/hunpos.py deleted file mode 100644 index e001c6d6dbc1257515ed1149abe6bab06f1c7337..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tag/hunpos.py +++ /dev/null @@ -1,142 +0,0 @@ -# Natural Language Toolkit: Interface to the HunPos POS-tagger -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Peter Ljunglöf -# Dávid Márk Nemeskey (modifications) -# Attila Zséder (modifications) -# URL: -# For license information, see LICENSE.TXT - -""" -A module for interfacing with the HunPos open-source POS-tagger. -""" - -import os -from subprocess import PIPE, Popen - -from nltk.internals import find_binary, find_file -from nltk.tag.api import TaggerI - -_hunpos_url = "https://code.google.com/p/hunpos/" - -_hunpos_charset = "ISO-8859-1" -"""The default encoding used by hunpos: ISO-8859-1.""" - - -class HunposTagger(TaggerI): - """ - A class for pos tagging with HunPos. The input is the paths to: - - a model trained on training data - - (optionally) the path to the hunpos-tag binary - - (optionally) the encoding of the training data (default: ISO-8859-1) - - Check whether the required "hunpos-tag" binary is available: - - >>> from nltk.test.setup_fixt import check_binary - >>> check_binary('hunpos-tag') - - Example: - >>> from nltk.tag import HunposTagger - >>> ht = HunposTagger('en_wsj.model') - >>> ht.tag('What is the airspeed of an unladen swallow ?'.split()) - [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'VB'), ('?', '.')] - >>> ht.close() - - This class communicates with the hunpos-tag binary via pipes. When the - tagger object is no longer needed, the close() method should be called to - free system resources. The class supports the context manager interface; if - used in a with statement, the close() method is invoked automatically: - - >>> with HunposTagger('en_wsj.model') as ht: - ... ht.tag('What is the airspeed of an unladen swallow ?'.split()) - ... 
- [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'VB'), ('?', '.')] - """ - - def __init__( - self, path_to_model, path_to_bin=None, encoding=_hunpos_charset, verbose=False - ): - """ - Starts the hunpos-tag executable and establishes a connection with it. - - :param path_to_model: The model file. - :param path_to_bin: The hunpos-tag binary. - :param encoding: The encoding used by the model. Unicode tokens - passed to the tag() and tag_sents() methods are converted to - this charset when they are sent to hunpos-tag. - The default is ISO-8859-1 (Latin-1). - - This parameter is ignored for str tokens, which are sent as-is. - The caller must ensure that tokens are encoded in the right charset. - """ - self._closed = True - hunpos_paths = [ - ".", - "/usr/bin", - "/usr/local/bin", - "/opt/local/bin", - "/Applications/bin", - "~/bin", - "~/Applications/bin", - ] - hunpos_paths = list(map(os.path.expanduser, hunpos_paths)) - - self._hunpos_bin = find_binary( - "hunpos-tag", - path_to_bin, - env_vars=("HUNPOS_TAGGER",), - searchpath=hunpos_paths, - url=_hunpos_url, - verbose=verbose, - ) - - self._hunpos_model = find_file( - path_to_model, env_vars=("HUNPOS_TAGGER",), verbose=verbose - ) - self._encoding = encoding - self._hunpos = Popen( - [self._hunpos_bin, self._hunpos_model], - shell=False, - stdin=PIPE, - stdout=PIPE, - stderr=PIPE, - ) - self._closed = False - - def __del__(self): - self.close() - - def close(self): - """Closes the pipe to the hunpos executable.""" - if not self._closed: - self._hunpos.communicate() - self._closed = True - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - self.close() - - def tag(self, tokens): - """Tags a single sentence: a list of words. - The tokens should not contain any newline characters. - """ - for token in tokens: - assert "\n" not in token, "Tokens should not contain newlines" - if isinstance(token, str): - token = token.encode(self._encoding) - self._hunpos.stdin.write(token + b"\n") - # We write a final empty line to tell hunpos that the sentence is finished: - self._hunpos.stdin.write(b"\n") - self._hunpos.stdin.flush() - - tagged_tokens = [] - for token in tokens: - tagged = self._hunpos.stdout.readline().strip().split(b"\t") - tag = tagged[1] if len(tagged) > 1 else None - tagged_tokens.append((token, tag)) - # We have to read (and dismiss) the final empty line: - self._hunpos.stdout.readline() - - return tagged_tokens diff --git a/pipeline/nltk/tag/mapping.py b/pipeline/nltk/tag/mapping.py deleted file mode 100644 index 0af1a0eef945b3cfb2bb3a5860b223a42dbaeae7..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tag/mapping.py +++ /dev/null @@ -1,136 +0,0 @@ -# Natural Language Toolkit: Tagset Mapping -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Nathan Schneider -# Steven Bird -# URL: -# For license information, see LICENSE.TXT - -""" -Interface for converting POS tags from various treebanks -to the universal tagset of Petrov, Das, & McDonald. - -The tagset consists of the following 12 coarse tags: - -VERB - verbs (all tenses and modes) -NOUN - nouns (common and proper) -PRON - pronouns -ADJ - adjectives -ADV - adverbs -ADP - adpositions (prepositions and postpositions) -CONJ - conjunctions -DET - determiners -NUM - cardinal numbers -PRT - particles or other function words -X - other: foreign words, typos, abbreviations -. 
- punctuation - -@see: https://arxiv.org/abs/1104.2086 and https://code.google.com/p/universal-pos-tags/ - -""" - -from collections import defaultdict -from os.path import join - -from nltk.data import load - -_UNIVERSAL_DATA = "taggers/universal_tagset" -_UNIVERSAL_TAGS = ( - "VERB", - "NOUN", - "PRON", - "ADJ", - "ADV", - "ADP", - "CONJ", - "DET", - "NUM", - "PRT", - "X", - ".", -) - -# _MAPPINGS = defaultdict(lambda: defaultdict(dict)) -# the mapping between tagset T1 and T2 returns UNK if applied to an unrecognized tag -_MAPPINGS = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: "UNK"))) - - -def _load_universal_map(fileid): - contents = load(join(_UNIVERSAL_DATA, fileid + ".map"), format="text") - - # When mapping to the Universal Tagset, - # map unknown inputs to 'X' not 'UNK' - _MAPPINGS[fileid]["universal"].default_factory = lambda: "X" - - for line in contents.splitlines(): - line = line.strip() - if line == "": - continue - fine, coarse = line.split("\t") - - assert coarse in _UNIVERSAL_TAGS, f"Unexpected coarse tag: {coarse}" - assert ( - fine not in _MAPPINGS[fileid]["universal"] - ), f"Multiple entries for original tag: {fine}" - - _MAPPINGS[fileid]["universal"][fine] = coarse - - -def tagset_mapping(source, target): - """ - Retrieve the mapping dictionary between tagsets. - - >>> tagset_mapping('ru-rnc', 'universal') == {'!': '.', 'A': 'ADJ', 'C': 'CONJ', 'AD': 'ADV',\ - 'NN': 'NOUN', 'VG': 'VERB', 'COMP': 'CONJ', 'NC': 'NUM', 'VP': 'VERB', 'P': 'ADP',\ - 'IJ': 'X', 'V': 'VERB', 'Z': 'X', 'VI': 'VERB', 'YES_NO_SENT': 'X', 'PTCL': 'PRT'} - True - """ - - if source not in _MAPPINGS or target not in _MAPPINGS[source]: - if target == "universal": - _load_universal_map(source) - # Added the new Russian National Corpus mappings because the - # Russian model for nltk.pos_tag() uses it. - _MAPPINGS["ru-rnc-new"]["universal"] = { - "A": "ADJ", - "A-PRO": "PRON", - "ADV": "ADV", - "ADV-PRO": "PRON", - "ANUM": "ADJ", - "CONJ": "CONJ", - "INTJ": "X", - "NONLEX": ".", - "NUM": "NUM", - "PARENTH": "PRT", - "PART": "PRT", - "PR": "ADP", - "PRAEDIC": "PRT", - "PRAEDIC-PRO": "PRON", - "S": "NOUN", - "S-PRO": "PRON", - "V": "VERB", - } - - return _MAPPINGS[source][target] - - -def map_tag(source, target, source_tag): - """ - Maps the tag from the source tagset to the target tagset. - - >>> map_tag('en-ptb', 'universal', 'VBZ') - 'VERB' - >>> map_tag('en-ptb', 'universal', 'VBP') - 'VERB' - >>> map_tag('en-ptb', 'universal', '``') - '.' - """ - - # we need a systematic approach to naming - if target == "universal": - if source == "wsj": - source = "en-ptb" - if source == "brown": - source = "en-brown" - - return tagset_mapping(source, target)[source_tag] diff --git a/pipeline/nltk/tag/perceptron.py b/pipeline/nltk/tag/perceptron.py deleted file mode 100644 index 9afe08f0c8d6a9d5852a225e6c9569a291fb1e3d..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tag/perceptron.py +++ /dev/null @@ -1,371 +0,0 @@ -# This module is a port of the Textblob Averaged Perceptron Tagger -# Author: Matthew Honnibal , -# Long Duong (NLTK port) -# URL: -# -# Copyright 2013 Matthew Honnibal -# NLTK modifications Copyright 2015 The NLTK Project -# -# This module is provided under the terms of the MIT License. 
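The tagset-mapping code above is essentially a nested lookup table with an "unknown" fallback: fine-grained treebank tags map onto the twelve coarse universal tags, and unseen tags fall back to `X` when the target is the universal set. A rough standalone sketch of that behaviour with `defaultdict`; the two Penn Treebank entries are illustrative, not the full `en-ptb.map` file, and `to_universal` is a hypothetical helper, not the library's `map_tag`:

```python
from collections import defaultdict

# Fine-grained -> universal tag map; unknown fine tags default to "X",
# mirroring how _MAPPINGS[...]["universal"] behaves after _load_universal_map.
en_ptb_to_universal = defaultdict(lambda: "X", {
    "VBZ": "VERB",
    "NN": "NOUN",
    "``": ".",
})

def to_universal(fine_tag):
    return en_ptb_to_universal[fine_tag]

print(to_universal("VBZ"))      # VERB
print(to_universal("FOO-TAG"))  # X  (unseen tag falls back to the catch-all class)
```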
- -import logging -import pickle -import random -from collections import defaultdict - -from nltk import jsontags -from nltk.data import find, load -from nltk.tag.api import TaggerI - -try: - import numpy as np -except ImportError: - pass - -PICKLE = "averaged_perceptron_tagger.pickle" - - -@jsontags.register_tag -class AveragedPerceptron: - - """An averaged perceptron, as implemented by Matthew Honnibal. - - See more implementation details here: - https://explosion.ai/blog/part-of-speech-pos-tagger-in-python - """ - - json_tag = "nltk.tag.perceptron.AveragedPerceptron" - - def __init__(self, weights=None): - # Each feature gets its own weight vector, so weights is a dict-of-dicts - self.weights = weights if weights else {} - self.classes = set() - # The accumulated values, for the averaging. These will be keyed by - # feature/clas tuples - self._totals = defaultdict(int) - # The last time the feature was changed, for the averaging. Also - # keyed by feature/clas tuples - # (tstamps is short for timestamps) - self._tstamps = defaultdict(int) - # Number of instances seen - self.i = 0 - - def _softmax(self, scores): - s = np.fromiter(scores.values(), dtype=float) - exps = np.exp(s) - return exps / np.sum(exps) - - def predict(self, features, return_conf=False): - """Dot-product the features and current weights and return the best label.""" - scores = defaultdict(float) - for feat, value in features.items(): - if feat not in self.weights or value == 0: - continue - weights = self.weights[feat] - for label, weight in weights.items(): - scores[label] += value * weight - - # Do a secondary alphabetic sort, for stability - best_label = max(self.classes, key=lambda label: (scores[label], label)) - # compute the confidence - conf = max(self._softmax(scores)) if return_conf == True else None - - return best_label, conf - - def update(self, truth, guess, features): - """Update the feature weights.""" - - def upd_feat(c, f, w, v): - param = (f, c) - self._totals[param] += (self.i - self._tstamps[param]) * w - self._tstamps[param] = self.i - self.weights[f][c] = w + v - - self.i += 1 - if truth == guess: - return None - for f in features: - weights = self.weights.setdefault(f, {}) - upd_feat(truth, f, weights.get(truth, 0.0), 1.0) - upd_feat(guess, f, weights.get(guess, 0.0), -1.0) - - def average_weights(self): - """Average weights from all iterations.""" - for feat, weights in self.weights.items(): - new_feat_weights = {} - for clas, weight in weights.items(): - param = (feat, clas) - total = self._totals[param] - total += (self.i - self._tstamps[param]) * weight - averaged = round(total / self.i, 3) - if averaged: - new_feat_weights[clas] = averaged - self.weights[feat] = new_feat_weights - - def save(self, path): - """Save the pickled model weights.""" - with open(path, "wb") as fout: - return pickle.dump(dict(self.weights), fout) - - def load(self, path): - """Load the pickled model weights.""" - self.weights = load(path) - - def encode_json_obj(self): - return self.weights - - @classmethod - def decode_json_obj(cls, obj): - return cls(obj) - - -@jsontags.register_tag -class PerceptronTagger(TaggerI): - - """ - Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal. - See more implementation details here: - https://explosion.ai/blog/part-of-speech-pos-tagger-in-python - - >>> from nltk.tag.perceptron import PerceptronTagger - - Train the model - - >>> tagger = PerceptronTagger(load=False) - - >>> tagger.train([[('today','NN'),('is','VBZ'),('good','JJ'),('day','NN')], - ... 
[('yes','NNS'),('it','PRP'),('beautiful','JJ')]]) - - >>> tagger.tag(['today','is','a','beautiful','day']) - [('today', 'NN'), ('is', 'PRP'), ('a', 'PRP'), ('beautiful', 'JJ'), ('day', 'NN')] - - Use the pretrain model (the default constructor) - - >>> pretrain = PerceptronTagger() - - >>> pretrain.tag('The quick brown fox jumps over the lazy dog'.split()) - [('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')] - - >>> pretrain.tag("The red cat".split()) - [('The', 'DT'), ('red', 'JJ'), ('cat', 'NN')] - """ - - json_tag = "nltk.tag.sequential.PerceptronTagger" - - START = ["-START-", "-START2-"] - END = ["-END-", "-END2-"] - - def __init__(self, load=True): - """ - :param load: Load the pickled model upon instantiation. - """ - self.model = AveragedPerceptron() - self.tagdict = {} - self.classes = set() - if load: - AP_MODEL_LOC = "file:" + str( - find("taggers/averaged_perceptron_tagger/" + PICKLE) - ) - self.load(AP_MODEL_LOC) - - def tag(self, tokens, return_conf=False, use_tagdict=True): - """ - Tag tokenized sentences. - :params tokens: list of word - :type tokens: list(str) - """ - prev, prev2 = self.START - output = [] - - context = self.START + [self.normalize(w) for w in tokens] + self.END - for i, word in enumerate(tokens): - tag, conf = ( - (self.tagdict.get(word), 1.0) if use_tagdict == True else (None, None) - ) - if not tag: - features = self._get_features(i, word, context, prev, prev2) - tag, conf = self.model.predict(features, return_conf) - output.append((word, tag, conf) if return_conf == True else (word, tag)) - - prev2 = prev - prev = tag - - return output - - def train(self, sentences, save_loc=None, nr_iter=5): - """Train a model from sentences, and save it at ``save_loc``. ``nr_iter`` - controls the number of Perceptron training iterations. - - :param sentences: A list or iterator of sentences, where each sentence - is a list of (words, tags) tuples. - :param save_loc: If not ``None``, saves a pickled model in this location. - :param nr_iter: Number of training iterations. - """ - # We'd like to allow ``sentences`` to be either a list or an iterator, - # the latter being especially important for a large training dataset. - # Because ``self._make_tagdict(sentences)`` runs regardless, we make - # it populate ``self._sentences`` (a list) with all the sentences. - # This saves the overheard of just iterating through ``sentences`` to - # get the list by ``sentences = list(sentences)``. - - self._sentences = list() # to be populated by self._make_tagdict... - self._make_tagdict(sentences) - self.model.classes = self.classes - for iter_ in range(nr_iter): - c = 0 - n = 0 - for sentence in self._sentences: - words, tags = zip(*sentence) - - prev, prev2 = self.START - context = self.START + [self.normalize(w) for w in words] + self.END - for i, word in enumerate(words): - guess = self.tagdict.get(word) - if not guess: - feats = self._get_features(i, word, context, prev, prev2) - guess, _ = self.model.predict(feats) - self.model.update(tags[i], guess, feats) - prev2 = prev - prev = guess - c += guess == tags[i] - n += 1 - random.shuffle(self._sentences) - logging.info(f"Iter {iter_}: {c}/{n}={_pc(c, n)}") - - # We don't need the training sentences anymore, and we don't want to - # waste space on them when we pickle the trained tagger. 
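The training loop above is the standard averaged-perceptron recipe: score each tag as a dot product of sparse features against per-class weights, then reward the gold tag and penalise the wrong guess. A stripped-down sketch of just that predict/update pair (no weight averaging, no tag dictionary), with invented feature names:

```python
from collections import defaultdict

weights = defaultdict(lambda: defaultdict(float))  # feature -> class -> weight
classes = {"NN", "VB", "DT"}

def predict(features):
    """Score = sum of weights for active features; ties broken by label for stability."""
    scores = defaultdict(float)
    for feat, value in features.items():
        if feat not in weights:
            continue
        for label, w in weights[feat].items():
            scores[label] += value * w
    return max(classes, key=lambda label: (scores[label], label))

def update(truth, guess, features):
    """Reward the correct class, penalise the mistaken one (perceptron rule)."""
    if truth == guess:
        return
    for feat in features:
        weights[feat][truth] += 1.0
        weights[feat][guess] -= 1.0

feats = {"bias": 1, "suffix=og": 1, "prev_tag=DT": 1}
guess = predict(feats)       # an uninformed guess before any updates
update("NN", guess, feats)
print(predict(feats))        # now "NN"
```

In the real tagger the same loop runs for several iterations over shuffled sentences, and `average_weights` then replaces each weight with its running average to reduce overfitting.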
- self._sentences = None - - self.model.average_weights() - # Pickle as a binary file - if save_loc is not None: - with open(save_loc, "wb") as fout: - # changed protocol from -1 to 2 to make pickling Python 2 compatible - pickle.dump((self.model.weights, self.tagdict, self.classes), fout, 2) - - def load(self, loc): - """ - :param loc: Load a pickled model at location. - :type loc: str - """ - - self.model.weights, self.tagdict, self.classes = load(loc) - self.model.classes = self.classes - - def encode_json_obj(self): - return self.model.weights, self.tagdict, list(self.classes) - - @classmethod - def decode_json_obj(cls, obj): - tagger = cls(load=False) - tagger.model.weights, tagger.tagdict, tagger.classes = obj - tagger.classes = set(tagger.classes) - tagger.model.classes = tagger.classes - return tagger - - def normalize(self, word): - """ - Normalization used in pre-processing. - - All words are lower cased - - Groups of digits of length 4 are represented as !YEAR; - - Other digits are represented as !DIGITS - - :rtype: str - """ - if "-" in word and word[0] != "-": - return "!HYPHEN" - if word.isdigit() and len(word) == 4: - return "!YEAR" - if word and word[0].isdigit(): - return "!DIGITS" - return word.lower() - - def _get_features(self, i, word, context, prev, prev2): - """Map tokens into a feature representation, implemented as a - {hashable: int} dict. If the features change, a new model must be - trained. - """ - - def add(name, *args): - features[" ".join((name,) + tuple(args))] += 1 - - i += len(self.START) - features = defaultdict(int) - # It's useful to have a constant feature, which acts sort of like a prior - add("bias") - add("i suffix", word[-3:]) - add("i pref1", word[0] if word else "") - add("i-1 tag", prev) - add("i-2 tag", prev2) - add("i tag+i-2 tag", prev, prev2) - add("i word", context[i]) - add("i-1 tag+i word", prev, context[i]) - add("i-1 word", context[i - 1]) - add("i-1 suffix", context[i - 1][-3:]) - add("i-2 word", context[i - 2]) - add("i+1 word", context[i + 1]) - add("i+1 suffix", context[i + 1][-3:]) - add("i+2 word", context[i + 2]) - return features - - def _make_tagdict(self, sentences): - """ - Make a tag dictionary for single-tag words. - :param sentences: A list of list of (word, tag) tuples. 
- """ - counts = defaultdict(lambda: defaultdict(int)) - for sentence in sentences: - self._sentences.append(sentence) - for word, tag in sentence: - counts[word][tag] += 1 - self.classes.add(tag) - freq_thresh = 20 - ambiguity_thresh = 0.97 - for word, tag_freqs in counts.items(): - tag, mode = max(tag_freqs.items(), key=lambda item: item[1]) - n = sum(tag_freqs.values()) - # Don't add rare words to the tag dictionary - # Only add quite unambiguous words - if n >= freq_thresh and (mode / n) >= ambiguity_thresh: - self.tagdict[word] = tag - - -def _pc(n, d): - return (n / d) * 100 - - -def _load_data_conll_format(filename): - print("Read from file: ", filename) - with open(filename, "rb") as fin: - sentences = [] - sentence = [] - for line in fin.readlines(): - line = line.strip() - # print line - if len(line) == 0: - sentences.append(sentence) - sentence = [] - continue - tokens = line.split("\t") - word = tokens[1] - tag = tokens[4] - sentence.append((word, tag)) - return sentences - - -def _get_pretrain_model(): - # Train and test on English part of ConLL data (WSJ part of Penn Treebank) - # Train: section 2-11 - # Test : section 23 - tagger = PerceptronTagger() - training = _load_data_conll_format("english_ptb_train.conll") - testing = _load_data_conll_format("english_ptb_test.conll") - print("Size of training and testing (sentence)", len(training), len(testing)) - # Train and save the model - tagger.train(training, PICKLE) - print("Accuracy : ", tagger.accuracy(testing)) - - -if __name__ == "__main__": - # _get_pretrain_model() - pass diff --git a/pipeline/nltk/tag/senna.py b/pipeline/nltk/tag/senna.py deleted file mode 100644 index 7b52b7ee0a7bc01614c3a2a397a6ffce47835999..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tag/senna.py +++ /dev/null @@ -1,134 +0,0 @@ -# Natural Language Toolkit: Senna POS Tagger -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Rami Al-Rfou' -# URL: -# For license information, see LICENSE.TXT - -""" -Senna POS tagger, NER Tagger, Chunk Tagger - -The input is: - -- path to the directory that contains SENNA executables. 
If the path is incorrect, - SennaTagger will automatically search for executable file specified in SENNA environment variable -- (optionally) the encoding of the input data (default:utf-8) - -Note: Unit tests for this module can be found in test/unit/test_senna.py - ->>> from nltk.tag import SennaTagger ->>> tagger = SennaTagger('/usr/share/senna-v3.0') # doctest: +SKIP ->>> tagger.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP -[('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), -('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'NN'), ('?', '.')] - ->>> from nltk.tag import SennaChunkTagger ->>> chktagger = SennaChunkTagger('/usr/share/senna-v3.0') # doctest: +SKIP ->>> chktagger.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP -[('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'), -('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'), -('?', 'O')] - ->>> from nltk.tag import SennaNERTagger ->>> nertagger = SennaNERTagger('/usr/share/senna-v3.0') # doctest: +SKIP ->>> nertagger.tag('Shakespeare theatre was in London .'.split()) # doctest: +SKIP -[('Shakespeare', 'B-PER'), ('theatre', 'O'), ('was', 'O'), ('in', 'O'), -('London', 'B-LOC'), ('.', 'O')] ->>> nertagger.tag('UN headquarters are in NY , USA .'.split()) # doctest: +SKIP -[('UN', 'B-ORG'), ('headquarters', 'O'), ('are', 'O'), ('in', 'O'), -('NY', 'B-LOC'), (',', 'O'), ('USA', 'B-LOC'), ('.', 'O')] -""" - -from nltk.classify import Senna - - -class SennaTagger(Senna): - def __init__(self, path, encoding="utf-8"): - super().__init__(path, ["pos"], encoding) - - def tag_sents(self, sentences): - """ - Applies the tag method over a list of sentences. This method will return - for each sentence a list of tuples of (word, tag). - """ - tagged_sents = super().tag_sents(sentences) - for i in range(len(tagged_sents)): - for j in range(len(tagged_sents[i])): - annotations = tagged_sents[i][j] - tagged_sents[i][j] = (annotations["word"], annotations["pos"]) - return tagged_sents - - -class SennaChunkTagger(Senna): - def __init__(self, path, encoding="utf-8"): - super().__init__(path, ["chk"], encoding) - - def tag_sents(self, sentences): - """ - Applies the tag method over a list of sentences. This method will return - for each sentence a list of tuples of (word, tag). - """ - tagged_sents = super().tag_sents(sentences) - for i in range(len(tagged_sents)): - for j in range(len(tagged_sents[i])): - annotations = tagged_sents[i][j] - tagged_sents[i][j] = (annotations["word"], annotations["chk"]) - return tagged_sents - - def bio_to_chunks(self, tagged_sent, chunk_type): - """ - Extracts the chunks in a BIO chunk-tagged sentence. - - >>> from nltk.tag import SennaChunkTagger - >>> chktagger = SennaChunkTagger('/usr/share/senna-v3.0') # doctest: +SKIP - >>> sent = 'What is the airspeed of an unladen swallow ?'.split() - >>> tagged_sent = chktagger.tag(sent) # doctest: +SKIP - >>> tagged_sent # doctest: +SKIP - [('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'), - ('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'), - ('?', 'O')] - >>> list(chktagger.bio_to_chunks(tagged_sent, chunk_type='NP')) # doctest: +SKIP - [('What', '0'), ('the airspeed', '2-3'), ('an unladen swallow', '5-6-7')] - - :param tagged_sent: A list of tuples of word and BIO chunk tag. - :type tagged_sent: list(tuple) - :param tagged_sent: The chunk tag that users want to extract, e.g. 
'NP' or 'VP' - :type tagged_sent: str - - :return: An iterable of tuples of chunks that users want to extract - and their corresponding indices. - :rtype: iter(tuple(str)) - """ - current_chunk = [] - current_chunk_position = [] - for idx, word_pos in enumerate(tagged_sent): - word, pos = word_pos - if "-" + chunk_type in pos: # Append the word to the current_chunk. - current_chunk.append(word) - current_chunk_position.append(idx) - else: - if current_chunk: # Flush the full chunk when out of an NP. - _chunk_str = " ".join(current_chunk) - _chunk_pos_str = "-".join(map(str, current_chunk_position)) - yield _chunk_str, _chunk_pos_str - current_chunk = [] - current_chunk_position = [] - if current_chunk: # Flush the last chunk. - yield " ".join(current_chunk), "-".join(map(str, current_chunk_position)) - - -class SennaNERTagger(Senna): - def __init__(self, path, encoding="utf-8"): - super().__init__(path, ["ner"], encoding) - - def tag_sents(self, sentences): - """ - Applies the tag method over a list of sentences. This method will return - for each sentence a list of tuples of (word, tag). - """ - tagged_sents = super().tag_sents(sentences) - for i in range(len(tagged_sents)): - for j in range(len(tagged_sents[i])): - annotations = tagged_sents[i][j] - tagged_sents[i][j] = (annotations["word"], annotations["ner"]) - return tagged_sents diff --git a/pipeline/nltk/tag/sequential.py b/pipeline/nltk/tag/sequential.py deleted file mode 100644 index 3fb85c9fade8079ad5fd4ba7a517939741cb2440..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tag/sequential.py +++ /dev/null @@ -1,755 +0,0 @@ -# Natural Language Toolkit: Sequential Backoff Taggers -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird (minor additions) -# Tiago Tresoldi (original affix tagger) -# URL: -# For license information, see LICENSE.TXT - -""" -Classes for tagging sentences sequentially, left to right. The -abstract base class SequentialBackoffTagger serves as the base -class for all the taggers in this module. Tagging of individual words -is performed by the method ``choose_tag()``, which is defined by -subclasses of SequentialBackoffTagger. If a tagger is unable to -determine a tag for the specified token, then its backoff tagger is -consulted instead. Any SequentialBackoffTagger may serve as a -backoff tagger for any other SequentialBackoffTagger. -""" -import ast -import re -from abc import abstractmethod -from typing import List, Optional, Tuple - -from nltk import jsontags -from nltk.classify import NaiveBayesClassifier -from nltk.probability import ConditionalFreqDist -from nltk.tag.api import FeaturesetTaggerI, TaggerI - - -###################################################################### -# Abstract Base Classes -###################################################################### -class SequentialBackoffTagger(TaggerI): - """ - An abstract base class for taggers that tags words sequentially, - left to right. Tagging of individual words is performed by the - ``choose_tag()`` method, which should be defined by subclasses. If - a tagger is unable to determine a tag for the specified token, - then its backoff tagger is consulted. - - :ivar _taggers: A list of all the taggers that should be tried to - tag a token (i.e., self and its backoff taggers). 
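The backoff mechanism described above is a simple chain: each tagger either returns a tag or `None`, and `None` hands the token to the next tagger in `self._taggers`. A hypothetical two-stage sketch of that control flow, ignoring the index/history arguments the real `choose_tag` interface threads through:

```python
def lookup_tagger(word):
    """First stage: a small hand-built lexicon; None means "I don't know"."""
    return {"the": "DT", "dog": "NN"}.get(word)

def default_tagger(word):
    """Last-resort backoff: tag everything as a noun, like DefaultTagger('NN')."""
    return "NN"

def tag(words, taggers):
    """Try each tagger in order until one returns a non-None tag."""
    tagged = []
    for word in words:
        tag_ = None
        for tagger in taggers:
            tag_ = tagger(word)
            if tag_ is not None:
                break
        tagged.append((word, tag_))
    return tagged

print(tag(["the", "dog", "barks"], [lookup_tagger, default_tagger]))
# [('the', 'DT'), ('dog', 'NN'), ('barks', 'NN')]
```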
- """ - - def __init__(self, backoff=None): - if backoff is None: - self._taggers = [self] - else: - self._taggers = [self] + backoff._taggers - - @property - def backoff(self): - """The backoff tagger for this tagger.""" - return self._taggers[1] if len(self._taggers) > 1 else None - - def tag(self, tokens): - # docs inherited from TaggerI - tags = [] - for i in range(len(tokens)): - tags.append(self.tag_one(tokens, i, tags)) - return list(zip(tokens, tags)) - - def tag_one(self, tokens, index, history): - """ - Determine an appropriate tag for the specified token, and - return that tag. If this tagger is unable to determine a tag - for the specified token, then its backoff tagger is consulted. - - :rtype: str - :type tokens: list - :param tokens: The list of words that are being tagged. - :type index: int - :param index: The index of the word whose tag should be - returned. - :type history: list(str) - :param history: A list of the tags for all words before *index*. - """ - tag = None - for tagger in self._taggers: - tag = tagger.choose_tag(tokens, index, history) - if tag is not None: - break - return tag - - @abstractmethod - def choose_tag(self, tokens, index, history): - """ - Decide which tag should be used for the specified token, and - return that tag. If this tagger is unable to determine a tag - for the specified token, return None -- do not consult - the backoff tagger. This method should be overridden by - subclasses of SequentialBackoffTagger. - - :rtype: str - :type tokens: list - :param tokens: The list of words that are being tagged. - :type index: int - :param index: The index of the word whose tag should be - returned. - :type history: list(str) - :param history: A list of the tags for all words before *index*. - """ - - -class ContextTagger(SequentialBackoffTagger): - """ - An abstract base class for sequential backoff taggers that choose - a tag for a token based on the value of its "context". Different - subclasses are used to define different contexts. - - A ContextTagger chooses the tag for a token by calculating the - token's context, and looking up the corresponding tag in a table. - This table can be constructed manually; or it can be automatically - constructed based on a training corpus, using the ``_train()`` - factory method. - - :ivar _context_to_tag: Dictionary mapping contexts to tags. - """ - - def __init__(self, context_to_tag, backoff=None): - """ - :param context_to_tag: A dictionary mapping contexts to tags. - :param backoff: The backoff tagger that should be used for this tagger. - """ - super().__init__(backoff) - self._context_to_tag = context_to_tag if context_to_tag else {} - - @abstractmethod - def context(self, tokens, index, history): - """ - :return: the context that should be used to look up the tag - for the specified token; or None if the specified token - should not be handled by this tagger. - :rtype: (hashable) - """ - - def choose_tag(self, tokens, index, history): - context = self.context(tokens, index, history) - return self._context_to_tag.get(context) - - def size(self): - """ - :return: The number of entries in the table used by this - tagger to map from contexts to tags. - """ - return len(self._context_to_tag) - - def __repr__(self): - return f"<{self.__class__.__name__}: size={self.size()}>" - - def _train(self, tagged_corpus, cutoff=0, verbose=False): - """ - Initialize this ContextTagger's ``_context_to_tag`` table - based on the given training data. 
In particular, for each - context ``c`` in the training data, set - ``_context_to_tag[c]`` to the most frequent tag for that - context. However, exclude any contexts that are already - tagged perfectly by the backoff tagger(s). - - The old value of ``self._context_to_tag`` (if any) is discarded. - - :param tagged_corpus: A tagged corpus. Each item should be - a list of (word, tag tuples. - :param cutoff: If the most likely tag for a context occurs - fewer than cutoff times, then exclude it from the - context-to-tag table for the new tagger. - """ - - token_count = hit_count = 0 - - # A context is considered 'useful' if it's not already tagged - # perfectly by the backoff tagger. - useful_contexts = set() - - # Count how many times each tag occurs in each context. - fd = ConditionalFreqDist() - for sentence in tagged_corpus: - tokens, tags = zip(*sentence) - for index, (token, tag) in enumerate(sentence): - # Record the event. - token_count += 1 - context = self.context(tokens, index, tags[:index]) - if context is None: - continue - fd[context][tag] += 1 - # If the backoff got it wrong, this context is useful: - if self.backoff is None or tag != self.backoff.tag_one( - tokens, index, tags[:index] - ): - useful_contexts.add(context) - - # Build the context_to_tag table -- for each context, figure - # out what the most likely tag is. Only include contexts that - # we've seen at least `cutoff` times. - for context in useful_contexts: - best_tag = fd[context].max() - hits = fd[context][best_tag] - if hits > cutoff: - self._context_to_tag[context] = best_tag - hit_count += hits - - # Display some stats, if requested. - if verbose: - size = len(self._context_to_tag) - backoff = 100 - (hit_count * 100.0) / token_count - pruning = 100 - (size * 100.0) / len(fd.conditions()) - print("[Trained Unigram tagger:", end=" ") - print( - "size={}, backoff={:.2f}%, pruning={:.2f}%]".format( - size, backoff, pruning - ) - ) - - -###################################################################### -# Tagger Classes -###################################################################### - - -@jsontags.register_tag -class DefaultTagger(SequentialBackoffTagger): - """ - A tagger that assigns the same tag to every token. - - >>> from nltk.tag import DefaultTagger - >>> default_tagger = DefaultTagger('NN') - >>> list(default_tagger.tag('This is a test'.split())) - [('This', 'NN'), ('is', 'NN'), ('a', 'NN'), ('test', 'NN')] - - This tagger is recommended as a backoff tagger, in cases where - a more powerful tagger is unable to assign a tag to the word - (e.g. because the word was not seen during training). - - :param tag: The tag to assign to each token - :type tag: str - """ - - json_tag = "nltk.tag.sequential.DefaultTagger" - - def __init__(self, tag): - self._tag = tag - super().__init__(None) - - def encode_json_obj(self): - return self._tag - - @classmethod - def decode_json_obj(cls, obj): - tag = obj - return cls(tag) - - def choose_tag(self, tokens, index, history): - return self._tag # ignore token and history - - def __repr__(self): - return f"" - - -@jsontags.register_tag -class NgramTagger(ContextTagger): - """ - A tagger that chooses a token's tag based on its word string and - on the preceding n word's tags. In particular, a tuple - (tags[i-n:i-1], words[i]) is looked up in a table, and the - corresponding tag is returned. N-gram taggers are typically - trained on a tagged corpus. - - Train a new NgramTagger using the given training data or - the supplied model. 
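The `_train` method above boils down to: group training tokens by context, keep the most frequent tag for each context, and prune contexts whose best tag occurs no more than `cutoff` times. A minimal sketch of that table construction, using the word itself as the context (the unigram case) on a toy corpus, and leaving out the "useful contexts" filtering against the backoff tagger:

```python
from collections import Counter, defaultdict

def build_context_table(tagged_sentences, cutoff=0):
    """Map each context (here: the word) to its most frequent tag."""
    counts = defaultdict(Counter)
    for sentence in tagged_sentences:
        for word, tag in sentence:
            counts[word][tag] += 1
    table = {}
    for context, tag_counts in counts.items():
        best_tag, hits = tag_counts.most_common(1)[0]
        if hits > cutoff:                # mirrors the "hits > cutoff" pruning above
            table[context] = best_tag
    return table

corpus = [[("the", "DT"), ("saw", "NN")],
          [("they", "PRP"), ("saw", "VBD")],
          [("a", "DT"), ("saw", "NN")]]
print(build_context_table(corpus))
# {'the': 'DT', 'saw': 'NN', 'they': 'PRP', 'a': 'DT'}
```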
In particular, construct a new tagger - whose table maps from each context (tag[i-n:i-1], word[i]) - to the most frequent tag for that context. But exclude any - contexts that are already tagged perfectly by the backoff - tagger. - - :param train: A tagged corpus consisting of a list of tagged - sentences, where each sentence is a list of (word, tag) tuples. - :param backoff: A backoff tagger, to be used by the new - tagger if it encounters an unknown context. - :param cutoff: If the most likely tag for a context occurs - fewer than *cutoff* times, then exclude it from the - context-to-tag table for the new tagger. - """ - - json_tag = "nltk.tag.sequential.NgramTagger" - - def __init__( - self, n, train=None, model=None, backoff=None, cutoff=0, verbose=False - ): - self._n = n - self._check_params(train, model) - - super().__init__(model, backoff) - - if train: - self._train(train, cutoff, verbose) - - def encode_json_obj(self): - _context_to_tag = {repr(k): v for k, v in self._context_to_tag.items()} - if "NgramTagger" in self.__class__.__name__: - return self._n, _context_to_tag, self.backoff - else: - return _context_to_tag, self.backoff - - @classmethod - def decode_json_obj(cls, obj): - try: - _n, _context_to_tag, backoff = obj - except ValueError: - _context_to_tag, backoff = obj - - if not _context_to_tag: - return backoff - - _context_to_tag = {ast.literal_eval(k): v for k, v in _context_to_tag.items()} - - if "NgramTagger" in cls.__name__: - return cls(_n, model=_context_to_tag, backoff=backoff) - else: - return cls(model=_context_to_tag, backoff=backoff) - - def context(self, tokens, index, history): - tag_context = tuple(history[max(0, index - self._n + 1) : index]) - return tag_context, tokens[index] - - -@jsontags.register_tag -class UnigramTagger(NgramTagger): - """ - Unigram Tagger - - The UnigramTagger finds the most likely tag for each word in a training - corpus, and then uses that information to assign tags to new tokens. - - >>> from nltk.corpus import brown - >>> from nltk.tag import UnigramTagger - >>> test_sent = brown.sents(categories='news')[0] - >>> unigram_tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500]) - >>> for tok, tag in unigram_tagger.tag(test_sent): - ... print("({}, {}), ".format(tok, tag)) # doctest: +NORMALIZE_WHITESPACE - (The, AT), (Fulton, NP-TL), (County, NN-TL), (Grand, JJ-TL), - (Jury, NN-TL), (said, VBD), (Friday, NR), (an, AT), - (investigation, NN), (of, IN), (Atlanta's, NP$), (recent, JJ), - (primary, NN), (election, NN), (produced, VBD), (``, ``), - (no, AT), (evidence, NN), ('', ''), (that, CS), (any, DTI), - (irregularities, NNS), (took, VBD), (place, NN), (., .), - - :param train: The corpus of training data, a list of tagged sentences - :type train: list(list(tuple(str, str))) - :param model: The tagger model - :type model: dict - :param backoff: Another tagger which this tagger will consult when it is - unable to tag a word - :type backoff: TaggerI - :param cutoff: The number of instances of training data the tagger must see - in order not to use the backoff tagger - :type cutoff: int - """ - - json_tag = "nltk.tag.sequential.UnigramTagger" - - def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False): - super().__init__(1, train, model, backoff, cutoff, verbose) - - def context(self, tokens, index, history): - return tokens[index] - - -@jsontags.register_tag -class BigramTagger(NgramTagger): - """ - A tagger that chooses a token's tag based its word string and on - the preceding words' tag. 
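Because every constructor here accepts a `backoff` argument, the usual pattern is to stack the n-gram taggers, as in this hedged usage sketch; it assumes NLTK is installed and the Brown corpus has been fetched with `nltk.download('brown')`:

```python
from nltk.corpus import brown
from nltk.tag import BigramTagger, DefaultTagger, UnigramTagger

train_sents = brown.tagged_sents(categories="news")[:500]

t0 = DefaultTagger("NN")                      # last resort: everything is a noun
t1 = UnigramTagger(train_sents, backoff=t0)   # most frequent tag per word
t2 = BigramTagger(train_sents, backoff=t1)    # (previous tag, word) contexts

print(t2.tag("The jury said it was impressed".split()))
```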
In particular, a tuple consisting - of the previous tag and the word is looked up in a table, and - the corresponding tag is returned. - - :param train: The corpus of training data, a list of tagged sentences - :type train: list(list(tuple(str, str))) - :param model: The tagger model - :type model: dict - :param backoff: Another tagger which this tagger will consult when it is - unable to tag a word - :type backoff: TaggerI - :param cutoff: The number of instances of training data the tagger must see - in order not to use the backoff tagger - :type cutoff: int - """ - - json_tag = "nltk.tag.sequential.BigramTagger" - - def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False): - super().__init__(2, train, model, backoff, cutoff, verbose) - - -@jsontags.register_tag -class TrigramTagger(NgramTagger): - """ - A tagger that chooses a token's tag based its word string and on - the preceding two words' tags. In particular, a tuple consisting - of the previous two tags and the word is looked up in a table, and - the corresponding tag is returned. - - :param train: The corpus of training data, a list of tagged sentences - :type train: list(list(tuple(str, str))) - :param model: The tagger model - :type model: dict - :param backoff: Another tagger which this tagger will consult when it is - unable to tag a word - :type backoff: TaggerI - :param cutoff: The number of instances of training data the tagger must see - in order not to use the backoff tagger - :type cutoff: int - """ - - json_tag = "nltk.tag.sequential.TrigramTagger" - - def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False): - super().__init__(3, train, model, backoff, cutoff, verbose) - - -@jsontags.register_tag -class AffixTagger(ContextTagger): - """ - A tagger that chooses a token's tag based on a leading or trailing - substring of its word string. (It is important to note that these - substrings are not necessarily "true" morphological affixes). In - particular, a fixed-length substring of the word is looked up in a - table, and the corresponding tag is returned. Affix taggers are - typically constructed by training them on a tagged corpus. - - Construct a new affix tagger. - - :param affix_length: The length of the affixes that should be - considered during training and tagging. Use negative - numbers for suffixes. - :param min_stem_length: Any words whose length is less than - min_stem_length+abs(affix_length) will be assigned a - tag of None by this tagger. 
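The affix "context" is just a fixed-length prefix or suffix, with short words excluded so they fall through to the backoff tagger. A tiny sketch of that lookup rule, outside the class machinery, using the defaults visible in the constructor below (`affix_length=-3`, `min_stem_length=2`):

```python
AFFIX_LENGTH = -3      # negative: use a suffix
MIN_STEM_LENGTH = 2
MIN_WORD_LENGTH = MIN_STEM_LENGTH + abs(AFFIX_LENGTH)

def affix_context(word):
    """Return the affix used as lookup key, or None for words that are too short."""
    if len(word) < MIN_WORD_LENGTH:
        return None
    return word[:AFFIX_LENGTH] if AFFIX_LENGTH > 0 else word[AFFIX_LENGTH:]

print(affix_context("running"))  # 'ing'
print(affix_context("dog"))      # None (3 < 5, so a backoff tagger would handle it)
```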
- """ - - json_tag = "nltk.tag.sequential.AffixTagger" - - def __init__( - self, - train=None, - model=None, - affix_length=-3, - min_stem_length=2, - backoff=None, - cutoff=0, - verbose=False, - ): - - self._check_params(train, model) - - super().__init__(model, backoff) - - self._affix_length = affix_length - self._min_word_length = min_stem_length + abs(affix_length) - - if train: - self._train(train, cutoff, verbose) - - def encode_json_obj(self): - return ( - self._affix_length, - self._min_word_length, - self._context_to_tag, - self.backoff, - ) - - @classmethod - def decode_json_obj(cls, obj): - _affix_length, _min_word_length, _context_to_tag, backoff = obj - return cls( - affix_length=_affix_length, - min_stem_length=_min_word_length - abs(_affix_length), - model=_context_to_tag, - backoff=backoff, - ) - - def context(self, tokens, index, history): - token = tokens[index] - if len(token) < self._min_word_length: - return None - elif self._affix_length > 0: - return token[: self._affix_length] - else: - return token[self._affix_length :] - - -@jsontags.register_tag -class RegexpTagger(SequentialBackoffTagger): - r""" - Regular Expression Tagger - - The RegexpTagger assigns tags to tokens by comparing their - word strings to a series of regular expressions. The following tagger - uses word suffixes to make guesses about the correct Brown Corpus part - of speech tag: - - >>> from nltk.corpus import brown - >>> from nltk.tag import RegexpTagger - >>> test_sent = brown.sents(categories='news')[0] - >>> regexp_tagger = RegexpTagger( - ... [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers - ... (r'(The|the|A|a|An|an)$', 'AT'), # articles - ... (r'.*able$', 'JJ'), # adjectives - ... (r'.*ness$', 'NN'), # nouns formed from adjectives - ... (r'.*ly$', 'RB'), # adverbs - ... (r'.*s$', 'NNS'), # plural nouns - ... (r'.*ing$', 'VBG'), # gerunds - ... (r'.*ed$', 'VBD'), # past tense verbs - ... (r'.*', 'NN') # nouns (default) - ... ]) - >>> regexp_tagger - - >>> regexp_tagger.tag(test_sent) # doctest: +NORMALIZE_WHITESPACE - [('The', 'AT'), ('Fulton', 'NN'), ('County', 'NN'), ('Grand', 'NN'), ('Jury', 'NN'), - ('said', 'NN'), ('Friday', 'NN'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'NN'), - ("Atlanta's", 'NNS'), ('recent', 'NN'), ('primary', 'NN'), ('election', 'NN'), - ('produced', 'VBD'), ('``', 'NN'), ('no', 'NN'), ('evidence', 'NN'), ("''", 'NN'), - ('that', 'NN'), ('any', 'NN'), ('irregularities', 'NNS'), ('took', 'NN'), - ('place', 'NN'), ('.', 'NN')] - - :type regexps: list(tuple(str, str)) - :param regexps: A list of ``(regexp, tag)`` pairs, each of - which indicates that a word matching ``regexp`` should - be tagged with ``tag``. The pairs will be evaluated in - order. If none of the regexps match a word, then the - optional backoff tagger is invoked, else it is - assigned the tag None. 
- """ - - json_tag = "nltk.tag.sequential.RegexpTagger" - - def __init__( - self, regexps: List[Tuple[str, str]], backoff: Optional[TaggerI] = None - ): - super().__init__(backoff) - self._regexps = [] - for regexp, tag in regexps: - try: - self._regexps.append((re.compile(regexp), tag)) - except Exception as e: - raise Exception( - f"Invalid RegexpTagger regexp: {e}\n- regexp: {regexp!r}\n- tag: {tag!r}" - ) from e - - def encode_json_obj(self): - return [(regexp.pattern, tag) for regexp, tag in self._regexps], self.backoff - - @classmethod - def decode_json_obj(cls, obj): - regexps, backoff = obj - return cls(regexps, backoff) - - def choose_tag(self, tokens, index, history): - for regexp, tag in self._regexps: - if re.match(regexp, tokens[index]): - return tag - return None - - def __repr__(self): - return f"" - - -class ClassifierBasedTagger(SequentialBackoffTagger, FeaturesetTaggerI): - """ - A sequential tagger that uses a classifier to choose the tag for - each token in a sentence. The featureset input for the classifier - is generated by a feature detector function:: - - feature_detector(tokens, index, history) -> featureset - - Where tokens is the list of unlabeled tokens in the sentence; - index is the index of the token for which feature detection - should be performed; and history is list of the tags for all - tokens before index. - - Construct a new classifier-based sequential tagger. - - :param feature_detector: A function used to generate the - featureset input for the classifier:: - feature_detector(tokens, index, history) -> featureset - - :param train: A tagged corpus consisting of a list of tagged - sentences, where each sentence is a list of (word, tag) tuples. - - :param backoff: A backoff tagger, to be used by the new tagger - if it encounters an unknown context. - - :param classifier_builder: A function used to train a new - classifier based on the data in *train*. It should take - one argument, a list of labeled featuresets (i.e., - (featureset, label) tuples). - - :param classifier: The classifier that should be used by the - tagger. This is only useful if you want to manually - construct the classifier; normally, you would use *train* - instead. - - :param backoff: A backoff tagger, used if this tagger is - unable to determine a tag for a given token. - - :param cutoff_prob: If specified, then this tagger will fall - back on its backoff tagger if the probability of the most - likely tag is less than *cutoff_prob*. - """ - - def __init__( - self, - feature_detector=None, - train=None, - classifier_builder=NaiveBayesClassifier.train, - classifier=None, - backoff=None, - cutoff_prob=None, - verbose=False, - ): - self._check_params(train, classifier) - - super().__init__(backoff) - - if (train and classifier) or (not train and not classifier): - raise ValueError( - "Must specify either training data or " "trained classifier." - ) - - if feature_detector is not None: - self._feature_detector = feature_detector - # The feature detector function, used to generate a featureset - # or each token: feature_detector(tokens, index, history) -> featureset - - self._cutoff_prob = cutoff_prob - """Cutoff probability for tagging -- if the probability of the - most likely tag is less than this, then use backoff.""" - - self._classifier = classifier - """The classifier used to choose a tag for each token.""" - - if train: - self._train(train, classifier_builder, verbose) - - def choose_tag(self, tokens, index, history): - # Use our feature detector to get the featureset. 
- featureset = self.feature_detector(tokens, index, history) - - # Use the classifier to pick a tag. If a cutoff probability - # was specified, then check that the tag's probability is - # higher than that cutoff first; otherwise, return None. - if self._cutoff_prob is None: - return self._classifier.classify(featureset) - - pdist = self._classifier.prob_classify(featureset) - tag = pdist.max() - return tag if pdist.prob(tag) >= self._cutoff_prob else None - - def _train(self, tagged_corpus, classifier_builder, verbose): - """ - Build a new classifier, based on the given training data - *tagged_corpus*. - """ - - classifier_corpus = [] - if verbose: - print("Constructing training corpus for classifier.") - - for sentence in tagged_corpus: - history = [] - untagged_sentence, tags = zip(*sentence) - for index in range(len(sentence)): - featureset = self.feature_detector(untagged_sentence, index, history) - classifier_corpus.append((featureset, tags[index])) - history.append(tags[index]) - - if verbose: - print(f"Training classifier ({len(classifier_corpus)} instances)") - self._classifier = classifier_builder(classifier_corpus) - - def __repr__(self): - return f"" - - def feature_detector(self, tokens, index, history): - """ - Return the feature detector that this tagger uses to generate - featuresets for its classifier. The feature detector is a - function with the signature:: - - feature_detector(tokens, index, history) -> featureset - - See ``classifier()`` - """ - return self._feature_detector(tokens, index, history) - - def classifier(self): - """ - Return the classifier that this tagger uses to choose a tag - for each word in a sentence. The input for this classifier is - generated using this tagger's feature detector. - See ``feature_detector()`` - """ - return self._classifier - - -class ClassifierBasedPOSTagger(ClassifierBasedTagger): - """ - A classifier based part of speech tagger. 
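A classifier-based tagger stands or falls with its feature detector; the `feature_detector` defined below extracts the word, its suffixes, the preceding words and tags, and a coarse word "shape". A reduced standalone sketch of that featureset extraction, with fewer features than the original and an invented example sentence:

```python
import re

def features(tokens, index, history):
    """A few of the cues used below: the word, its suffix, previous tag and word, shape."""
    word = tokens[index]
    if re.fullmatch(r"\d+(\.\d*)?|\d*\.\d+", word):
        shape = "number"
    elif re.fullmatch(r"[A-Z][a-z]+", word):
        shape = "upcase"
    elif re.fullmatch(r"[a-z]+", word):
        shape = "downcase"
    else:
        shape = "other"
    return {
        "word.lower": word.lower(),
        "suffix3": word.lower()[-3:],
        "prevtag": history[index - 1] if index > 0 else None,
        "prevword": tokens[index - 1].lower() if index > 0 else None,
        "shape": shape,
    }

tokens = ["The", "dog", "barked"]
print(features(tokens, 2, ["DT", "NN"]))
# {'word.lower': 'barked', 'suffix3': 'ked', 'prevtag': 'NN',
#  'prevword': 'dog', 'shape': 'downcase'}
```

Each such dict, paired with the gold tag, is what a classifier builder such as `NaiveBayesClassifier.train` consumes during `_train`.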
- """ - - def feature_detector(self, tokens, index, history): - word = tokens[index] - if index == 0: - prevword = prevprevword = None - prevtag = prevprevtag = None - elif index == 1: - prevword = tokens[index - 1].lower() - prevprevword = None - prevtag = history[index - 1] - prevprevtag = None - else: - prevword = tokens[index - 1].lower() - prevprevword = tokens[index - 2].lower() - prevtag = history[index - 1] - prevprevtag = history[index - 2] - - if re.match(r"[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$", word): - shape = "number" - elif re.match(r"\W+$", word): - shape = "punct" - elif re.match("[A-Z][a-z]+$", word): - shape = "upcase" - elif re.match("[a-z]+$", word): - shape = "downcase" - elif re.match(r"\w+$", word): - shape = "mixedcase" - else: - shape = "other" - - features = { - "prevtag": prevtag, - "prevprevtag": prevprevtag, - "word": word, - "word.lower": word.lower(), - "suffix3": word.lower()[-3:], - "suffix2": word.lower()[-2:], - "suffix1": word.lower()[-1:], - "prevprevword": prevprevword, - "prevword": prevword, - "prevtag+word": f"{prevtag}+{word.lower()}", - "prevprevtag+word": f"{prevprevtag}+{word.lower()}", - "prevword+word": f"{prevword}+{word.lower()}", - "shape": shape, - } - return features diff --git a/pipeline/nltk/tag/stanford.py b/pipeline/nltk/tag/stanford.py deleted file mode 100644 index 7c21e2dd20dec5c3b242d0e5007a4bf51d8ef8f8..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tag/stanford.py +++ /dev/null @@ -1,236 +0,0 @@ -# Natural Language Toolkit: Interface to the Stanford Part-of-speech and Named-Entity Taggers -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Nitin Madnani -# Rami Al-Rfou' -# URL: -# For license information, see LICENSE.TXT - -""" -A module for interfacing with the Stanford taggers. - -Tagger models need to be downloaded from https://nlp.stanford.edu/software -and the STANFORD_MODELS environment variable set (a colon-separated -list of paths). - -For more details see the documentation for StanfordPOSTagger and StanfordNERTagger. -""" - -import os -import tempfile -import warnings -from abc import abstractmethod -from subprocess import PIPE - -from nltk.internals import _java_options, config_java, find_file, find_jar, java -from nltk.tag.api import TaggerI - -_stanford_url = "https://nlp.stanford.edu/software" - - -class StanfordTagger(TaggerI): - """ - An interface to Stanford taggers. Subclasses must define: - - - ``_cmd`` property: A property that returns the command that will be - executed. - - ``_SEPARATOR``: Class constant that represents that character that - is used to separate the tokens from their tags. - - ``_JAR`` file: Class constant that represents the jar file name. - """ - - _SEPARATOR = "" - _JAR = "" - - def __init__( - self, - model_filename, - path_to_jar=None, - encoding="utf8", - verbose=False, - java_options="-mx1000m", - ): - # Raise deprecation warning. - warnings.warn( - str( - "\nThe StanfordTokenizer will " - "be deprecated in version 3.2.6.\n" - "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead." - ), - DeprecationWarning, - stacklevel=2, - ) - - if not self._JAR: - warnings.warn( - "The StanfordTagger class is not meant to be " - "instantiated directly. Did you mean " - "StanfordPOSTagger or StanfordNERTagger?" 
- ) - self._stanford_jar = find_jar( - self._JAR, path_to_jar, searchpath=(), url=_stanford_url, verbose=verbose - ) - - self._stanford_model = find_file( - model_filename, env_vars=("STANFORD_MODELS",), verbose=verbose - ) - - self._encoding = encoding - self.java_options = java_options - - @property - @abstractmethod - def _cmd(self): - """ - A property that returns the command that will be executed. - """ - - def tag(self, tokens): - # This function should return list of tuple rather than list of list - return sum(self.tag_sents([tokens]), []) - - def tag_sents(self, sentences): - encoding = self._encoding - default_options = " ".join(_java_options) - config_java(options=self.java_options, verbose=False) - - # Create a temporary input file - _input_fh, self._input_file_path = tempfile.mkstemp(text=True) - - cmd = list(self._cmd) - cmd.extend(["-encoding", encoding]) - - # Write the actual sentences to the temporary input file - _input_fh = os.fdopen(_input_fh, "wb") - _input = "\n".join(" ".join(x) for x in sentences) - if isinstance(_input, str) and encoding: - _input = _input.encode(encoding) - _input_fh.write(_input) - _input_fh.close() - - # Run the tagger and get the output - stanpos_output, _stderr = java( - cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE - ) - stanpos_output = stanpos_output.decode(encoding) - - # Delete the temporary file - os.unlink(self._input_file_path) - - # Return java configurations to their default values - config_java(options=default_options, verbose=False) - - return self.parse_output(stanpos_output, sentences) - - def parse_output(self, text, sentences=None): - # Output the tagged sentences - tagged_sentences = [] - for tagged_sentence in text.strip().split("\n"): - sentence = [] - for tagged_word in tagged_sentence.strip().split(): - word_tags = tagged_word.strip().split(self._SEPARATOR) - sentence.append( - ("".join(word_tags[:-1]), word_tags[-1].replace("0", "").upper()) - ) - tagged_sentences.append(sentence) - return tagged_sentences - - -class StanfordPOSTagger(StanfordTagger): - """ - A class for pos tagging with Stanford Tagger. The input is the paths to: - - a model trained on training data - - (optionally) the path to the stanford tagger jar file. If not specified here, - then this jar file must be specified in the CLASSPATH environment variable. - - (optionally) the encoding of the training data (default: UTF-8) - - Example: - - >>> from nltk.tag import StanfordPOSTagger - >>> st = StanfordPOSTagger('english-bidirectional-distsim.tagger') # doctest: +SKIP - >>> st.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP - [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')] - """ - - _SEPARATOR = "_" - _JAR = "stanford-postagger.jar" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - @property - def _cmd(self): - return [ - "edu.stanford.nlp.tagger.maxent.MaxentTagger", - "-model", - self._stanford_model, - "-textFile", - self._input_file_path, - "-tokenize", - "false", - "-outputFormatOptions", - "keepEmptySentences", - ] - - -class StanfordNERTagger(StanfordTagger): - """ - A class for Named-Entity Tagging with Stanford Tagger. The input is the paths to: - - - a model trained on training data - - (optionally) the path to the stanford tagger jar file. If not specified here, - then this jar file must be specified in the CLASSPATH environment variable. 
- - (optionally) the encoding of the training data (default: UTF-8) - - Example: - - >>> from nltk.tag import StanfordNERTagger - >>> st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz') # doctest: +SKIP - >>> st.tag('Rami Eid is studying at Stony Brook University in NY'.split()) # doctest: +SKIP - [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), - ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'), - ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'LOCATION')] - """ - - _SEPARATOR = "/" - _JAR = "stanford-ner.jar" - _FORMAT = "slashTags" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - @property - def _cmd(self): - # Adding -tokenizerFactory edu.stanford.nlp.process.WhitespaceTokenizer -tokenizerOptions tokenizeNLs=false for not using stanford Tokenizer - return [ - "edu.stanford.nlp.ie.crf.CRFClassifier", - "-loadClassifier", - self._stanford_model, - "-textFile", - self._input_file_path, - "-outputFormat", - self._FORMAT, - "-tokenizerFactory", - "edu.stanford.nlp.process.WhitespaceTokenizer", - "-tokenizerOptions", - '"tokenizeNLs=false"', - ] - - def parse_output(self, text, sentences): - if self._FORMAT == "slashTags": - # Joint together to a big list - tagged_sentences = [] - for tagged_sentence in text.strip().split("\n"): - for tagged_word in tagged_sentence.strip().split(): - word_tags = tagged_word.strip().split(self._SEPARATOR) - tagged_sentences.append(("".join(word_tags[:-1]), word_tags[-1])) - - # Separate it according to the input - result = [] - start = 0 - for sent in sentences: - result.append(tagged_sentences[start : start + len(sent)]) - start += len(sent) - return result - - raise NotImplementedError diff --git a/pipeline/nltk/tag/tnt.py b/pipeline/nltk/tag/tnt.py deleted file mode 100644 index a505104d812532af561ee3d3d9d80611f78db2cd..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tag/tnt.py +++ /dev/null @@ -1,579 +0,0 @@ -# Natural Language Toolkit: TnT Tagger -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Sam Huston -# -# URL: -# For license information, see LICENSE.TXT - -""" -Implementation of 'TnT - A Statisical Part of Speech Tagger' -by Thorsten Brants - -https://aclanthology.org/A00-1031.pdf -""" - -from math import log -from operator import itemgetter - -from nltk.probability import ConditionalFreqDist, FreqDist -from nltk.tag.api import TaggerI - - -class TnT(TaggerI): - """ - TnT - Statistical POS tagger - - IMPORTANT NOTES: - - * DOES NOT AUTOMATICALLY DEAL WITH UNSEEN WORDS - - - It is possible to provide an untrained POS tagger to - create tags for unknown words, see __init__ function - - * SHOULD BE USED WITH SENTENCE-DELIMITED INPUT - - - Due to the nature of this tagger, it works best when - trained over sentence delimited input. - - However it still produces good results if the training - data and testing data are separated on all punctuation eg: [,.?!] 
- - Input for training is expected to be a list of sentences - where each sentence is a list of (word, tag) tuples - - Input for tag function is a single sentence - Input for tagdata function is a list of sentences - Output is of a similar form - - * Function provided to process text that is unsegmented - - - Please see basic_sent_chop() - - - TnT uses a second order Markov model to produce tags for - a sequence of input, specifically: - - argmax [Proj(P(t_i|t_i-1,t_i-2)P(w_i|t_i))] P(t_T+1 | t_T) - - IE: the maximum projection of a set of probabilities - - The set of possible tags for a given word is derived - from the training data. It is the set of all tags - that exact word has been assigned. - - To speed up and get more precision, we can use log addition - to instead multiplication, specifically: - - argmax [Sigma(log(P(t_i|t_i-1,t_i-2))+log(P(w_i|t_i)))] + - log(P(t_T+1|t_T)) - - The probability of a tag for a given word is the linear - interpolation of 3 markov models; a zero-order, first-order, - and a second order model. - - P(t_i| t_i-1, t_i-2) = l1*P(t_i) + l2*P(t_i| t_i-1) + - l3*P(t_i| t_i-1, t_i-2) - - A beam search is used to limit the memory usage of the algorithm. - The degree of the beam can be changed using N in the initialization. - N represents the maximum number of possible solutions to maintain - while tagging. - - It is possible to differentiate the tags which are assigned to - capitalized words. However this does not result in a significant - gain in the accuracy of the results. - """ - - def __init__(self, unk=None, Trained=False, N=1000, C=False): - """ - Construct a TnT statistical tagger. Tagger must be trained - before being used to tag input. - - :param unk: instance of a POS tagger, conforms to TaggerI - :type unk: TaggerI - :param Trained: Indication that the POS tagger is trained or not - :type Trained: bool - :param N: Beam search degree (see above) - :type N: int - :param C: Capitalization flag - :type C: bool - - Initializer, creates frequency distributions to be used - for tagging - - _lx values represent the portion of the tri/bi/uni taggers - to be used to calculate the probability - - N value is the number of possible solutions to maintain - while tagging. A good value for this is 1000 - - C is a boolean value which specifies to use or - not use the Capitalization of the word as additional - information for tagging. - NOTE: using capitalization may not increase the accuracy - of the tagger - """ - - self._uni = FreqDist() - self._bi = ConditionalFreqDist() - self._tri = ConditionalFreqDist() - self._wd = ConditionalFreqDist() - self._eos = ConditionalFreqDist() - self._l1 = 0.0 - self._l2 = 0.0 - self._l3 = 0.0 - self._N = N - self._C = C - self._T = Trained - - self._unk = unk - - # statistical tools (ignore or delete me) - self.unknown = 0 - self.known = 0 - - def train(self, data): - """ - Uses a set of tagged data to train the tagger. - If an unknown word tagger is specified, - it is trained on the same data. 
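The file being removed here is a vendored copy of NLTK's TnT tagger, so the same behaviour remains available from the standard nltk package. A minimal usage sketch of the train/tag cycle described above, assuming nltk and its treebank corpus are installed; the slice size and the example sentence are illustrative only:

    # Minimal sketch: train the TnT tagger described above and tag one sentence.
    # Assumes the standard nltk package and the treebank corpus are available.
    from nltk.corpus import treebank
    from nltk.tag.tnt import TnT

    train_sents = treebank.tagged_sents()[:2000]   # list of [(word, tag), ...] sentences

    tagger = TnT(N=1000, C=False)   # N = beam width, C = use capitalization info
    tagger.train(train_sents)

    # tag() takes a single sentence as a list of words; words never seen in
    # training come back tagged 'Unk' unless an unknown-word tagger was supplied.
    print(tagger.tag("the board approved the merger".split()))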
- - :param data: List of lists of (word, tag) tuples - :type data: tuple(str) - """ - - # Ensure that local C flag is initialized before use - C = False - - if self._unk is not None and self._T == False: - self._unk.train(data) - - for sent in data: - history = [("BOS", False), ("BOS", False)] - for w, t in sent: - - # if capitalization is requested, - # and the word begins with a capital - # set local flag C to True - if self._C and w[0].isupper(): - C = True - - self._wd[w][t] += 1 - self._uni[(t, C)] += 1 - self._bi[history[1]][(t, C)] += 1 - self._tri[tuple(history)][(t, C)] += 1 - - history.append((t, C)) - history.pop(0) - - # set local flag C to false for the next word - C = False - - self._eos[t]["EOS"] += 1 - - # compute lambda values from the trained frequency distributions - self._compute_lambda() - - def _compute_lambda(self): - """ - creates lambda values based upon training data - - NOTE: no need to explicitly reference C, - it is contained within the tag variable :: tag == (tag,C) - - for each tag trigram (t1, t2, t3) - depending on the maximum value of - - f(t1,t2,t3)-1 / f(t1,t2)-1 - - f(t2,t3)-1 / f(t2)-1 - - f(t3)-1 / N-1 - - increment l3,l2, or l1 by f(t1,t2,t3) - - ISSUES -- Resolutions: - if 2 values are equal, increment both lambda values - by (f(t1,t2,t3) / 2) - """ - - # temporary lambda variables - tl1 = 0.0 - tl2 = 0.0 - tl3 = 0.0 - - # for each t1,t2 in system - for history in self._tri.conditions(): - (h1, h2) = history - - # for each t3 given t1,t2 in system - # (NOTE: tag actually represents (tag,C)) - # However no effect within this function - for tag in self._tri[history].keys(): - - # if there has only been 1 occurrence of this tag in the data - # then ignore this trigram. - if self._uni[tag] == 1: - continue - - # safe_div provides a safe floating point division - # it returns -1 if the denominator is 0 - c3 = self._safe_div( - (self._tri[history][tag] - 1), (self._tri[history].N() - 1) - ) - c2 = self._safe_div((self._bi[h2][tag] - 1), (self._bi[h2].N() - 1)) - c1 = self._safe_div((self._uni[tag] - 1), (self._uni.N() - 1)) - - # if c1 is the maximum value: - if (c1 > c3) and (c1 > c2): - tl1 += self._tri[history][tag] - - # if c2 is the maximum value - elif (c2 > c3) and (c2 > c1): - tl2 += self._tri[history][tag] - - # if c3 is the maximum value - elif (c3 > c2) and (c3 > c1): - tl3 += self._tri[history][tag] - - # if c3, and c2 are equal and larger than c1 - elif (c3 == c2) and (c3 > c1): - tl2 += self._tri[history][tag] / 2.0 - tl3 += self._tri[history][tag] / 2.0 - - # if c1, and c2 are equal and larger than c3 - # this might be a dumb thing to do....(not sure yet) - elif (c2 == c1) and (c1 > c3): - tl1 += self._tri[history][tag] / 2.0 - tl2 += self._tri[history][tag] / 2.0 - - # otherwise there might be a problem - # eg: all values = 0 - else: - pass - - # Lambda normalisation: - # ensures that l1+l2+l3 = 1 - self._l1 = tl1 / (tl1 + tl2 + tl3) - self._l2 = tl2 / (tl1 + tl2 + tl3) - self._l3 = tl3 / (tl1 + tl2 + tl3) - - def _safe_div(self, v1, v2): - """ - Safe floating point division function, does not allow division by 0 - returns -1 if the denominator is 0 - """ - if v2 == 0: - return -1 - else: - return v1 / v2 - - def tagdata(self, data): - """ - Tags each sentence in a list of sentences - - :param data:list of list of words - :type data: [[string,],] - :return: list of list of (word, tag) tuples - - Invokes tag(sent) function for each sentence - compiles the results into a list of tagged sentences - each tagged sentence is a list of (word, 
tag) tuples - """ - res = [] - for sent in data: - res1 = self.tag(sent) - res.append(res1) - return res - - def tag(self, data): - """ - Tags a single sentence - - :param data: list of words - :type data: [string,] - - :return: [(word, tag),] - - Calls recursive function '_tagword' - to produce a list of tags - - Associates the sequence of returned tags - with the correct words in the input sequence - - returns a list of (word, tag) tuples - """ - - current_state = [(["BOS", "BOS"], 0.0)] - - sent = list(data) - - tags = self._tagword(sent, current_state) - - res = [] - for i in range(len(sent)): - # unpack and discard the C flags - (t, C) = tags[i + 2] - res.append((sent[i], t)) - - return res - - def _tagword(self, sent, current_states): - """ - :param sent : List of words remaining in the sentence - :type sent : [word,] - :param current_states : List of possible tag combinations for - the sentence so far, and the log probability - associated with each tag combination - :type current_states : [([tag, ], logprob), ] - - Tags the first word in the sentence and - recursively tags the reminder of sentence - - Uses formula specified above to calculate the probability - of a particular tag - """ - - # if this word marks the end of the sentence, - # return the most probable tag - if sent == []: - (h, logp) = current_states[0] - return h - - # otherwise there are more words to be tagged - word = sent[0] - sent = sent[1:] - new_states = [] - - # if the Capitalisation is requested, - # initialise the flag for this word - C = False - if self._C and word[0].isupper(): - C = True - - # if word is known - # compute the set of possible tags - # and their associated log probabilities - if word in self._wd: - self.known += 1 - - for (history, curr_sent_logprob) in current_states: - logprobs = [] - - for t in self._wd[word].keys(): - tC = (t, C) - p_uni = self._uni.freq(tC) - p_bi = self._bi[history[-1]].freq(tC) - p_tri = self._tri[tuple(history[-2:])].freq(tC) - p_wd = self._wd[word][t] / self._uni[tC] - p = self._l1 * p_uni + self._l2 * p_bi + self._l3 * p_tri - p2 = log(p, 2) + log(p_wd, 2) - - # compute the result of appending each tag to this history - new_states.append((history + [tC], curr_sent_logprob + p2)) - - # otherwise a new word, set of possible tags is unknown - else: - self.unknown += 1 - - # since a set of possible tags, - # and the probability of each specific tag - # can not be returned from most classifiers: - # specify that any unknown words are tagged with certainty - p = 1 - - # if no unknown word tagger has been specified - # then use the tag 'Unk' - if self._unk is None: - tag = ("Unk", C) - - # otherwise apply the unknown word tagger - else: - [(_w, t)] = list(self._unk.tag([word])) - tag = (t, C) - - for (history, logprob) in current_states: - history.append(tag) - - new_states = current_states - - # now have computed a set of possible new_states - - # sort states by log prob - # set is now ordered greatest to least log probability - new_states.sort(reverse=True, key=itemgetter(1)) - - # del everything after N (threshold) - # this is the beam search cut - if len(new_states) > self._N: - new_states = new_states[: self._N] - - # compute the tags for the rest of the sentence - # return the best list of tags for the sentence - return self._tagword(sent, new_states) - - -######################################## -# helper function -- basic sentence tokenizer -######################################## - - -def basic_sent_chop(data, raw=True): - """ - Basic method for tokenizing input 
into sentences - for this tagger: - - :param data: list of tokens (words or (word, tag) tuples) - :type data: str or tuple(str, str) - :param raw: boolean flag marking the input data - as a list of words or a list of tagged words - :type raw: bool - :return: list of sentences - sentences are a list of tokens - tokens are the same as the input - - Function takes a list of tokens and separates the tokens into lists - where each list represents a sentence fragment - This function can separate both tagged and raw sequences into - basic sentences. - - Sentence markers are the set of [,.!?] - - This is a simple method which enhances the performance of the TnT - tagger. Better sentence tokenization will further enhance the results. - """ - - new_data = [] - curr_sent = [] - sent_mark = [",", ".", "?", "!"] - - if raw: - for word in data: - if word in sent_mark: - curr_sent.append(word) - new_data.append(curr_sent) - curr_sent = [] - else: - curr_sent.append(word) - - else: - for (word, tag) in data: - if word in sent_mark: - curr_sent.append((word, tag)) - new_data.append(curr_sent) - curr_sent = [] - else: - curr_sent.append((word, tag)) - return new_data - - -def demo(): - from nltk.corpus import brown - - sents = list(brown.tagged_sents()) - test = list(brown.sents()) - - tagger = TnT() - tagger.train(sents[200:1000]) - - tagged_data = tagger.tagdata(test[100:120]) - - for j in range(len(tagged_data)): - s = tagged_data[j] - t = sents[j + 100] - for i in range(len(s)): - print(s[i], "--", t[i]) - print() - - -def demo2(): - from nltk.corpus import treebank - - d = list(treebank.tagged_sents()) - - t = TnT(N=1000, C=False) - s = TnT(N=1000, C=True) - t.train(d[(11) * 100 :]) - s.train(d[(11) * 100 :]) - - for i in range(10): - tacc = t.accuracy(d[i * 100 : ((i + 1) * 100)]) - tp_un = t.unknown / (t.known + t.unknown) - tp_kn = t.known / (t.known + t.unknown) - t.unknown = 0 - t.known = 0 - - print("Capitalization off:") - print("Accuracy:", tacc) - print("Percentage known:", tp_kn) - print("Percentage unknown:", tp_un) - print("Accuracy over known words:", (tacc / tp_kn)) - - sacc = s.accuracy(d[i * 100 : ((i + 1) * 100)]) - sp_un = s.unknown / (s.known + s.unknown) - sp_kn = s.known / (s.known + s.unknown) - s.unknown = 0 - s.known = 0 - - print("Capitalization on:") - print("Accuracy:", sacc) - print("Percentage known:", sp_kn) - print("Percentage unknown:", sp_un) - print("Accuracy over known words:", (sacc / sp_kn)) - - -def demo3(): - from nltk.corpus import brown, treebank - - d = list(treebank.tagged_sents()) - e = list(brown.tagged_sents()) - - d = d[:1000] - e = e[:1000] - - d10 = int(len(d) * 0.1) - e10 = int(len(e) * 0.1) - - tknacc = 0 - sknacc = 0 - tallacc = 0 - sallacc = 0 - tknown = 0 - sknown = 0 - - for i in range(10): - - t = TnT(N=1000, C=False) - s = TnT(N=1000, C=False) - - dtest = d[(i * d10) : ((i + 1) * d10)] - etest = e[(i * e10) : ((i + 1) * e10)] - - dtrain = d[: (i * d10)] + d[((i + 1) * d10) :] - etrain = e[: (i * e10)] + e[((i + 1) * e10) :] - - t.train(dtrain) - s.train(etrain) - - tacc = t.accuracy(dtest) - tp_un = t.unknown / (t.known + t.unknown) - tp_kn = t.known / (t.known + t.unknown) - tknown += tp_kn - t.unknown = 0 - t.known = 0 - - sacc = s.accuracy(etest) - sp_un = s.unknown / (s.known + s.unknown) - sp_kn = s.known / (s.known + s.unknown) - sknown += sp_kn - s.unknown = 0 - s.known = 0 - - tknacc += tacc / tp_kn - sknacc += sacc / tp_kn - tallacc += tacc - sallacc += sacc - - # print(i+1, (tacc / tp_kn), i+1, (sacc / tp_kn), i+1, tacc, i+1, sacc) - - 
print("brown: acc over words known:", 10 * tknacc) - print(" : overall accuracy:", 10 * tallacc) - print(" : words known:", 10 * tknown) - print("treebank: acc over words known:", 10 * sknacc) - print(" : overall accuracy:", 10 * sallacc) - print(" : words known:", 10 * sknown) diff --git a/pipeline/nltk/tag/util.py b/pipeline/nltk/tag/util.py deleted file mode 100644 index e35b98195f2b7b448775a49795e0f34d612624a6..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tag/util.py +++ /dev/null @@ -1,72 +0,0 @@ -# Natural Language Toolkit: Tagger Utilities -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird -# URL: -# For license information, see LICENSE.TXT - - -def str2tuple(s, sep="/"): - """ - Given the string representation of a tagged token, return the - corresponding tuple representation. The rightmost occurrence of - *sep* in *s* will be used to divide *s* into a word string and - a tag string. If *sep* does not occur in *s*, return (s, None). - - >>> from nltk.tag.util import str2tuple - >>> str2tuple('fly/NN') - ('fly', 'NN') - - :type s: str - :param s: The string representation of a tagged token. - :type sep: str - :param sep: The separator string used to separate word strings - from tags. - """ - loc = s.rfind(sep) - if loc >= 0: - return (s[:loc], s[loc + len(sep) :].upper()) - else: - return (s, None) - - -def tuple2str(tagged_token, sep="/"): - """ - Given the tuple representation of a tagged token, return the - corresponding string representation. This representation is - formed by concatenating the token's word string, followed by the - separator, followed by the token's tag. (If the tag is None, - then just return the bare word string.) - - >>> from nltk.tag.util import tuple2str - >>> tagged_token = ('fly', 'NN') - >>> tuple2str(tagged_token) - 'fly/NN' - - :type tagged_token: tuple(str, str) - :param tagged_token: The tuple representation of a tagged token. - :type sep: str - :param sep: The separator string used to separate word strings - from tags. - """ - word, tag = tagged_token - if tag is None: - return word - else: - assert sep not in tag, "tag may not contain sep!" - return f"{word}{sep}{tag}" - - -def untag(tagged_sentence): - """ - Given a tagged sentence, return an untagged version of that - sentence. I.e., return a list containing the first element - of each tuple in *tagged_sentence*. - - >>> from nltk.tag.util import untag - >>> untag([('John', 'NNP'), ('saw', 'VBD'), ('Mary', 'NNP')]) - ['John', 'saw', 'Mary'] - - """ - return [w for (w, t) in tagged_sentence] diff --git a/pipeline/nltk/tbl/__init__.py b/pipeline/nltk/tbl/__init__.py deleted file mode 100644 index 3387daec4b489d83a4f87b9652a0309f7c4e1ce5..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tbl/__init__.py +++ /dev/null @@ -1,31 +0,0 @@ -# Natural Language Toolkit: Transformation-based learning -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Marcus Uneson -# based on previous (nltk2) version by -# Christopher Maloof, Edward Loper, Steven Bird -# URL: -# For license information, see LICENSE.TXT - -""" -Transformation Based Learning - -A general purpose package for Transformation Based Learning, -currently used by nltk.tag.BrillTagger. - -isort:skip_file -""" - -from nltk.tbl.template import Template - -# API: Template(...), Template.expand(...) - -from nltk.tbl.feature import Feature - -# API: Feature(...), Feature.expand(...) 
- -from nltk.tbl.rule import Rule - -# API: Rule.format(...), Rule.templatetid - -from nltk.tbl.erroranalysis import error_list diff --git a/pipeline/nltk/tbl/__pycache__/__init__.cpython-39.pyc b/pipeline/nltk/tbl/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index 543ffbf0c02ab9d7006e90b3d64b2b301177ad07..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tbl/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tbl/__pycache__/api.cpython-39.pyc b/pipeline/nltk/tbl/__pycache__/api.cpython-39.pyc deleted file mode 100644 index 0cd648137feea7edd0b96e0a46488e1c32cfa174..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tbl/__pycache__/api.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tbl/__pycache__/demo.cpython-39.pyc b/pipeline/nltk/tbl/__pycache__/demo.cpython-39.pyc deleted file mode 100644 index cea05a8a41616b04a3e413c420912839aa41cc4b..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tbl/__pycache__/demo.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tbl/__pycache__/erroranalysis.cpython-39.pyc b/pipeline/nltk/tbl/__pycache__/erroranalysis.cpython-39.pyc deleted file mode 100644 index 05e31ff0f17f9c8271de8beae96af04399ca741a..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tbl/__pycache__/erroranalysis.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tbl/__pycache__/feature.cpython-39.pyc b/pipeline/nltk/tbl/__pycache__/feature.cpython-39.pyc deleted file mode 100644 index 3bfe2bbb795e59c79f809846f02fedd6d9d83c31..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tbl/__pycache__/feature.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tbl/__pycache__/rule.cpython-39.pyc b/pipeline/nltk/tbl/__pycache__/rule.cpython-39.pyc deleted file mode 100644 index 728a583a7e29ecd9cc2bf107a3d18841283e4d70..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tbl/__pycache__/rule.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tbl/__pycache__/template.cpython-39.pyc b/pipeline/nltk/tbl/__pycache__/template.cpython-39.pyc deleted file mode 100644 index 5a2a0fb3f9e94f5ccc9d291bdad4a2df03481069..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tbl/__pycache__/template.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tbl/api.py b/pipeline/nltk/tbl/api.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/pipeline/nltk/tbl/demo.py b/pipeline/nltk/tbl/demo.py deleted file mode 100644 index a5298e396e964f1f33e89a81263014249bca7cfa..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tbl/demo.py +++ /dev/null @@ -1,418 +0,0 @@ -# Natural Language Toolkit: Transformation-based learning -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Marcus Uneson -# based on previous (nltk2) version by -# Christopher Maloof, Edward Loper, Steven Bird -# URL: -# For license information, see LICENSE.TXT - -import os -import pickle -import random -import time - -from nltk.corpus import treebank -from nltk.tag import BrillTaggerTrainer, RegexpTagger, UnigramTagger -from nltk.tag.brill import Pos, Word -from nltk.tbl import Template, error_list - - -def demo(): - """ - Run a demo with defaults. See source comments for details, - or docstrings of any of the more specific demo_* functions. 
- """ - postag() - - -def demo_repr_rule_format(): - """ - Exemplify repr(Rule) (see also str(Rule) and Rule.format("verbose")) - """ - postag(ruleformat="repr") - - -def demo_str_rule_format(): - """ - Exemplify repr(Rule) (see also str(Rule) and Rule.format("verbose")) - """ - postag(ruleformat="str") - - -def demo_verbose_rule_format(): - """ - Exemplify Rule.format("verbose") - """ - postag(ruleformat="verbose") - - -def demo_multiposition_feature(): - """ - The feature/s of a template takes a list of positions - relative to the current word where the feature should be - looked for, conceptually joined by logical OR. For instance, - Pos([-1, 1]), given a value V, will hold whenever V is found - one step to the left and/or one step to the right. - - For contiguous ranges, a 2-arg form giving inclusive end - points can also be used: Pos(-3, -1) is the same as the arg - below. - """ - postag(templates=[Template(Pos([-3, -2, -1]))]) - - -def demo_multifeature_template(): - """ - Templates can have more than a single feature. - """ - postag(templates=[Template(Word([0]), Pos([-2, -1]))]) - - -def demo_template_statistics(): - """ - Show aggregate statistics per template. Little used templates are - candidates for deletion, much used templates may possibly be refined. - - Deleting unused templates is mostly about saving time and/or space: - training is basically O(T) in the number of templates T - (also in terms of memory usage, which often will be the limiting factor). - """ - postag(incremental_stats=True, template_stats=True) - - -def demo_generated_templates(): - """ - Template.expand and Feature.expand are class methods facilitating - generating large amounts of templates. See their documentation for - details. - - Note: training with 500 templates can easily fill all available - even on relatively small corpora - """ - wordtpls = Word.expand([-1, 0, 1], [1, 2], excludezero=False) - tagtpls = Pos.expand([-2, -1, 0, 1], [1, 2], excludezero=True) - templates = list(Template.expand([wordtpls, tagtpls], combinations=(1, 3))) - print( - "Generated {} templates for transformation-based learning".format( - len(templates) - ) - ) - postag(templates=templates, incremental_stats=True, template_stats=True) - - -def demo_learning_curve(): - """ - Plot a learning curve -- the contribution on tagging accuracy of - the individual rules. - Note: requires matplotlib - """ - postag( - incremental_stats=True, - separate_baseline_data=True, - learning_curve_output="learningcurve.png", - ) - - -def demo_error_analysis(): - """ - Writes a file with context for each erroneous word after tagging testing data - """ - postag(error_output="errors.txt") - - -def demo_serialize_tagger(): - """ - Serializes the learned tagger to a file in pickle format; reloads it - and validates the process. - """ - postag(serialize_output="tagger.pcl") - - -def demo_high_accuracy_rules(): - """ - Discard rules with low accuracy. This may hurt performance a bit, - but will often produce rules which are more interesting read to a human. 
- """ - postag(num_sents=3000, min_acc=0.96, min_score=10) - - -def postag( - templates=None, - tagged_data=None, - num_sents=1000, - max_rules=300, - min_score=3, - min_acc=None, - train=0.8, - trace=3, - randomize=False, - ruleformat="str", - incremental_stats=False, - template_stats=False, - error_output=None, - serialize_output=None, - learning_curve_output=None, - learning_curve_take=300, - baseline_backoff_tagger=None, - separate_baseline_data=False, - cache_baseline_tagger=None, -): - """ - Brill Tagger Demonstration - :param templates: how many sentences of training and testing data to use - :type templates: list of Template - - :param tagged_data: maximum number of rule instances to create - :type tagged_data: C{int} - - :param num_sents: how many sentences of training and testing data to use - :type num_sents: C{int} - - :param max_rules: maximum number of rule instances to create - :type max_rules: C{int} - - :param min_score: the minimum score for a rule in order for it to be considered - :type min_score: C{int} - - :param min_acc: the minimum score for a rule in order for it to be considered - :type min_acc: C{float} - - :param train: the fraction of the the corpus to be used for training (1=all) - :type train: C{float} - - :param trace: the level of diagnostic tracing output to produce (0-4) - :type trace: C{int} - - :param randomize: whether the training data should be a random subset of the corpus - :type randomize: C{bool} - - :param ruleformat: rule output format, one of "str", "repr", "verbose" - :type ruleformat: C{str} - - :param incremental_stats: if true, will tag incrementally and collect stats for each rule (rather slow) - :type incremental_stats: C{bool} - - :param template_stats: if true, will print per-template statistics collected in training and (optionally) testing - :type template_stats: C{bool} - - :param error_output: the file where errors will be saved - :type error_output: C{string} - - :param serialize_output: the file where the learned tbl tagger will be saved - :type serialize_output: C{string} - - :param learning_curve_output: filename of plot of learning curve(s) (train and also test, if available) - :type learning_curve_output: C{string} - - :param learning_curve_take: how many rules plotted - :type learning_curve_take: C{int} - - :param baseline_backoff_tagger: the file where rules will be saved - :type baseline_backoff_tagger: tagger - - :param separate_baseline_data: use a fraction of the training data exclusively for training baseline - :type separate_baseline_data: C{bool} - - :param cache_baseline_tagger: cache baseline tagger to this file (only interesting as a temporary workaround to get - deterministic output from the baseline unigram tagger between python versions) - :type cache_baseline_tagger: C{string} - - - Note on separate_baseline_data: if True, reuse training data both for baseline and rule learner. This - is fast and fine for a demo, but is likely to generalize worse on unseen data. - Also cannot be sensibly used for learning curves on training data (the baseline will be artificially high). - """ - - # defaults - baseline_backoff_tagger = baseline_backoff_tagger or REGEXP_TAGGER - if templates is None: - from nltk.tag.brill import brill24, describe_template_sets - - # some pre-built template sets taken from typical systems or publications are - # available. 
Print a list with describe_template_sets() - # for instance: - templates = brill24() - (training_data, baseline_data, gold_data, testing_data) = _demo_prepare_data( - tagged_data, train, num_sents, randomize, separate_baseline_data - ) - - # creating (or reloading from cache) a baseline tagger (unigram tagger) - # this is just a mechanism for getting deterministic output from the baseline between - # python versions - if cache_baseline_tagger: - if not os.path.exists(cache_baseline_tagger): - baseline_tagger = UnigramTagger( - baseline_data, backoff=baseline_backoff_tagger - ) - with open(cache_baseline_tagger, "w") as print_rules: - pickle.dump(baseline_tagger, print_rules) - print( - "Trained baseline tagger, pickled it to {}".format( - cache_baseline_tagger - ) - ) - with open(cache_baseline_tagger) as print_rules: - baseline_tagger = pickle.load(print_rules) - print(f"Reloaded pickled tagger from {cache_baseline_tagger}") - else: - baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger) - print("Trained baseline tagger") - if gold_data: - print( - " Accuracy on test set: {:0.4f}".format( - baseline_tagger.accuracy(gold_data) - ) - ) - - # creating a Brill tagger - tbrill = time.time() - trainer = BrillTaggerTrainer( - baseline_tagger, templates, trace, ruleformat=ruleformat - ) - print("Training tbl tagger...") - brill_tagger = trainer.train(training_data, max_rules, min_score, min_acc) - print(f"Trained tbl tagger in {time.time() - tbrill:0.2f} seconds") - if gold_data: - print(" Accuracy on test set: %.4f" % brill_tagger.accuracy(gold_data)) - - # printing the learned rules, if learned silently - if trace == 1: - print("\nLearned rules: ") - for (ruleno, rule) in enumerate(brill_tagger.rules(), 1): - print(f"{ruleno:4d} {rule.format(ruleformat):s}") - - # printing template statistics (optionally including comparison with the training data) - # note: if not separate_baseline_data, then baseline accuracy will be artificially high - if incremental_stats: - print( - "Incrementally tagging the test data, collecting individual rule statistics" - ) - (taggedtest, teststats) = brill_tagger.batch_tag_incremental( - testing_data, gold_data - ) - print(" Rule statistics collected") - if not separate_baseline_data: - print( - "WARNING: train_stats asked for separate_baseline_data=True; the baseline " - "will be artificially high" - ) - trainstats = brill_tagger.train_stats() - if template_stats: - brill_tagger.print_template_statistics(teststats) - if learning_curve_output: - _demo_plot( - learning_curve_output, teststats, trainstats, take=learning_curve_take - ) - print(f"Wrote plot of learning curve to {learning_curve_output}") - else: - print("Tagging the test data") - taggedtest = brill_tagger.tag_sents(testing_data) - if template_stats: - brill_tagger.print_template_statistics() - - # writing error analysis to file - if error_output is not None: - with open(error_output, "w") as f: - f.write("Errors for Brill Tagger %r\n\n" % serialize_output) - f.write("\n".join(error_list(gold_data, taggedtest)).encode("utf-8") + "\n") - print(f"Wrote tagger errors including context to {error_output}") - - # serializing the tagger to a pickle file and reloading (just to see it works) - if serialize_output is not None: - taggedtest = brill_tagger.tag_sents(testing_data) - with open(serialize_output, "w") as print_rules: - pickle.dump(brill_tagger, print_rules) - print(f"Wrote pickled tagger to {serialize_output}") - with open(serialize_output) as print_rules: - brill_tagger_reloaded 
= pickle.load(print_rules) - print(f"Reloaded pickled tagger from {serialize_output}") - taggedtest_reloaded = brill_tagger.tag_sents(testing_data) - if taggedtest == taggedtest_reloaded: - print("Reloaded tagger tried on test set, results identical") - else: - print("PROBLEM: Reloaded tagger gave different results on test set") - - -def _demo_prepare_data( - tagged_data, train, num_sents, randomize, separate_baseline_data -): - # train is the proportion of data used in training; the rest is reserved - # for testing. - if tagged_data is None: - print("Loading tagged data from treebank... ") - tagged_data = treebank.tagged_sents() - if num_sents is None or len(tagged_data) <= num_sents: - num_sents = len(tagged_data) - if randomize: - random.seed(len(tagged_data)) - random.shuffle(tagged_data) - cutoff = int(num_sents * train) - training_data = tagged_data[:cutoff] - gold_data = tagged_data[cutoff:num_sents] - testing_data = [[t[0] for t in sent] for sent in gold_data] - if not separate_baseline_data: - baseline_data = training_data - else: - bl_cutoff = len(training_data) // 3 - (baseline_data, training_data) = ( - training_data[:bl_cutoff], - training_data[bl_cutoff:], - ) - (trainseqs, traintokens) = corpus_size(training_data) - (testseqs, testtokens) = corpus_size(testing_data) - (bltrainseqs, bltraintokens) = corpus_size(baseline_data) - print(f"Read testing data ({testseqs:d} sents/{testtokens:d} wds)") - print(f"Read training data ({trainseqs:d} sents/{traintokens:d} wds)") - print( - "Read baseline data ({:d} sents/{:d} wds) {:s}".format( - bltrainseqs, - bltraintokens, - "" if separate_baseline_data else "[reused the training set]", - ) - ) - return (training_data, baseline_data, gold_data, testing_data) - - -def _demo_plot(learning_curve_output, teststats, trainstats=None, take=None): - testcurve = [teststats["initialerrors"]] - for rulescore in teststats["rulescores"]: - testcurve.append(testcurve[-1] - rulescore) - testcurve = [1 - x / teststats["tokencount"] for x in testcurve[:take]] - - traincurve = [trainstats["initialerrors"]] - for rulescore in trainstats["rulescores"]: - traincurve.append(traincurve[-1] - rulescore) - traincurve = [1 - x / trainstats["tokencount"] for x in traincurve[:take]] - - import matplotlib.pyplot as plt - - r = list(range(len(testcurve))) - plt.plot(r, testcurve, r, traincurve) - plt.axis([None, None, None, 1.0]) - plt.savefig(learning_curve_output) - - -NN_CD_TAGGER = RegexpTagger([(r"^-?[0-9]+(\.[0-9]+)?$", "CD"), (r".*", "NN")]) - -REGEXP_TAGGER = RegexpTagger( - [ - (r"^-?[0-9]+(\.[0-9]+)?$", "CD"), # cardinal numbers - (r"(The|the|A|a|An|an)$", "AT"), # articles - (r".*able$", "JJ"), # adjectives - (r".*ness$", "NN"), # nouns formed from adjectives - (r".*ly$", "RB"), # adverbs - (r".*s$", "NNS"), # plural nouns - (r".*ing$", "VBG"), # gerunds - (r".*ed$", "VBD"), # past tense verbs - (r".*", "NN"), # nouns (default) - ] -) - - -def corpus_size(seqs): - return (len(seqs), sum(len(x) for x in seqs)) - - -if __name__ == "__main__": - demo_learning_curve() diff --git a/pipeline/nltk/tbl/erroranalysis.py b/pipeline/nltk/tbl/erroranalysis.py deleted file mode 100644 index 8b192e75d8b410942960cbf5ea1476a42f0decf7..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tbl/erroranalysis.py +++ /dev/null @@ -1,38 +0,0 @@ -# Natural Language Toolkit: Transformation-based learning -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Marcus Uneson -# based on previous (nltk2) version by -# Christopher Maloof, Edward Loper, Steven Bird -# URL: -# 
For license information, see LICENSE.TXT - -# returns a list of errors in string format - - -def error_list(train_sents, test_sents): - """ - Returns a list of human-readable strings indicating the errors in the - given tagging of the corpus. - - :param train_sents: The correct tagging of the corpus - :type train_sents: list(tuple) - :param test_sents: The tagged corpus - :type test_sents: list(tuple) - """ - hdr = ("%25s | %s | %s\n" + "-" * 26 + "+" + "-" * 24 + "+" + "-" * 26) % ( - "left context", - "word/test->gold".center(22), - "right context", - ) - errors = [hdr] - for (train_sent, test_sent) in zip(train_sents, test_sents): - for wordnum, (word, train_pos) in enumerate(train_sent): - test_pos = test_sent[wordnum][1] - if train_pos != test_pos: - left = " ".join("%s/%s" % w for w in train_sent[:wordnum]) - right = " ".join("%s/%s" % w for w in train_sent[wordnum + 1 :]) - mid = f"{word}/{test_pos}->{train_pos}" - errors.append(f"{left[-25:]:>25} | {mid.center(22)} | {right[:25]}") - - return errors diff --git a/pipeline/nltk/tbl/feature.py b/pipeline/nltk/tbl/feature.py deleted file mode 100644 index 568425918db4b4b7910ef0d216b03bd10411d287..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tbl/feature.py +++ /dev/null @@ -1,267 +0,0 @@ -# Natural Language Toolkit: Transformation-based learning -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Marcus Uneson -# based on previous (nltk2) version by -# Christopher Maloof, Edward Loper, Steven Bird -# URL: -# For license information, see LICENSE.TXT - -from abc import ABCMeta, abstractmethod - - -class Feature(metaclass=ABCMeta): - """ - An abstract base class for Features. A Feature is a combination of - a specific property-computing method and a list of relative positions - to apply that method to. - - The property-computing method, M{extract_property(tokens, index)}, - must be implemented by every subclass. It extracts or computes a specific - property for the token at the current index. Typical extract_property() - methods return features such as the token text or tag; but more involved - methods may consider the entire sequence M{tokens} and - for instance compute the length of the sentence the token belongs to. - - In addition, the subclass may have a PROPERTY_NAME, which is how - it will be printed (in Rules and Templates, etc). If not given, defaults - to the classname. - - """ - - json_tag = "nltk.tbl.Feature" - PROPERTY_NAME = None - - def __init__(self, positions, end=None): - """ - Construct a Feature which may apply at C{positions}. 
- - >>> # For instance, importing some concrete subclasses (Feature is abstract) - >>> from nltk.tag.brill import Word, Pos - - >>> # Feature Word, applying at one of [-2, -1] - >>> Word([-2,-1]) - Word([-2, -1]) - - >>> # Positions need not be contiguous - >>> Word([-2,-1, 1]) - Word([-2, -1, 1]) - - >>> # Contiguous ranges can alternatively be specified giving the - >>> # two endpoints (inclusive) - >>> Pos(-3, -1) - Pos([-3, -2, -1]) - - >>> # In two-arg form, start <= end is enforced - >>> Pos(2, 1) - Traceback (most recent call last): - File "", line 1, in - File "nltk/tbl/template.py", line 306, in __init__ - raise TypeError - ValueError: illegal interval specification: (start=2, end=1) - - :type positions: list of int - :param positions: the positions at which this features should apply - :raises ValueError: illegal position specifications - - An alternative calling convention, for contiguous positions only, - is Feature(start, end): - - :type start: int - :param start: start of range where this feature should apply - :type end: int - :param end: end of range (NOTE: inclusive!) where this feature should apply - """ - self.positions = None # to avoid warnings - if end is None: - self.positions = tuple(sorted({int(i) for i in positions})) - else: # positions was actually not a list, but only the start index - try: - if positions > end: - raise TypeError - self.positions = tuple(range(positions, end + 1)) - except TypeError as e: - # let any kind of erroneous spec raise ValueError - raise ValueError( - "illegal interval specification: (start={}, end={})".format( - positions, end - ) - ) from e - - # set property name given in subclass, or otherwise name of subclass - self.PROPERTY_NAME = self.__class__.PROPERTY_NAME or self.__class__.__name__ - - def encode_json_obj(self): - return self.positions - - @classmethod - def decode_json_obj(cls, obj): - positions = obj - return cls(positions) - - def __repr__(self): - return f"{self.__class__.__name__}({list(self.positions)!r})" - - @classmethod - def expand(cls, starts, winlens, excludezero=False): - """ - Return a list of features, one for each start point in starts - and for each window length in winlen. 
If excludezero is True, - no Features containing 0 in its positions will be generated - (many tbl trainers have a special representation for the - target feature at [0]) - - For instance, importing a concrete subclass (Feature is abstract) - - >>> from nltk.tag.brill import Word - - First argument gives the possible start positions, second the - possible window lengths - - >>> Word.expand([-3,-2,-1], [1]) - [Word([-3]), Word([-2]), Word([-1])] - - >>> Word.expand([-2,-1], [1]) - [Word([-2]), Word([-1])] - - >>> Word.expand([-3,-2,-1], [1,2]) - [Word([-3]), Word([-2]), Word([-1]), Word([-3, -2]), Word([-2, -1])] - - >>> Word.expand([-2,-1], [1]) - [Word([-2]), Word([-1])] - - A third optional argument excludes all Features whose positions contain zero - - >>> Word.expand([-2,-1,0], [1,2], excludezero=False) - [Word([-2]), Word([-1]), Word([0]), Word([-2, -1]), Word([-1, 0])] - - >>> Word.expand([-2,-1,0], [1,2], excludezero=True) - [Word([-2]), Word([-1]), Word([-2, -1])] - - All window lengths must be positive - - >>> Word.expand([-2,-1], [0]) - Traceback (most recent call last): - File "", line 1, in - File "nltk/tag/tbl/template.py", line 371, in expand - :param starts: where to start looking for Feature - ValueError: non-positive window length in [0] - - :param starts: where to start looking for Feature - :type starts: list of ints - :param winlens: window lengths where to look for Feature - :type starts: list of ints - :param excludezero: do not output any Feature with 0 in any of its positions. - :type excludezero: bool - :returns: list of Features - :raises ValueError: for non-positive window lengths - """ - if not all(x > 0 for x in winlens): - raise ValueError(f"non-positive window length in {winlens}") - xs = (starts[i : i + w] for w in winlens for i in range(len(starts) - w + 1)) - return [cls(x) for x in xs if not (excludezero and 0 in x)] - - def issuperset(self, other): - """ - Return True if this Feature always returns True when other does - - More precisely, return True if this feature refers to the same property as other; - and this Feature looks at all positions that other does (and possibly - other positions in addition). - - #For instance, importing a concrete subclass (Feature is abstract) - >>> from nltk.tag.brill import Word, Pos - - >>> Word([-3,-2,-1]).issuperset(Word([-3,-2])) - True - - >>> Word([-3,-2,-1]).issuperset(Word([-3,-2, 0])) - False - - #Feature subclasses must agree - >>> Word([-3,-2,-1]).issuperset(Pos([-3,-2])) - False - - :param other: feature with which to compare - :type other: (subclass of) Feature - :return: True if this feature is superset, otherwise False - :rtype: bool - - - """ - return self.__class__ is other.__class__ and set(self.positions) >= set( - other.positions - ) - - def intersects(self, other): - """ - Return True if the positions of this Feature intersects with those of other - - More precisely, return True if this feature refers to the same property as other; - and there is some overlap in the positions they look at. 
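Since extract_property() is the only abstract method, defining a new feature type is short. A hypothetical subclass, sketched purely from the interface described above; the name Suffix2 and the property it computes are invented for illustration:

    from nltk.tbl import Feature

    class Suffix2(Feature):
        """Hypothetical feature: the last two characters of the word at a position."""

        PROPERTY_NAME = "suffix2"   # how the feature is printed in rules and templates

        @staticmethod
        def extract_property(tokens, index):
            # tokens is a tagged sentence, i.e. a list of (word, tag) pairs
            return tokens[index][0][-2:]

    # behaves like the built-in Word/Pos features
    print(Suffix2([-1, 0]))            # -> Suffix2([-1, 0])
    print(Suffix2([-1, 0]).positions)  # -> (-1, 0)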
- - #For instance, importing a concrete subclass (Feature is abstract) - >>> from nltk.tag.brill import Word, Pos - - >>> Word([-3,-2,-1]).intersects(Word([-3,-2])) - True - - >>> Word([-3,-2,-1]).intersects(Word([-3,-2, 0])) - True - - >>> Word([-3,-2,-1]).intersects(Word([0])) - False - - #Feature subclasses must agree - >>> Word([-3,-2,-1]).intersects(Pos([-3,-2])) - False - - :param other: feature with which to compare - :type other: (subclass of) Feature - :return: True if feature classes agree and there is some overlap in the positions they look at - :rtype: bool - """ - - return bool( - self.__class__ is other.__class__ - and set(self.positions) & set(other.positions) - ) - - # Rich comparisons for Features. With @functools.total_ordering (Python 2.7+), - # it will be enough to define __lt__ and __eq__ - def __eq__(self, other): - return self.__class__ is other.__class__ and self.positions == other.positions - - def __lt__(self, other): - return ( - self.__class__.__name__ < other.__class__.__name__ - or - # self.positions is a sorted tuple of ints - self.positions < other.positions - ) - - def __ne__(self, other): - return not (self == other) - - def __gt__(self, other): - return other < self - - def __ge__(self, other): - return not self < other - - def __le__(self, other): - return self < other or self == other - - @staticmethod - @abstractmethod - def extract_property(tokens, index): - """ - Any subclass of Feature must define static method extract_property(tokens, index) - - :param tokens: the sequence of tokens - :type tokens: list of tokens - :param index: the current index - :type index: int - :return: feature value - :rtype: any (but usually scalar) - """ diff --git a/pipeline/nltk/tbl/rule.py b/pipeline/nltk/tbl/rule.py deleted file mode 100644 index 7faea23bd36ddbf974de4499bb1f9106a78e4c0e..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tbl/rule.py +++ /dev/null @@ -1,322 +0,0 @@ -# Natural Language Toolkit: Transformation-based learning -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Marcus Uneson -# based on previous (nltk2) version by -# Christopher Maloof, Edward Loper, Steven Bird -# URL: -# For license information, see LICENSE.TXT - -from abc import ABCMeta, abstractmethod - -from nltk import jsontags - - -###################################################################### -# Tag Rules -###################################################################### -class TagRule(metaclass=ABCMeta): - """ - An interface for tag transformations on a tagged corpus, as - performed by tbl taggers. Each transformation finds all tokens - in the corpus that are tagged with a specific original tag and - satisfy a specific condition, and replaces their tags with a - replacement tag. For any given transformation, the original - tag, replacement tag, and condition are fixed. Conditions may - depend on the token under consideration, as well as any other - tokens in the corpus. - - Tag rules must be comparable and hashable. - """ - - def __init__(self, original_tag, replacement_tag): - - self.original_tag = original_tag - """The tag which this TagRule may cause to be replaced.""" - - self.replacement_tag = replacement_tag - """The tag with which this TagRule may replace another tag.""" - - def apply(self, tokens, positions=None): - """ - Apply this rule at every position in positions where it - applies to the given sentence. 
I.e., for each position p - in *positions*, if *tokens[p]* is tagged with this rule's - original tag, and satisfies this rule's condition, then set - its tag to be this rule's replacement tag. - - :param tokens: The tagged sentence - :type tokens: list(tuple(str, str)) - :type positions: list(int) - :param positions: The positions where the transformation is to - be tried. If not specified, try it at all positions. - :return: The indices of tokens whose tags were changed by this - rule. - :rtype: int - """ - if positions is None: - positions = list(range(len(tokens))) - - # Determine the indices at which this rule applies. - change = [i for i in positions if self.applies(tokens, i)] - - # Make the changes. Note: this must be done in a separate - # step from finding applicable locations, since we don't want - # the rule to interact with itself. - for i in change: - tokens[i] = (tokens[i][0], self.replacement_tag) - - return change - - @abstractmethod - def applies(self, tokens, index): - """ - :return: True if the rule would change the tag of - ``tokens[index]``, False otherwise - :rtype: bool - :param tokens: A tagged sentence - :type tokens: list(str) - :param index: The index to check - :type index: int - """ - - # Rules must be comparable and hashable for the algorithm to work - def __eq__(self, other): - raise TypeError("Rules must implement __eq__()") - - def __ne__(self, other): - raise TypeError("Rules must implement __ne__()") - - def __hash__(self): - raise TypeError("Rules must implement __hash__()") - - -@jsontags.register_tag -class Rule(TagRule): - """ - A Rule checks the current corpus position for a certain set of conditions; - if they are all fulfilled, the Rule is triggered, meaning that it - will change tag A to tag B. For other tags than A, nothing happens. - - The conditions are parameters to the Rule instance. Each condition is a feature-value pair, - with a set of positions to check for the value of the corresponding feature. - Conceptually, the positions are joined by logical OR, and the feature set by logical AND. - - More formally, the Rule is then applicable to the M{n}th token iff: - - - The M{n}th token is tagged with the Rule's original tag; and - - For each (Feature(positions), M{value}) tuple: - - - The value of Feature of at least one token in {n+p for p in positions} - is M{value}. - """ - - json_tag = "nltk.tbl.Rule" - - def __init__(self, templateid, original_tag, replacement_tag, conditions): - """ - Construct a new Rule that changes a token's tag from - C{original_tag} to C{replacement_tag} if all of the properties - specified in C{conditions} hold. - - :param templateid: the template id (a zero-padded string, '001' etc, - so it will sort nicely) - :type templateid: string - - :param conditions: A list of Feature(positions), - each of which specifies that the property (computed by - Feature.extract_property()) of at least one - token in M{n} + p in positions is C{value}. 
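Concretely, a Rule can be built and applied by hand. The sketch below uses only the constructor signature documented here (it mirrors the doctest further down); the sentence is a toy example:

    from nltk.tbl.rule import Rule
    from nltk.tag.brill import Pos

    # change VB to NN when the tag one or two positions to the left is DT
    r = Rule("001", "VB", "NN", [(Pos([-2, -1]), "DT")])

    sent = [("the", "DT"), ("cut", "VB"), ("was", "VBD"), ("deep", "JJ")]
    changed = r.apply(sent)   # tags are rewritten in place

    print(changed)            # -> [1]
    print(sent[1])            # -> ('cut', 'NN')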
- :type conditions: C{iterable} of C{Feature} - - """ - TagRule.__init__(self, original_tag, replacement_tag) - self._conditions = conditions - self.templateid = templateid - - def encode_json_obj(self): - return { - "templateid": self.templateid, - "original": self.original_tag, - "replacement": self.replacement_tag, - "conditions": self._conditions, - } - - @classmethod - def decode_json_obj(cls, obj): - return cls( - obj["templateid"], - obj["original"], - obj["replacement"], - tuple(tuple(feat) for feat in obj["conditions"]), - ) - - def applies(self, tokens, index): - # Inherit docs from TagRule - - # Does the given token have this Rule's "original tag"? - if tokens[index][1] != self.original_tag: - return False - - # Check to make sure that every condition holds. - for (feature, val) in self._conditions: - - # Look for *any* token that satisfies the condition. - for pos in feature.positions: - if not (0 <= index + pos < len(tokens)): - continue - if feature.extract_property(tokens, index + pos) == val: - break - else: - # No token satisfied the condition; return false. - return False - - # Every condition checked out, so the Rule is applicable. - return True - - def __eq__(self, other): - return self is other or ( - other is not None - and other.__class__ == self.__class__ - and self.original_tag == other.original_tag - and self.replacement_tag == other.replacement_tag - and self._conditions == other._conditions - ) - - def __ne__(self, other): - return not (self == other) - - def __hash__(self): - - # Cache our hash value (justified by profiling.) - try: - return self.__hash - except AttributeError: - self.__hash = hash(repr(self)) - return self.__hash - - def __repr__(self): - # Cache the repr (justified by profiling -- this is used as - # a sort key when deterministic=True.) - try: - return self.__repr - except AttributeError: - self.__repr = "{}('{}', {}, {}, [{}])".format( - self.__class__.__name__, - self.templateid, - repr(self.original_tag), - repr(self.replacement_tag), - # list(self._conditions) would be simpler but will not generate - # the same Rule.__repr__ in python 2 and 3 and thus break some tests - ", ".join(f"({f},{repr(v)})" for (f, v) in self._conditions), - ) - - return self.__repr - - def __str__(self): - def _condition_to_logic(feature, value): - """ - Return a compact, predicate-logic styled string representation - of the given condition. - """ - return "{}:{}@[{}]".format( - feature.PROPERTY_NAME, - value, - ",".join(str(w) for w in feature.positions), - ) - - conditions = " & ".join( - [_condition_to_logic(f, v) for (f, v) in self._conditions] - ) - s = f"{self.original_tag}->{self.replacement_tag} if {conditions}" - - return s - - def format(self, fmt): - """ - Return a string representation of this rule. 
- - >>> from nltk.tbl.rule import Rule - >>> from nltk.tag.brill import Pos - - >>> r = Rule("23", "VB", "NN", [(Pos([-2,-1]), 'DT')]) - - r.format("str") == str(r) - True - >>> r.format("str") - 'VB->NN if Pos:DT@[-2,-1]' - - r.format("repr") == repr(r) - True - >>> r.format("repr") - "Rule('23', 'VB', 'NN', [(Pos([-2, -1]),'DT')])" - - >>> r.format("verbose") - 'VB -> NN if the Pos of words i-2...i-1 is "DT"' - - >>> r.format("not_found") - Traceback (most recent call last): - File "", line 1, in - File "nltk/tbl/rule.py", line 256, in format - raise ValueError("unknown rule format spec: {0}".format(fmt)) - ValueError: unknown rule format spec: not_found - >>> - - :param fmt: format specification - :type fmt: str - :return: string representation - :rtype: str - """ - if fmt == "str": - return self.__str__() - elif fmt == "repr": - return self.__repr__() - elif fmt == "verbose": - return self._verbose_format() - else: - raise ValueError(f"unknown rule format spec: {fmt}") - - def _verbose_format(self): - """ - Return a wordy, human-readable string representation - of the given rule. - - Not sure how useful this is. - """ - - def condition_to_str(feature, value): - return 'the {} of {} is "{}"'.format( - feature.PROPERTY_NAME, - range_to_str(feature.positions), - value, - ) - - def range_to_str(positions): - if len(positions) == 1: - p = positions[0] - if p == 0: - return "this word" - if p == -1: - return "the preceding word" - elif p == 1: - return "the following word" - elif p < 0: - return "word i-%d" % -p - elif p > 0: - return "word i+%d" % p - else: - # for complete compatibility with the wordy format of nltk2 - mx = max(positions) - mn = min(positions) - if mx - mn == len(positions) - 1: - return "words i%+d...i%+d" % (mn, mx) - else: - return "words {{{}}}".format( - ",".join("i%+d" % d for d in positions) - ) - - replacement = f"{self.original_tag} -> {self.replacement_tag}" - conditions = (" if " if self._conditions else "") + ", and ".join( - condition_to_str(f, v) for (f, v) in self._conditions - ) - return replacement + conditions diff --git a/pipeline/nltk/tbl/template.py b/pipeline/nltk/tbl/template.py deleted file mode 100644 index ac9ed5df52f5730bd767a04a121637a5c2be01d2..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tbl/template.py +++ /dev/null @@ -1,325 +0,0 @@ -# Natural Language Toolkit: Transformation-based learning -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Marcus Uneson -# based on previous (nltk2) version by -# Christopher Maloof, Edward Loper, Steven Bird -# URL: -# For license information, see LICENSE.TXT - -import itertools as it -from abc import ABCMeta, abstractmethod - -from nltk.tbl.feature import Feature -from nltk.tbl.rule import Rule - - -class BrillTemplateI(metaclass=ABCMeta): - """ - An interface for generating lists of transformational rules that - apply at given sentence positions. ``BrillTemplateI`` is used by - ``Brill`` training algorithms to generate candidate rules. - """ - - @abstractmethod - def applicable_rules(self, tokens, i, correctTag): - """ - Return a list of the transformational rules that would correct - the ``i``-th subtoken's tag in the given token. In particular, - return a list of zero or more rules that would change - ``tokens[i][1]`` to ``correctTag``, if applied to ``token[i]``. - - If the ``i``-th token already has the correct tag (i.e., if - ``tagged_tokens[i][1] == correctTag``), then - ``applicable_rules()`` should return the empty list. - - :param tokens: The tagged tokens being tagged. 
- :type tokens: list(tuple) - :param i: The index of the token whose tag should be corrected. - :type i: int - :param correctTag: The correct tag for the ``i``-th token. - :type correctTag: any - :rtype: list(BrillRule) - """ - - @abstractmethod - def get_neighborhood(self, token, index): - """ - Returns the set of indices *i* such that - ``applicable_rules(token, i, ...)`` depends on the value of - the *index*th token of *token*. - - This method is used by the "fast" Brill tagger trainer. - - :param token: The tokens being tagged. - :type token: list(tuple) - :param index: The index whose neighborhood should be returned. - :type index: int - :rtype: set - """ - - -class Template(BrillTemplateI): - """ - A tbl Template that generates a list of L{Rule}s that apply at a given sentence - position. In particular, each C{Template} is parameterized by a list of - independent features (a combination of a specific - property to extract and a list C{L} of relative positions at which to extract - it) and generates all Rules that: - - - use the given features, each at its own independent position; and - - are applicable to the given token. - """ - - ALLTEMPLATES = [] - # record a unique id of form "001", for each template created - # _ids = it.count(0) - - def __init__(self, *features): - - """ - Construct a Template for generating Rules. - - Takes a list of Features. A C{Feature} is a combination - of a specific property and its relative positions and should be - a subclass of L{nltk.tbl.feature.Feature}. - - An alternative calling convention (kept for backwards compatibility, - but less expressive as it only permits one feature type) is - Template(Feature, (start1, end1), (start2, end2), ...) - In new code, that would be better written - Template(Feature(start1, end1), Feature(start2, end2), ...) - - For instance, importing some features - - >>> from nltk.tbl.template import Template - >>> from nltk.tag.brill import Word, Pos - - Create some features - - >>> wfeat1, wfeat2, pfeat = (Word([-1]), Word([1,2]), Pos([-2,-1])) - - Create a single-feature template - - >>> Template(wfeat1) - Template(Word([-1])) - - Or a two-feature one - - >>> Template(wfeat1, wfeat2) - Template(Word([-1]),Word([1, 2])) - - Or a three-feature one with two different feature types - - >>> Template(wfeat1, wfeat2, pfeat) - Template(Word([-1]),Word([1, 2]),Pos([-2, -1])) - - deprecated api: Feature subclass, followed by list of (start,end) pairs - (permits only a single Feature) - - >>> Template(Word, (-2,-1), (0,0)) - Template(Word([-2, -1]),Word([0])) - - Incorrect specification raises TypeError - - >>> Template(Word, (-2,-1), Pos, (0,0)) - Traceback (most recent call last): - File "", line 1, in - File "nltk/tag/tbl/template.py", line 143, in __init__ - raise TypeError( - TypeError: expected either Feature1(args), Feature2(args), ... or Feature, (start1, end1), (start2, end2), ... - - :type features: list of Features - :param features: the features to build this Template on - """ - # determine the calling form: either - # Template(Feature, args1, [args2, ...)] - # Template(Feature1(args), Feature2(args), ...) - if all(isinstance(f, Feature) for f in features): - self._features = features - elif issubclass(features[0], Feature) and all( - isinstance(a, tuple) for a in features[1:] - ): - self._features = [features[0](*tp) for tp in features[1:]] - else: - raise TypeError( - "expected either Feature1(args), Feature2(args), ... or Feature, (start1, end1), (start2, end2), ..." 
- ) - self.id = f"{len(self.ALLTEMPLATES):03d}" - self.ALLTEMPLATES.append(self) - - def __repr__(self): - return "{}({})".format( - self.__class__.__name__, - ",".join([str(f) for f in self._features]), - ) - - def applicable_rules(self, tokens, index, correct_tag): - if tokens[index][1] == correct_tag: - return [] - - # For each of this Template's features, find the conditions - # that are applicable for the given token. - # Then, generate one Rule for each combination of features - # (the crossproduct of the conditions). - - applicable_conditions = self._applicable_conditions(tokens, index) - xs = list(it.product(*applicable_conditions)) - return [Rule(self.id, tokens[index][1], correct_tag, tuple(x)) for x in xs] - - def _applicable_conditions(self, tokens, index): - """ - :returns: A set of all conditions for rules - that are applicable to C{tokens[index]}. - """ - conditions = [] - - for feature in self._features: - conditions.append([]) - for pos in feature.positions: - if not (0 <= index + pos < len(tokens)): - continue - value = feature.extract_property(tokens, index + pos) - conditions[-1].append((feature, value)) - return conditions - - def get_neighborhood(self, tokens, index): - # inherit docs from BrillTemplateI - - # applicable_rules(tokens, index, ...) depends on index. - neighborhood = {index} # set literal for python 2.7+ - - # applicable_rules(tokens, i, ...) depends on index if - # i+start < index <= i+end. - - allpositions = [0] + [p for feat in self._features for p in feat.positions] - start, end = min(allpositions), max(allpositions) - s = max(0, index + (-end)) - e = min(index + (-start) + 1, len(tokens)) - for i in range(s, e): - neighborhood.add(i) - return neighborhood - - @classmethod - def expand(cls, featurelists, combinations=None, skipintersecting=True): - - """ - Factory method to mass generate Templates from a list L of lists of Features. - - #With combinations=(k1, k2), the function will in all possible ways choose k1 ... k2 - #of the sublists in L; it will output all Templates formed by the Cartesian product - #of this selection, with duplicates and other semantically equivalent - #forms removed. Default for combinations is (1, len(L)). - - The feature lists may have been specified - manually, or generated from Feature.expand(). 
For instance, - - >>> from nltk.tbl.template import Template - >>> from nltk.tag.brill import Word, Pos - - #creating some features - >>> (wd_0, wd_01) = (Word([0]), Word([0,1])) - - >>> (pos_m2, pos_m33) = (Pos([-2]), Pos([3-2,-1,0,1,2,3])) - - >>> list(Template.expand([[wd_0], [pos_m2]])) - [Template(Word([0])), Template(Pos([-2])), Template(Pos([-2]),Word([0]))] - - >>> list(Template.expand([[wd_0, wd_01], [pos_m2]])) - [Template(Word([0])), Template(Word([0, 1])), Template(Pos([-2])), Template(Pos([-2]),Word([0])), Template(Pos([-2]),Word([0, 1]))] - - #note: with Feature.expand(), it is very easy to generate more templates - #than your system can handle -- for instance, - >>> wordtpls = Word.expand([-2,-1,0,1], [1,2], excludezero=False) - >>> len(wordtpls) - 7 - - >>> postpls = Pos.expand([-3,-2,-1,0,1,2], [1,2,3], excludezero=True) - >>> len(postpls) - 9 - - #and now the Cartesian product of all non-empty combinations of two wordtpls and - #two postpls, with semantic equivalents removed - >>> templates = list(Template.expand([wordtpls, wordtpls, postpls, postpls])) - >>> len(templates) - 713 - - - will return a list of eight templates - Template(Word([0])), - Template(Word([0, 1])), - Template(Pos([-2])), - Template(Pos([-1])), - Template(Pos([-2]),Word([0])), - Template(Pos([-1]),Word([0])), - Template(Pos([-2]),Word([0, 1])), - Template(Pos([-1]),Word([0, 1]))] - - - #Templates where one feature is a subset of another, such as - #Template(Word([0,1]), Word([1]), will not appear in the output. - #By default, this non-subset constraint is tightened to disjointness: - #Templates of type Template(Word([0,1]), Word([1,2]) will also be filtered out. - #With skipintersecting=False, then such Templates are allowed - - WARNING: this method makes it very easy to fill all your memory when training - generated templates on any real-world corpus - - :param featurelists: lists of Features, whose Cartesian product will return a set of Templates - :type featurelists: list of (list of Features) - :param combinations: given n featurelists: if combinations=k, all generated Templates will have - k features; if combinations=(k1,k2) they will have k1..k2 features; if None, defaults to 1..n - :type combinations: None, int, or (int, int) - :param skipintersecting: if True, do not output intersecting Templates (non-disjoint positions for some feature) - :type skipintersecting: bool - :returns: generator of Templates - - """ - - def nonempty_powerset(xs): # xs is a list - # itertools docnonempty_powerset([1,2,3]) --> (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3) - - # find the correct tuple given combinations, one of {None, k, (k1,k2)} - k = combinations # for brevity - combrange = ( - (1, len(xs) + 1) - if k is None - else (k, k + 1) # n over 1 .. n over n (all non-empty combinations) - if isinstance(k, int) - else (k[0], k[1] + 1) # n over k (only - ) # n over k1, n over k1+1... 
n over k2 - return it.chain.from_iterable( - it.combinations(xs, r) for r in range(*combrange) - ) - - seentemplates = set() - for picks in nonempty_powerset(featurelists): - for pick in it.product(*picks): - if any( - i != j and x.issuperset(y) - for (i, x) in enumerate(pick) - for (j, y) in enumerate(pick) - ): - continue - if skipintersecting and any( - i != j and x.intersects(y) - for (i, x) in enumerate(pick) - for (j, y) in enumerate(pick) - ): - continue - thistemplate = cls(*sorted(pick)) - strpick = str(thistemplate) - #!!FIXME --this is hackish - if strpick in seentemplates: # already added - cls._poptemplate() - continue - seentemplates.add(strpick) - yield thistemplate - - @classmethod - def _cleartemplates(cls): - cls.ALLTEMPLATES = [] - - @classmethod - def _poptemplate(cls): - return cls.ALLTEMPLATES.pop() if cls.ALLTEMPLATES else None diff --git a/pipeline/nltk/test/__init__.py b/pipeline/nltk/test/__init__.py deleted file mode 100644 index fa54080263a1ad65b72b1c7aabba8186c77db25d..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Natural Language Toolkit: Unit Tests -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# URL: -# For license information, see LICENSE.TXT - -""" -Unit tests for the NLTK modules. These tests are intended to ensure -that source code changes don't accidentally introduce bugs. -For instructions, please see: - -../../web/dev/local_testing.rst - -https://github.com/nltk/nltk/blob/develop/web/dev/local_testing.rst - - -""" diff --git a/pipeline/nltk/test/__pycache__/__init__.cpython-39.pyc b/pipeline/nltk/test/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index 4ecad8ad326fcc4e48c8b0ab5272242a43aa6b1c..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/__pycache__/all.cpython-39.pyc b/pipeline/nltk/test/__pycache__/all.cpython-39.pyc deleted file mode 100644 index 210914837b9ce1d2dac183816b0382ac20595d03..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/__pycache__/all.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/__pycache__/childes_fixt.cpython-39.pyc b/pipeline/nltk/test/__pycache__/childes_fixt.cpython-39.pyc deleted file mode 100644 index 90da4d2af01e9af2eb919714afd221cfd18ea401..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/__pycache__/childes_fixt.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/__pycache__/classify_fixt.cpython-39.pyc b/pipeline/nltk/test/__pycache__/classify_fixt.cpython-39.pyc deleted file mode 100644 index 08f3609520be2785b6c88cbaf23e29b20776b571..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/__pycache__/classify_fixt.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/__pycache__/conftest.cpython-39.pyc b/pipeline/nltk/test/__pycache__/conftest.cpython-39.pyc deleted file mode 100644 index c53b12da48a827496bb14890b65755b084c2b8bb..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/__pycache__/conftest.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/__pycache__/gensim_fixt.cpython-39.pyc b/pipeline/nltk/test/__pycache__/gensim_fixt.cpython-39.pyc deleted file mode 100644 index 2e1c17f67b0eef13376ccef2f00b918af1d07cfb..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/__pycache__/gensim_fixt.cpython-39.pyc and 
/dev/null differ diff --git a/pipeline/nltk/test/__pycache__/gluesemantics_malt_fixt.cpython-39.pyc b/pipeline/nltk/test/__pycache__/gluesemantics_malt_fixt.cpython-39.pyc deleted file mode 100644 index 385e7a33fd7374b083e93e5522231917caf5f3bf..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/__pycache__/gluesemantics_malt_fixt.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/__pycache__/portuguese_en_fixt.cpython-39.pyc b/pipeline/nltk/test/__pycache__/portuguese_en_fixt.cpython-39.pyc deleted file mode 100644 index 128698cc5629ef46fd47265208543ea9a5855657..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/__pycache__/portuguese_en_fixt.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/__pycache__/probability_fixt.cpython-39.pyc b/pipeline/nltk/test/__pycache__/probability_fixt.cpython-39.pyc deleted file mode 100644 index d0b1356e993ce2bdfe82df01008ba0411f421968..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/__pycache__/probability_fixt.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/__pycache__/setup_fixt.cpython-39.pyc b/pipeline/nltk/test/__pycache__/setup_fixt.cpython-39.pyc deleted file mode 100644 index f20344524e078faa8c6adbea89bc03795834c868..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/__pycache__/setup_fixt.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/all.py b/pipeline/nltk/test/all.py deleted file mode 100644 index dd0d431e1c2fa356f31076768107b5da1e877bdd..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/all.py +++ /dev/null @@ -1,25 +0,0 @@ -"""Test suite that runs all NLTK tests. - -This module, `nltk.test.all`, is named as the NLTK ``test_suite`` in the -project's ``setup-eggs.py`` file. Here, we create a test suite that -runs all of our doctests, and return it for processing by the setuptools -test harness. - -""" -import doctest -import os.path -import unittest -from glob import glob - - -def additional_tests(): - # print("here-000000000000000") - # print("-----", glob(os.path.join(os.path.dirname(__file__), '*.doctest'))) - dir = os.path.dirname(__file__) - paths = glob(os.path.join(dir, "*.doctest")) - files = [os.path.basename(path) for path in paths] - return unittest.TestSuite([doctest.DocFileSuite(file) for file in files]) - - -# if os.path.split(path)[-1] != 'index.rst' -# skips time-dependent doctest in index.rst diff --git a/pipeline/nltk/test/bleu.doctest b/pipeline/nltk/test/bleu.doctest deleted file mode 100644 index d7e6e41f5a17e6f048a7264c72f657615e8567cc..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/bleu.doctest +++ /dev/null @@ -1,29 +0,0 @@ -========== -BLEU tests -========== - ->>> from nltk.translate import bleu - -If the candidate has no alignment to any of the references, the BLEU score is 0. - ->>> bleu( -... ['The candidate has no alignment to any of the references'.split()], -... 'John loves Mary'.split(), -... (1,), -... ) -0 - -This is an implementation of the smoothing techniques -for segment-level BLEU scores that was presented in -Boxing Chen and Collin Cherry (2014) A Systematic Comparison of -Smoothing Techniques for Sentence-Level BLEU. In WMT14. -http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf ->>> from nltk.translate.bleu_score import sentence_bleu,SmoothingFunction - - ->>> sentence_bleu( -... ['It is a place of quiet contemplation .'.split()], -... 'It is .'.split(), -... 
smoothing_function=SmoothingFunction().method4, -... )*100 -4.4267... diff --git a/pipeline/nltk/test/bnc.doctest b/pipeline/nltk/test/bnc.doctest deleted file mode 100644 index 80e1945d241d3a2f3b20f3550a160a4f70bd2bad..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/bnc.doctest +++ /dev/null @@ -1,60 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - - >>> import os.path - - >>> from nltk.corpus.reader import BNCCorpusReader - >>> import nltk.test - - >>> root = os.path.dirname(nltk.test.__file__) - >>> bnc = BNCCorpusReader(root=root, fileids='FX8.xml') - -Checking the word access. -------------------------- - - >>> len(bnc.words()) - 151 - - >>> bnc.words()[:6] - ['Ah', 'there', 'we', 'are', ',', '.'] - >>> bnc.words(stem=True)[:6] - ['ah', 'there', 'we', 'be', ',', '.'] - - >>> bnc.tagged_words()[:6] - [('Ah', 'INTERJ'), ('there', 'ADV'), ('we', 'PRON'), ('are', 'VERB'), (',', 'PUN'), ('.', 'PUN')] - - >>> bnc.tagged_words(c5=True)[:6] - [('Ah', 'ITJ'), ('there', 'AV0'), ('we', 'PNP'), ('are', 'VBB'), (',', 'PUN'), ('.', 'PUN')] - -Testing access to the sentences. --------------------------------- - - >>> len(bnc.sents()) - 15 - - >>> bnc.sents()[0] - ['Ah', 'there', 'we', 'are', ',', '.'] - >>> bnc.sents(stem=True)[0] - ['ah', 'there', 'we', 'be', ',', '.'] - - >>> bnc.tagged_sents()[0] - [('Ah', 'INTERJ'), ('there', 'ADV'), ('we', 'PRON'), ('are', 'VERB'), (',', 'PUN'), ('.', 'PUN')] - >>> bnc.tagged_sents(c5=True)[0] - [('Ah', 'ITJ'), ('there', 'AV0'), ('we', 'PNP'), ('are', 'VBB'), (',', 'PUN'), ('.', 'PUN')] - -A not lazy loader. ------------------- - - >>> eager = BNCCorpusReader(root=root, fileids=r'FX8.xml', lazy=False) - - >>> len(eager.words()) - 151 - >>> eager.words(stem=True)[6:17] - ['right', 'abdominal', 'wound', ',', 'she', 'be', 'a', 'wee', 'bit', 'confuse', '.'] - - >>> eager.tagged_words()[6:11] - [('Right', 'ADV'), ('abdominal', 'ADJ'), ('wound', 'SUBST'), (',', 'PUN'), ('she', 'PRON')] - >>> eager.tagged_words(c5=True)[6:17] - [('Right', 'AV0'), ('abdominal', 'AJ0'), ('wound', 'NN1'), (',', 'PUN'), ('she', 'PNP'), ("'s", 'VBZ'), ('a', 'AT0'), ('wee', 'AJ0-NN1'), ('bit', 'NN1'), ('confused', 'VVN-AJ0'), ('.', 'PUN')] - >>> len(eager.sents()) - 15 diff --git a/pipeline/nltk/test/ccg.doctest b/pipeline/nltk/test/ccg.doctest deleted file mode 100644 index 9c1e642c5e32ee0afe4c3d689d6461807a1db738..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/ccg.doctest +++ /dev/null @@ -1,376 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -============================== -Combinatory Categorial Grammar -============================== - -Relative Clauses ----------------- - - >>> from nltk.ccg import chart, lexicon - -Construct a lexicon: - - >>> lex = lexicon.fromstring(''' - ... :- S, NP, N, VP - ... - ... Det :: NP/N - ... Pro :: NP - ... Modal :: S\\NP/VP - ... - ... TV :: VP/NP - ... DTV :: TV/NP - ... - ... the => Det - ... - ... that => Det - ... that => NP - ... - ... I => Pro - ... you => Pro - ... we => Pro - ... - ... chef => N - ... cake => N - ... children => N - ... dough => N - ... - ... will => Modal - ... should => Modal - ... might => Modal - ... must => Modal - ... - ... and => var\\.,var/.,var - ... - ... to => VP[to]/VP - ... - ... without => (VP\\VP)/VP[ing] - ... - ... be => TV - ... cook => TV - ... eat => TV - ... - ... cooking => VP[ing]/NP - ... - ... give => DTV - ... - ... is => (S\\NP)/NP - ... prefer => (S\\NP)/NP - ... - ... 
which => (N\\N)/(S/NP) - ... - ... persuade => (VP/VP[to])/NP - ... ''') - - >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet) - >>> for parse in parser.parse("you prefer that cake".split()): - ... chart.printCCGDerivation(parse) - ... break - ... - you prefer that cake - NP ((S\NP)/NP) (NP/N) N - --------------> - NP - ---------------------------> - (S\NP) - --------------------------------< - S - - >>> for parse in parser.parse("that is the cake which you prefer".split()): - ... chart.printCCGDerivation(parse) - ... break - ... - that is the cake which you prefer - NP ((S\NP)/NP) (NP/N) N ((N\N)/(S/NP)) NP ((S\NP)/NP) - ----->T - (S/(S\NP)) - ------------------>B - (S/NP) - ----------------------------------> - (N\N) - ----------------------------------------< - N - ------------------------------------------------> - NP - -------------------------------------------------------------> - (S\NP) - -------------------------------------------------------------------< - S - - -Some other sentences to try: -"that is the cake which we will persuade the chef to cook" -"that is the cake which we will persuade the chef to give the children" - - >>> sent = "that is the dough which you will eat without cooking".split() - >>> nosub_parser = chart.CCGChartParser(lex, chart.ApplicationRuleSet + - ... chart.CompositionRuleSet + chart.TypeRaiseRuleSet) - -Without Substitution (no output) - - >>> for parse in nosub_parser.parse(sent): - ... chart.printCCGDerivation(parse) - -With Substitution: - - >>> for parse in parser.parse(sent): - ... chart.printCCGDerivation(parse) - ... break - ... - that is the dough which you will eat without cooking - NP ((S\NP)/NP) (NP/N) N ((N\N)/(S/NP)) NP ((S\NP)/VP) (VP/NP) ((VP\VP)/VP['ing']) (VP['ing']/NP) - ----->T - (S/(S\NP)) - ------------------------------------->B - ((VP\VP)/NP) - ----------------------------------------------B - ((S\NP)/NP) - ---------------------------------------------------------------->B - (S/NP) - --------------------------------------------------------------------------------> - (N\N) - ---------------------------------------------------------------------------------------< - N - -----------------------------------------------------------------------------------------------> - NP - ------------------------------------------------------------------------------------------------------------> - (S\NP) - ------------------------------------------------------------------------------------------------------------------< - S - - -Conjunction ------------ - - >>> from nltk.ccg.chart import CCGChartParser, ApplicationRuleSet, CompositionRuleSet - >>> from nltk.ccg.chart import SubstitutionRuleSet, TypeRaiseRuleSet, printCCGDerivation - >>> from nltk.ccg import lexicon - -Lexicons for the tests: - - >>> test1_lex = ''' - ... :- S,N,NP,VP - ... I => NP - ... you => NP - ... will => S\\NP/VP - ... cook => VP/NP - ... which => (N\\N)/(S/NP) - ... and => var\\.,var/.,var - ... might => S\\NP/VP - ... eat => VP/NP - ... the => NP/N - ... mushrooms => N - ... parsnips => N''' - >>> test2_lex = ''' - ... :- N, S, NP, VP - ... articles => N - ... the => NP/N - ... and => var\\.,var/.,var - ... which => (N\\N)/(S/NP) - ... I => NP - ... anyone => NP - ... will => (S/VP)\\NP - ... file => VP/NP - ... without => (VP\\VP)/VP[ing] - ... forget => VP/NP - ... reading => VP[ing]/NP - ... ''' - -Tests handling of conjunctions. -Note that while the two derivations are different, they are semantically equivalent. 
- - >>> lex = lexicon.fromstring(test1_lex) - >>> parser = CCGChartParser(lex, ApplicationRuleSet + CompositionRuleSet + SubstitutionRuleSet) - >>> for parse in parser.parse("I will cook and might eat the mushrooms and parsnips".split()): - ... printCCGDerivation(parse) - I will cook and might eat the mushrooms and parsnips - NP ((S\NP)/VP) (VP/NP) ((_var0\.,_var0)/.,_var0) ((S\NP)/VP) (VP/NP) (NP/N) N ((_var0\.,_var0)/.,_var0) N - ---------------------->B - ((S\NP)/NP) - ---------------------->B - ((S\NP)/NP) - -------------------------------------------------> - (((S\NP)/NP)\.,((S\NP)/NP)) - -----------------------------------------------------------------------< - ((S\NP)/NP) - -------------------------------------> - (N\.,N) - ------------------------------------------------< - N - --------------------------------------------------------> - NP - -------------------------------------------------------------------------------------------------------------------------------> - (S\NP) - -----------------------------------------------------------------------------------------------------------------------------------< - S - I will cook and might eat the mushrooms and parsnips - NP ((S\NP)/VP) (VP/NP) ((_var0\.,_var0)/.,_var0) ((S\NP)/VP) (VP/NP) (NP/N) N ((_var0\.,_var0)/.,_var0) N - ---------------------->B - ((S\NP)/NP) - ---------------------->B - ((S\NP)/NP) - -------------------------------------------------> - (((S\NP)/NP)\.,((S\NP)/NP)) - -----------------------------------------------------------------------< - ((S\NP)/NP) - ------------------------------------------------------------------------------->B - ((S\NP)/N) - -------------------------------------> - (N\.,N) - ------------------------------------------------< - N - -------------------------------------------------------------------------------------------------------------------------------> - (S\NP) - -----------------------------------------------------------------------------------------------------------------------------------< - S - - -Tests handling subject extraction. -Interesting to point that the two parses are clearly semantically different. - - >>> lex = lexicon.fromstring(test2_lex) - >>> parser = CCGChartParser(lex, ApplicationRuleSet + CompositionRuleSet + SubstitutionRuleSet) - >>> for parse in parser.parse("articles which I will file and forget without reading".split()): - ... 
printCCGDerivation(parse) - articles which I will file and forget without reading - N ((N\N)/(S/NP)) NP ((S/VP)\NP) (VP/NP) ((_var0\.,_var0)/.,_var0) (VP/NP) ((VP\VP)/VP['ing']) (VP['ing']/NP) - -----------------< - (S/VP) - ------------------------------------->B - ((VP\VP)/NP) - ---------------------------------------------- - ((VP/NP)\.,(VP/NP)) - ----------------------------------------------------------------------------------< - (VP/NP) - --------------------------------------------------------------------------------------------------->B - (S/NP) - -------------------------------------------------------------------------------------------------------------------> - (N\N) - -----------------------------------------------------------------------------------------------------------------------------< - N - articles which I will file and forget without reading - N ((N\N)/(S/NP)) NP ((S/VP)\NP) (VP/NP) ((_var0\.,_var0)/.,_var0) (VP/NP) ((VP\VP)/VP['ing']) (VP['ing']/NP) - -----------------< - (S/VP) - ------------------------------------> - ((VP/NP)\.,(VP/NP)) - ---------------------------------------------< - (VP/NP) - ------------------------------------->B - ((VP\VP)/NP) - ----------------------------------------------------------------------------------B - (S/NP) - -------------------------------------------------------------------------------------------------------------------> - (N\N) - -----------------------------------------------------------------------------------------------------------------------------< - N - - -Unicode support ---------------- - -Unicode words are supported. - - >>> from nltk.ccg import chart, lexicon - -Lexicons for the tests: - - >>> lex = lexicon.fromstring(''' - ... :- S, N, NP, PP - ... - ... AdjI :: N\\N - ... AdjD :: N/N - ... AdvD :: S/S - ... AdvI :: S\\S - ... Det :: NP/N - ... PrepNPCompl :: PP/NP - ... PrepNAdjN :: S\\S/N - ... PrepNAdjNP :: S\\S/NP - ... VPNP :: S\\NP/NP - ... VPPP :: S\\NP/PP - ... VPser :: S\\NP/AdjI - ... - ... auto => N - ... bebidas => N - ... cine => N - ... ley => N - ... libro => N - ... ministro => N - ... panadería => N - ... presidente => N - ... super => N - ... - ... el => Det - ... la => Det - ... las => Det - ... un => Det - ... - ... Ana => NP - ... Pablo => NP - ... - ... y => var\\.,var/.,var - ... - ... pero => (S/NP)\\(S/NP)/(S/NP) - ... - ... anunció => VPNP - ... compró => VPNP - ... cree => S\\NP/S[dep] - ... desmintió => VPNP - ... lee => VPNP - ... fueron => VPPP - ... - ... es => VPser - ... - ... interesante => AdjD - ... interesante => AdjI - ... nueva => AdjD - ... nueva => AdjI - ... - ... a => PrepNPCompl - ... en => PrepNAdjN - ... en => PrepNAdjNP - ... - ... ayer => AdvI - ... - ... que => (NP\\NP)/(S/NP) - ... que => S[dep]/S - ... ''') - - >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet) - >>> for parse in parser.parse(u"el ministro anunció pero el presidente desmintió la nueva ley".split()): - ... printCCGDerivation(parse) # doctest: +SKIP - ... # it fails on python2.7 because of the unicode problem explained in https://github.com/nltk/nltk/pull/1354 - ... 
break - el ministro anunció pero el presidente desmintió la nueva ley - (NP/N) N ((S\NP)/NP) (((S/NP)\(S/NP))/(S/NP)) (NP/N) N ((S\NP)/NP) (NP/N) (N/N) N - ------------------> - NP - ------------------>T - (S/(S\NP)) - --------------------> - NP - -------------------->T - (S/(S\NP)) - --------------------------------->B - (S/NP) - -----------------------------------------------------------> - ((S/NP)\(S/NP)) - ------------> - N - --------------------> - NP - -------------------- - S diff --git a/pipeline/nltk/test/ccg_semantics.doctest b/pipeline/nltk/test/ccg_semantics.doctest deleted file mode 100644 index 5350d58d15f6556473a9d8de990d52fc2c97f796..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/ccg_semantics.doctest +++ /dev/null @@ -1,552 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -============================================== -Combinatory Categorial Grammar with semantics -============================================== - ------ -Chart ------ - - - >>> from nltk.ccg import chart, lexicon - >>> from nltk.ccg.chart import printCCGDerivation - -No semantics -------------------- - - >>> lex = lexicon.fromstring(''' - ... :- S, NP, N - ... She => NP - ... has => (S\\NP)/NP - ... books => NP - ... ''', - ... False) - - >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet) - >>> parses = list(parser.parse("She has books".split())) - >>> print(str(len(parses)) + " parses") - 3 parses - - >>> printCCGDerivation(parses[0]) - She has books - NP ((S\NP)/NP) NP - --------------------> - (S\NP) - -------------------------< - S - - >>> printCCGDerivation(parses[1]) - She has books - NP ((S\NP)/NP) NP - ----->T - (S/(S\NP)) - --------------------> - (S\NP) - -------------------------> - S - - - >>> printCCGDerivation(parses[2]) - She has books - NP ((S\NP)/NP) NP - ----->T - (S/(S\NP)) - ------------------>B - (S/NP) - -------------------------> - S - -Simple semantics -------------------- - - >>> lex = lexicon.fromstring(''' - ... :- S, NP, N - ... She => NP {she} - ... has => (S\\NP)/NP {\\x y.have(y, x)} - ... a => NP/N {\\P.exists z.P(z)} - ... book => N {book} - ... ''', - ... 
True) - - >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet) - >>> parses = list(parser.parse("She has a book".split())) - >>> print(str(len(parses)) + " parses") - 7 parses - - >>> printCCGDerivation(parses[0]) - She has a book - NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book} - -------------------------------------> - NP {exists z.book(z)} - -------------------------------------------------------------------> - (S\NP) {\y.have(y,exists z.book(z))} - -----------------------------------------------------------------------------< - S {have(she,exists z.book(z))} - - >>> printCCGDerivation(parses[1]) - She has a book - NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book} - --------------------------------------------------------->B - ((S\NP)/N) {\P y.have(y,exists z.P(z))} - -------------------------------------------------------------------> - (S\NP) {\y.have(y,exists z.book(z))} - -----------------------------------------------------------------------------< - S {have(she,exists z.book(z))} - - >>> printCCGDerivation(parses[2]) - She has a book - NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book} - ---------->T - (S/(S\NP)) {\F.F(she)} - -------------------------------------> - NP {exists z.book(z)} - -------------------------------------------------------------------> - (S\NP) {\y.have(y,exists z.book(z))} - -----------------------------------------------------------------------------> - S {have(she,exists z.book(z))} - - >>> printCCGDerivation(parses[3]) - She has a book - NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book} - ---------->T - (S/(S\NP)) {\F.F(she)} - --------------------------------------------------------->B - ((S\NP)/N) {\P y.have(y,exists z.P(z))} - -------------------------------------------------------------------> - (S\NP) {\y.have(y,exists z.book(z))} - -----------------------------------------------------------------------------> - S {have(she,exists z.book(z))} - - >>> printCCGDerivation(parses[4]) - She has a book - NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book} - ---------->T - (S/(S\NP)) {\F.F(she)} - ---------------------------------------->B - (S/NP) {\x.have(she,x)} - -------------------------------------> - NP {exists z.book(z)} - -----------------------------------------------------------------------------> - S {have(she,exists z.book(z))} - - >>> printCCGDerivation(parses[5]) - She has a book - NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book} - ---------->T - (S/(S\NP)) {\F.F(she)} - --------------------------------------------------------->B - ((S\NP)/N) {\P y.have(y,exists z.P(z))} - ------------------------------------------------------------------->B - (S/N) {\P.have(she,exists z.P(z))} - -----------------------------------------------------------------------------> - S {have(she,exists z.book(z))} - - >>> printCCGDerivation(parses[6]) - She has a book - NP {she} ((S\NP)/NP) {\x y.have(y,x)} (NP/N) {\P.exists z.P(z)} N {book} - ---------->T - (S/(S\NP)) {\F.F(she)} - ---------------------------------------->B - (S/NP) {\x.have(she,x)} - ------------------------------------------------------------------->B - (S/N) {\P.have(she,exists z.P(z))} - -----------------------------------------------------------------------------> - S {have(she,exists z.book(z))} - -Complex semantics -------------------- - - >>> lex = lexicon.fromstring(''' - ... :- S, NP, N - ... She => NP {she} - ... 
has => (S\\NP)/NP {\\x y.have(y, x)} - ... a => ((S\\NP)\\((S\\NP)/NP))/N {\\P R x.(exists z.P(z) & R(z,x))} - ... book => N {book} - ... ''', - ... True) - - >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet) - >>> parses = list(parser.parse("She has a book".split())) - >>> print(str(len(parses)) + " parses") - 2 parses - - >>> printCCGDerivation(parses[0]) - She has a book - NP {she} ((S\NP)/NP) {\x y.have(y,x)} (((S\NP)\((S\NP)/NP))/N) {\P R x.(exists z.P(z) & R(z,x))} N {book} - ----------------------------------------------------------------------> - ((S\NP)\((S\NP)/NP)) {\R x.(exists z.book(z) & R(z,x))} - ----------------------------------------------------------------------------------------------------< - (S\NP) {\x.(exists z.book(z) & have(x,z))} - --------------------------------------------------------------------------------------------------------------< - S {(exists z.book(z) & have(she,z))} - - >>> printCCGDerivation(parses[1]) - She has a book - NP {she} ((S\NP)/NP) {\x y.have(y,x)} (((S\NP)\((S\NP)/NP))/N) {\P R x.(exists z.P(z) & R(z,x))} N {book} - ---------->T - (S/(S\NP)) {\F.F(she)} - ----------------------------------------------------------------------> - ((S\NP)\((S\NP)/NP)) {\R x.(exists z.book(z) & R(z,x))} - ----------------------------------------------------------------------------------------------------< - (S\NP) {\x.(exists z.book(z) & have(x,z))} - --------------------------------------------------------------------------------------------------------------> - S {(exists z.book(z) & have(she,z))} - -Using conjunctions ---------------------- - - # TODO: The semantics of "and" should have been more flexible - >>> lex = lexicon.fromstring(''' - ... :- S, NP, N - ... I => NP {I} - ... cook => (S\\NP)/NP {\\x y.cook(x,y)} - ... and => var\\.,var/.,var {\\P Q x y.(P(x,y) & Q(x,y))} - ... eat => (S\\NP)/NP {\\x y.eat(x,y)} - ... the => NP/N {\\x.the(x)} - ... bacon => N {bacon} - ... ''', - ... 
True) - - >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet) - >>> parses = list(parser.parse("I cook and eat the bacon".split())) - >>> print(str(len(parses)) + " parses") - 7 parses - - >>> printCCGDerivation(parses[0]) - I cook and eat the bacon - NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon} - -------------------------------------------------------------------------------------> - (((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))} - -------------------------------------------------------------------------------------------------------------------< - ((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))} - -------------------------------> - NP {the(bacon)} - --------------------------------------------------------------------------------------------------------------------------------------------------> - (S\NP) {\y.(eat(the(bacon),y) & cook(the(bacon),y))} - ----------------------------------------------------------------------------------------------------------------------------------------------------------< - S {(eat(the(bacon),I) & cook(the(bacon),I))} - - >>> printCCGDerivation(parses[1]) - I cook and eat the bacon - NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon} - -------------------------------------------------------------------------------------> - (((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))} - -------------------------------------------------------------------------------------------------------------------< - ((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))} - --------------------------------------------------------------------------------------------------------------------------------------->B - ((S\NP)/N) {\x y.(eat(the(x),y) & cook(the(x),y))} - --------------------------------------------------------------------------------------------------------------------------------------------------> - (S\NP) {\y.(eat(the(bacon),y) & cook(the(bacon),y))} - ----------------------------------------------------------------------------------------------------------------------------------------------------------< - S {(eat(the(bacon),I) & cook(the(bacon),I))} - - >>> printCCGDerivation(parses[2]) - I cook and eat the bacon - NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon} - -------->T - (S/(S\NP)) {\F.F(I)} - -------------------------------------------------------------------------------------> - (((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))} - -------------------------------------------------------------------------------------------------------------------< - ((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))} - -------------------------------> - NP {the(bacon)} - --------------------------------------------------------------------------------------------------------------------------------------------------> - (S\NP) {\y.(eat(the(bacon),y) & cook(the(bacon),y))} - ----------------------------------------------------------------------------------------------------------------------------------------------------------> - S {(eat(the(bacon),I) & cook(the(bacon),I))} - - >>> printCCGDerivation(parses[3]) - I cook and eat the bacon - NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon} - 
-------->T - (S/(S\NP)) {\F.F(I)} - -------------------------------------------------------------------------------------> - (((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))} - -------------------------------------------------------------------------------------------------------------------< - ((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))} - --------------------------------------------------------------------------------------------------------------------------------------->B - ((S\NP)/N) {\x y.(eat(the(x),y) & cook(the(x),y))} - --------------------------------------------------------------------------------------------------------------------------------------------------> - (S\NP) {\y.(eat(the(bacon),y) & cook(the(bacon),y))} - ----------------------------------------------------------------------------------------------------------------------------------------------------------> - S {(eat(the(bacon),I) & cook(the(bacon),I))} - - >>> printCCGDerivation(parses[4]) - I cook and eat the bacon - NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon} - -------->T - (S/(S\NP)) {\F.F(I)} - -------------------------------------------------------------------------------------> - (((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))} - -------------------------------------------------------------------------------------------------------------------< - ((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))} - --------------------------------------------------------------------------------------------------------------------------->B - (S/NP) {\x.(eat(x,I) & cook(x,I))} - -------------------------------> - NP {the(bacon)} - ----------------------------------------------------------------------------------------------------------------------------------------------------------> - S {(eat(the(bacon),I) & cook(the(bacon),I))} - - >>> printCCGDerivation(parses[5]) - I cook and eat the bacon - NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon} - -------->T - (S/(S\NP)) {\F.F(I)} - -------------------------------------------------------------------------------------> - (((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))} - -------------------------------------------------------------------------------------------------------------------< - ((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))} - --------------------------------------------------------------------------------------------------------------------------------------->B - ((S\NP)/N) {\x y.(eat(the(x),y) & cook(the(x),y))} - ----------------------------------------------------------------------------------------------------------------------------------------------->B - (S/N) {\x.(eat(the(x),I) & cook(the(x),I))} - ----------------------------------------------------------------------------------------------------------------------------------------------------------> - S {(eat(the(bacon),I) & cook(the(bacon),I))} - - >>> printCCGDerivation(parses[6]) - I cook and eat the bacon - NP {I} ((S\NP)/NP) {\x y.cook(x,y)} ((_var0\.,_var0)/.,_var0) {\P Q x y.(P(x,y) & Q(x,y))} ((S\NP)/NP) {\x y.eat(x,y)} (NP/N) {\x.the(x)} N {bacon} - -------->T - (S/(S\NP)) {\F.F(I)} - -------------------------------------------------------------------------------------> - (((S\NP)/NP)\.,((S\NP)/NP)) {\Q x y.(eat(x,y) & Q(x,y))} - 
-------------------------------------------------------------------------------------------------------------------< - ((S\NP)/NP) {\x y.(eat(x,y) & cook(x,y))} - --------------------------------------------------------------------------------------------------------------------------->B - (S/NP) {\x.(eat(x,I) & cook(x,I))} - ----------------------------------------------------------------------------------------------------------------------------------------------->B - (S/N) {\x.(eat(the(x),I) & cook(the(x),I))} - ----------------------------------------------------------------------------------------------------------------------------------------------------------> - S {(eat(the(bacon),I) & cook(the(bacon),I))} - -Tests from published papers ------------------------------- - -An example from "CCGbank: A Corpus of CCG Derivations and Dependency Structures Extracted from the Penn Treebank", Hockenmaier and Steedman, 2007, Page 359, https://www.aclweb.org/anthology/J/J07/J07-3004.pdf - - >>> lex = lexicon.fromstring(''' - ... :- S, NP - ... I => NP {I} - ... give => ((S\\NP)/NP)/NP {\\x y z.give(y,x,z)} - ... them => NP {them} - ... money => NP {money} - ... ''', - ... True) - - >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet) - >>> parses = list(parser.parse("I give them money".split())) - >>> print(str(len(parses)) + " parses") - 3 parses - - >>> printCCGDerivation(parses[0]) - I give them money - NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them} NP {money} - --------------------------------------------------> - ((S\NP)/NP) {\y z.give(y,them,z)} - --------------------------------------------------------------> - (S\NP) {\z.give(money,them,z)} - ----------------------------------------------------------------------< - S {give(money,them,I)} - - >>> printCCGDerivation(parses[1]) - I give them money - NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them} NP {money} - -------->T - (S/(S\NP)) {\F.F(I)} - --------------------------------------------------> - ((S\NP)/NP) {\y z.give(y,them,z)} - --------------------------------------------------------------> - (S\NP) {\z.give(money,them,z)} - ----------------------------------------------------------------------> - S {give(money,them,I)} - - - >>> printCCGDerivation(parses[2]) - I give them money - NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them} NP {money} - -------->T - (S/(S\NP)) {\F.F(I)} - --------------------------------------------------> - ((S\NP)/NP) {\y z.give(y,them,z)} - ---------------------------------------------------------->B - (S/NP) {\y.give(y,them,I)} - ----------------------------------------------------------------------> - S {give(money,them,I)} - - -An example from "CCGbank: A Corpus of CCG Derivations and Dependency Structures Extracted from the Penn Treebank", Hockenmaier and Steedman, 2007, Page 359, https://www.aclweb.org/anthology/J/J07/J07-3004.pdf - - >>> lex = lexicon.fromstring(''' - ... :- N, NP, S - ... money => N {money} - ... that => (N\\N)/(S/NP) {\\P Q x.(P(x) & Q(x))} - ... I => NP {I} - ... give => ((S\\NP)/NP)/NP {\\x y z.give(y,x,z)} - ... them => NP {them} - ... ''', - ... 
True) - - >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet) - >>> parses = list(parser.parse("money that I give them".split())) - >>> print(str(len(parses)) + " parses") - 3 parses - - >>> printCCGDerivation(parses[0]) - money that I give them - N {money} ((N\N)/(S/NP)) {\P Q x.(P(x) & Q(x))} NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them} - -------->T - (S/(S\NP)) {\F.F(I)} - --------------------------------------------------> - ((S\NP)/NP) {\y z.give(y,them,z)} - ---------------------------------------------------------->B - (S/NP) {\y.give(y,them,I)} - -------------------------------------------------------------------------------------------------> - (N\N) {\Q x.(give(x,them,I) & Q(x))} - ------------------------------------------------------------------------------------------------------------< - N {\x.(give(x,them,I) & money(x))} - - >>> printCCGDerivation(parses[1]) - money that I give them - N {money} ((N\N)/(S/NP)) {\P Q x.(P(x) & Q(x))} NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them} - ----------->T - (N/(N\N)) {\F.F(money)} - -------->T - (S/(S\NP)) {\F.F(I)} - --------------------------------------------------> - ((S\NP)/NP) {\y z.give(y,them,z)} - ---------------------------------------------------------->B - (S/NP) {\y.give(y,them,I)} - -------------------------------------------------------------------------------------------------> - (N\N) {\Q x.(give(x,them,I) & Q(x))} - ------------------------------------------------------------------------------------------------------------> - N {\x.(give(x,them,I) & money(x))} - - >>> printCCGDerivation(parses[2]) - money that I give them - N {money} ((N\N)/(S/NP)) {\P Q x.(P(x) & Q(x))} NP {I} (((S\NP)/NP)/NP) {\x y z.give(y,x,z)} NP {them} - ----------->T - (N/(N\N)) {\F.F(money)} - -------------------------------------------------->B - (N/(S/NP)) {\P x.(P(x) & money(x))} - -------->T - (S/(S\NP)) {\F.F(I)} - --------------------------------------------------> - ((S\NP)/NP) {\y z.give(y,them,z)} - ---------------------------------------------------------->B - (S/NP) {\y.give(y,them,I)} - ------------------------------------------------------------------------------------------------------------> - N {\x.(give(x,them,I) & money(x))} - - -------- -Lexicon -------- - - >>> from nltk.ccg import lexicon - -Parse lexicon with semantics - - >>> print(str(lexicon.fromstring( - ... ''' - ... :- S,NP - ... - ... IntransVsg :: S\\NP[sg] - ... - ... sleeps => IntransVsg {\\x.sleep(x)} - ... eats => S\\NP[sg]/NP {\\x y.eat(x,y)} - ... - ... and => var\\var/var {\\x y.x & y} - ... ''', - ... True - ... ))) - and => ((_var0\_var0)/_var0) {(\x y.x & y)} - eats => ((S\NP['sg'])/NP) {\x y.eat(x,y)} - sleeps => (S\NP['sg']) {\x.sleep(x)} - -Parse lexicon without semantics - - >>> print(str(lexicon.fromstring( - ... ''' - ... :- S,NP - ... - ... IntransVsg :: S\\NP[sg] - ... - ... sleeps => IntransVsg - ... eats => S\\NP[sg]/NP {sem=\\x y.eat(x,y)} - ... - ... and => var\\var/var - ... ''', - ... False - ... ))) - and => ((_var0\_var0)/_var0) - eats => ((S\NP['sg'])/NP) - sleeps => (S\NP['sg']) - -Semantics are missing - - >>> print(str(lexicon.fromstring( - ... ''' - ... :- S,NP - ... - ... eats => S\\NP[sg]/NP - ... ''', - ... True - ... ))) - Traceback (most recent call last): - ... 
- AssertionError: eats => S\NP[sg]/NP must contain semantics because include_semantics is set to True - - ------------------------------------- -CCG combinator semantics computation ------------------------------------- - - >>> from nltk.sem.logic import * - >>> from nltk.ccg.logic import * - - >>> read_expr = Expression.fromstring - -Compute semantics from function application - - >>> print(str(compute_function_semantics(read_expr(r'\x.P(x)'), read_expr(r'book')))) - P(book) - - >>> print(str(compute_function_semantics(read_expr(r'\P.P(book)'), read_expr(r'read')))) - read(book) - - >>> print(str(compute_function_semantics(read_expr(r'\P.P(book)'), read_expr(r'\x.read(x)')))) - read(book) - -Compute semantics from composition - - >>> print(str(compute_composition_semantics(read_expr(r'\x.P(x)'), read_expr(r'\x.Q(x)')))) - \x.P(Q(x)) - - >>> print(str(compute_composition_semantics(read_expr(r'\x.P(x)'), read_expr(r'read')))) - Traceback (most recent call last): - ... - AssertionError: `read` must be a lambda expression - -Compute semantics from substitution - - >>> print(str(compute_substitution_semantics(read_expr(r'\x y.P(x,y)'), read_expr(r'\x.Q(x)')))) - \x.P(x,Q(x)) - - >>> print(str(compute_substitution_semantics(read_expr(r'\x.P(x)'), read_expr(r'read')))) - Traceback (most recent call last): - ... - AssertionError: `\x.P(x)` must be a lambda expression with 2 arguments - -Compute type-raise semantics - - >>> print(str(compute_type_raised_semantics(read_expr(r'\x.P(x)')))) - \F x.F(P(x)) - - >>> print(str(compute_type_raised_semantics(read_expr(r'\x.F(x)')))) - \F1 x.F1(F(x)) - - >>> print(str(compute_type_raised_semantics(read_expr(r'\x y z.P(x,y,z)')))) - \F x y z.F(P(x,y,z)) diff --git a/pipeline/nltk/test/chat80.doctest b/pipeline/nltk/test/chat80.doctest deleted file mode 100644 index b17a95fb254208823711bb8285c48060a2a6ce3e..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/chat80.doctest +++ /dev/null @@ -1,232 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -======= -Chat-80 -======= - -Chat-80 was a natural language system which allowed the user to -interrogate a Prolog knowledge base in the domain of world -geography. It was developed in the early '80s by Warren and Pereira; see -``_ for a description and -``_ for the source -files. - -The ``chat80`` module contains functions to extract data from the Chat-80 -relation files ('the world database'), and convert then into a format -that can be incorporated in the FOL models of -``nltk.sem.evaluate``. The code assumes that the Prolog -input files are available in the NLTK corpora directory. - -The Chat-80 World Database consists of the following files:: - - world0.pl - rivers.pl - cities.pl - countries.pl - contain.pl - borders.pl - -This module uses a slightly modified version of ``world0.pl``, in which -a set of Prolog rules have been omitted. The modified file is named -``world1.pl``. Currently, the file ``rivers.pl`` is not read in, since -it uses a list rather than a string in the second field. - -Reading Chat-80 Files -===================== - -Chat-80 relations are like tables in a relational database. The -relation acts as the name of the table; the first argument acts as the -'primary key'; and subsequent arguments are further fields in the -table. In general, the name of the table provides a label for a unary -predicate whose extension is all the primary keys. 
For example, -relations in ``cities.pl`` are of the following form:: - - 'city(athens,greece,1368).' - -Here, ``'athens'`` is the key, and will be mapped to a member of the -unary predicate *city*. - -By analogy with NLTK corpora, ``chat80`` defines a number of 'items' -which correspond to the relations. - - >>> from nltk.sem import chat80 - >>> print(chat80.items) - ('borders', 'circle_of_lat', 'circle_of_long', 'city', ...) - -The fields in the table are mapped to binary predicates. The first -argument of the predicate is the primary key, while the second -argument is the data in the relevant field. Thus, in the above -example, the third field is mapped to the binary predicate -*population_of*, whose extension is a set of pairs such as -``'(athens, 1368)'``. - -An exception to this general framework is required by the relations in -the files ``borders.pl`` and ``contains.pl``. These contain facts of the -following form:: - - 'borders(albania,greece).' - - 'contains0(africa,central_africa).' - -We do not want to form a unary concept out the element in -the first field of these records, and we want the label of the binary -relation just to be ``'border'``/``'contain'`` respectively. - -In order to drive the extraction process, we use 'relation metadata bundles' -which are Python dictionaries such as the following:: - - city = {'label': 'city', - 'closures': [], - 'schema': ['city', 'country', 'population'], - 'filename': 'cities.pl'} - -According to this, the file ``city['filename']`` contains a list of -relational tuples (or more accurately, the corresponding strings in -Prolog form) whose predicate symbol is ``city['label']`` and whose -relational schema is ``city['schema']``. The notion of a ``closure`` is -discussed in the next section. - -Concepts -======== -In order to encapsulate the results of the extraction, a class of -``Concept``\ s is introduced. A ``Concept`` object has a number of -attributes, in particular a ``prefLabel``, an arity and ``extension``. - - >>> c1 = chat80.Concept('dog', arity=1, extension=set(['d1', 'd2'])) - >>> print(c1) - Label = 'dog' - Arity = 1 - Extension = ['d1', 'd2'] - - - -The ``extension`` attribute makes it easier to inspect the output of -the extraction. - - >>> schema = ['city', 'country', 'population'] - >>> concepts = chat80.clause2concepts('cities.pl', 'city', schema) - >>> concepts - [Concept('city'), Concept('country_of'), Concept('population_of')] - >>> for c in concepts: - ... print("%s:\n\t%s" % (c.prefLabel, c.extension[:4])) - city: - ['athens', 'bangkok', 'barcelona', 'berlin'] - country_of: - [('athens', 'greece'), ('bangkok', 'thailand'), ('barcelona', 'spain'), ('berlin', 'east_germany')] - population_of: - [('athens', '1368'), ('bangkok', '1178'), ('barcelona', '1280'), ('berlin', '3481')] - -In addition, the ``extension`` can be further -processed: in the case of the ``'border'`` relation, we check that the -relation is **symmetric**, and in the case of the ``'contain'`` -relation, we carry out the **transitive closure**. The closure -properties associated with a concept is indicated in the relation -metadata, as indicated earlier. 
- - >>> borders = set([('a1', 'a2'), ('a2', 'a3')]) - >>> c2 = chat80.Concept('borders', arity=2, extension=borders) - >>> print(c2) - Label = 'borders' - Arity = 2 - Extension = [('a1', 'a2'), ('a2', 'a3')] - >>> c3 = chat80.Concept('borders', arity=2, closures=['symmetric'], extension=borders) - >>> c3.close() - >>> print(c3) - Label = 'borders' - Arity = 2 - Extension = [('a1', 'a2'), ('a2', 'a1'), ('a2', 'a3'), ('a3', 'a2')] - -The ``extension`` of a ``Concept`` object is then incorporated into a -``Valuation`` object. - -Persistence -=========== -The functions ``val_dump`` and ``val_load`` are provided to allow a -valuation to be stored in a persistent database and re-loaded, rather -than having to be re-computed each time. - -Individuals and Lexical Items -============================= -As well as deriving relations from the Chat-80 data, we also create a -set of individual constants, one for each entity in the domain. The -individual constants are string-identical to the entities. For -example, given a data item such as ``'zloty'``, we add to the valuation -a pair ``('zloty', 'zloty')``. In order to parse English sentences that -refer to these entities, we also create a lexical item such as the -following for each individual constant:: - - PropN[num=sg, sem=<\P.(P zloty)>] -> 'Zloty' - -The set of rules is written to the file ``chat_pnames.fcfg`` in the -current directory. - -SQL Query -========= - -The ``city`` relation is also available in RDB form and can be queried -using SQL statements. - - >>> import nltk - >>> q = "SELECT City, Population FROM city_table WHERE Country = 'china' and Population > 1000" - >>> for answer in chat80.sql_query('corpora/city_database/city.db', q): - ... print("%-10s %4s" % answer) - canton 1496 - chungking 1100 - mukden 1551 - peking 2031 - shanghai 5407 - tientsin 1795 - -The (deliberately naive) grammar ``sql.fcfg`` translates from English -to SQL: - - >>> nltk.data.show_cfg('grammars/book_grammars/sql0.fcfg') - % start S - S[SEM=(?np + WHERE + ?vp)] -> NP[SEM=?np] VP[SEM=?vp] - VP[SEM=(?v + ?pp)] -> IV[SEM=?v] PP[SEM=?pp] - VP[SEM=(?v + ?ap)] -> IV[SEM=?v] AP[SEM=?ap] - NP[SEM=(?det + ?n)] -> Det[SEM=?det] N[SEM=?n] - PP[SEM=(?p + ?np)] -> P[SEM=?p] NP[SEM=?np] - AP[SEM=?pp] -> A[SEM=?a] PP[SEM=?pp] - NP[SEM='Country="greece"'] -> 'Greece' - NP[SEM='Country="china"'] -> 'China' - Det[SEM='SELECT'] -> 'Which' | 'What' - N[SEM='City FROM city_table'] -> 'cities' - IV[SEM=''] -> 'are' - A[SEM=''] -> 'located' - P[SEM=''] -> 'in' - -Given this grammar, we can express, and then execute, queries in English. - - >>> cp = nltk.parse.load_parser('grammars/book_grammars/sql0.fcfg') - >>> query = 'What cities are in China' - >>> for tree in cp.parse(query.split()): - ... answer = tree.label()['SEM'] - ... q = " ".join(answer) - ... print(q) - ... - SELECT City FROM city_table WHERE Country="china" - - >>> rows = chat80.sql_query('corpora/city_database/city.db', q) - >>> for r in rows: print("%s" % r, end=' ') - canton chungking dairen harbin kowloon mukden peking shanghai sian tientsin - - -Using Valuations ------------------ - -In order to convert such an extension into a valuation, we use the -``make_valuation()`` method; setting ``read=True`` creates and returns -a new ``Valuation`` object which contains the results. 
- - >>> val = chat80.make_valuation(concepts, read=True) - >>> 'calcutta' in val['city'] - True - >>> [town for (town, country) in val['country_of'] if country == 'india'] - ['bombay', 'calcutta', 'delhi', 'hyderabad', 'madras'] - >>> dom = val.domain - >>> g = nltk.sem.Assignment(dom) - >>> m = nltk.sem.Model(dom, val) - >>> m.evaluate(r'population_of(jakarta, 533)', g) - True diff --git a/pipeline/nltk/test/childes.doctest b/pipeline/nltk/test/childes.doctest deleted file mode 100644 index d2e1195b1128f97df83e3c7e5ba386583d15242f..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/childes.doctest +++ /dev/null @@ -1,190 +0,0 @@ -======================= - CHILDES Corpus Readers -======================= - -Read the XML version of the CHILDES corpus. - -Setup -===== - - >>> from nltk.test.childes_fixt import setup_module - >>> setup_module() - -How to use CHILDESCorpusReader -============================== - -Read the CHILDESCorpusReader class and read the CHILDES corpus saved in -the nltk_data directory. - - >>> import nltk - >>> from nltk.corpus.reader import CHILDESCorpusReader - >>> corpus_root = nltk.data.find('corpora/childes/data-xml/Eng-USA-MOR/') - -Reading files in the Valian corpus (Valian, 1991). - - >>> valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml') - >>> valian.fileids() - ['Valian/01a.xml', 'Valian/01b.xml', 'Valian/02a.xml', 'Valian/02b.xml',... - -Count the number of files - - >>> len(valian.fileids()) - 43 - -Printing properties of the corpus files. - - >>> corpus_data = valian.corpus(valian.fileids()) - >>> print(corpus_data[0]['Lang']) - eng - >>> for key in sorted(corpus_data[0].keys()): - ... print(key, ": ", corpus_data[0][key]) - Corpus : valian - Date : 1986-03-04 - Id : 01a - Lang : eng - Version : 2.0.1 - {http://www.w3.org/2001/XMLSchema-instance}schemaLocation : http://www.talkbank.org/ns/talkbank http://talkbank.org/software/talkbank.xsd - -Printing information of participants of the corpus. The most common codes for -the participants are 'CHI' (target child), 'MOT' (mother), and 'INV' (investigator). - - >>> corpus_participants = valian.participants(valian.fileids()) - >>> for this_corpus_participants in corpus_participants[:2]: - ... for key in sorted(this_corpus_participants.keys()): - ... dct = this_corpus_participants[key] - ... print(key, ": ", [(k, dct[k]) for k in sorted(dct.keys())]) - CHI : [('age', 'P2Y1M3D'), ('group', 'normal'), ('id', 'CHI'), ('language', 'eng'), ('role', 'Target_Child'), ('sex', 'female')] - INV : [('id', 'INV'), ('language', 'eng'), ('role', 'Investigator')] - MOT : [('id', 'MOT'), ('language', 'eng'), ('role', 'Mother')] - CHI : [('age', 'P2Y1M12D'), ('group', 'normal'), ('id', 'CHI'), ('language', 'eng'), ('role', 'Target_Child'), ('sex', 'female')] - INV : [('id', 'INV'), ('language', 'eng'), ('role', 'Investigator')] - MOT : [('id', 'MOT'), ('language', 'eng'), ('role', 'Mother')] - -printing words. - - >>> valian.words('Valian/01a.xml') - ['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', ... - -printing sentences. - - >>> valian.sents('Valian/01a.xml') - [['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', - 'and', 'it', 'is', 'March', 'fourth', 'I', 'believe', 'and', 'when', - 'was', "Parent's", 'birthday'], ["Child's"], ['oh', "I'm", 'sorry'], - ["that's", 'okay'], ... - -You can specify the participants with the argument *speaker*. - - >>> valian.words('Valian/01a.xml',speaker=['INV']) - ['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', ... 
- >>> valian.words('Valian/01a.xml',speaker=['MOT']) - ["Child's", "that's", 'okay', 'February', 'first', 'nineteen', ... - >>> valian.words('Valian/01a.xml',speaker=['CHI']) - ['tape', 'it', 'up', 'and', 'two', 'tape', 'players', 'have',... - - -tagged_words() and tagged_sents() return the usual (word,pos) tuple lists. -POS tags in the CHILDES are automatically assigned by MOR and POST programs -(MacWhinney, 2000). - - >>> valian.tagged_words('Valian/01a.xml')[:30] - [('at', 'prep'), ('Parent', 'n:prop'), ("Lastname's", 'n:prop'), ('house', 'n'), - ('with', 'prep'), ('Child', 'n:prop'), ('Lastname', 'n:prop'), ('and', 'coord'), - ('it', 'pro'), ('is', 'v:cop'), ('March', 'n:prop'), ('fourth', 'adj'), - ('I', 'pro:sub'), ('believe', 'v'), ('and', 'coord'), ('when', 'adv:wh'), - ('was', 'v:cop'), ("Parent's", 'n:prop'), ('birthday', 'n'), ("Child's", 'n:prop'), - ('oh', 'co'), ("I'm", 'pro:sub'), ('sorry', 'adj'), ("that's", 'pro:dem'), - ('okay', 'adj'), ('February', 'n:prop'), ('first', 'adj'), - ('nineteen', 'det:num'), ('eighty', 'det:num'), ('four', 'det:num')] - - >>> valian.tagged_sents('Valian/01a.xml')[:10] - [[('at', 'prep'), ('Parent', 'n:prop'), ("Lastname's", 'n:prop'), ('house', 'n'), - ('with', 'prep'), ('Child', 'n:prop'), ('Lastname', 'n:prop'), ('and', 'coord'), - ('it', 'pro'), ('is', 'v:cop'), ('March', 'n:prop'), ('fourth', 'adj'), - ('I', 'pro:sub'), ('believe', 'v'), ('and', 'coord'), ('when', 'adv:wh'), - ('was', 'v:cop'), ("Parent's", 'n:prop'), ('birthday', 'n')], - [("Child's", 'n:prop')], [('oh', 'co'), ("I'm", 'pro:sub'), ('sorry', 'adj')], - [("that's", 'pro:dem'), ('okay', 'adj')], - [('February', 'n:prop'), ('first', 'adj'), ('nineteen', 'det:num'), - ('eighty', 'det:num'), ('four', 'det:num')], - [('great', 'adj')], - [('and', 'coord'), ("she's", 'pro:sub'), ('two', 'det:num'), ('years', 'n'), ('old', 'adj')], - [('correct', 'adj')], - [('okay', 'co')], [('she', 'pro:sub'), ('just', 'adv:int'), ('turned', 'part'), ('two', 'det:num'), - ('a', 'det'), ('month', 'n'), ('ago', 'adv')]] - -When the argument *stem* is true, the word stems (e.g., 'is' -> 'be-3PS') are -used instead of the original words. - - >>> valian.words('Valian/01a.xml')[:30] - ['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', 'and', 'it', 'is', ... - >>> valian.words('Valian/01a.xml',stem=True)[:30] - ['at', 'Parent', 'Lastname', 's', 'house', 'with', 'Child', 'Lastname', 'and', 'it', 'be-3S', ... - -When the argument *replace* is true, the replaced words are used instead of -the original words. - - >>> valian.words('Valian/01a.xml',speaker='CHI')[247] - 'tikteat' - >>> valian.words('Valian/01a.xml',speaker='CHI',replace=True)[247] - 'trick' - -When the argument *relation* is true, the relational relationships in the -sentence are returned. See Sagae et al. (2010) for details of the relational -structure adopted in the CHILDES. 
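The deleted childes.doctest above documents the CHILDESCorpusReader interface (fileids, words, sents, the speaker/stem/replace keyword arguments and MOR POS tags). Below is a minimal, hedged sketch of those calls, assuming the CHILDES Eng-USA-MOR XML data has been manually unpacked under nltk_data as the doctest's setup fixture requires; the corpus path and Valian file pattern are taken from the doctest itself.

    import nltk
    from nltk.corpus.reader import CHILDESCorpusReader

    # Assumes the CHILDES XML corpus was downloaded manually and unpacked under
    # nltk_data (corpora/childes/...), as required by the doctest's setup fixture.
    corpus_root = nltk.data.find('corpora/childes/data-xml/Eng-USA-MOR/')
    valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml')

    print(len(valian.fileids()))                                 # number of transcript files
    print(valian.words('Valian/01a.xml')[:10])                   # raw word tokens
    print(valian.words('Valian/01a.xml', speaker=['CHI'])[:10])  # target-child utterances only
    print(valian.tagged_words('Valian/01a.xml')[:5])             # (word, MOR POS tag) pairs
    print(valian.words('Valian/01a.xml', stem=True)[:10])        # stems, e.g. 'is' -> 'be-3S'
    print(valian.words('Valian/01a.xml', replace=True)[:10])     # replaced word forms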
- - >>> valian.words('Valian/01a.xml',relation=True)[:10] - [[('at', 'prep', '1|0|ROOT'), ('Parent', 'n', '2|5|VOC'), ('Lastname', 'n', '3|5|MOD'), ('s', 'poss', '4|5|MOD'), ('house', 'n', '5|1|POBJ'), ('with', 'prep', '6|1|JCT'), ('Child', 'n', '7|8|NAME'), ('Lastname', 'n', '8|6|POBJ'), ('and', 'coord', '9|8|COORD'), ('it', 'pro', '10|11|SUBJ'), ('be-3S', 'v', '11|9|COMP'), ('March', 'n', '12|11|PRED'), ('fourth', 'adj', '13|12|MOD'), ('I', 'pro', '15|16|SUBJ'), ('believe', 'v', '16|14|ROOT'), ('and', 'coord', '18|17|ROOT'), ('when', 'adv', '19|20|PRED'), ('be-PAST', 'v', '20|18|COMP'), ('Parent', 'n', '21|23|MOD'), ('s', 'poss', '22|23|MOD'), ('birth', 'n', '23|20|SUBJ')], [('Child', 'n', '1|2|MOD'), ('s', 'poss', '2|0|ROOT')], [('oh', 'co', '1|4|COM'), ('I', 'pro', '3|4|SUBJ'), ('be', 'v', '4|0|ROOT'), ('sorry', 'adj', '5|4|PRED')], [('that', 'pro', '1|2|SUBJ'), ('be', 'v', '2|0|ROOT'), ('okay', 'adj', '3|2|PRED')], [('February', 'n', '1|6|VOC'), ('first', 'adj', '2|6|ENUM'), ('nineteen', 'det', '4|6|ENUM'), ('eighty', 'det', '5|6|ENUM'), ('four', 'det', '6|0|ROOT')], [('great', 'adj', '1|0|ROOT')], [('and', 'coord', '1|0|ROOT'), ('she', 'pro', '2|1|ROOT'), ('be', 'aux', '3|5|AUX'), ('two', 'det', '4|5|QUANT'), ('year-PL', 'n', '5|2|ROOT'), ('old', 'adj', '6|5|MOD')], [('correct', 'adj', '1|0|ROOT')], [('okay', 'co', '1|0|ROOT')], [('she', 'pro', '1|0|ROOT'), ('just', 'adv', '2|3|JCT'), ('turn-PERF', 'part', '3|1|XCOMP'), ('two', 'det', '4|6|QUANT'), ('a', 'det', '5|6|DET'), ('month', 'n', '6|3|OBJ'), ('ago', 'adv', '7|3|JCT')]] - -Printing age. When the argument *month* is true, the age information in -the CHILDES format is converted into the number of months. - - >>> valian.age() - ['P2Y1M3D', 'P2Y1M12D', 'P1Y9M21D', 'P1Y9M28D', 'P2Y1M23D', ... - >>> valian.age('Valian/01a.xml') - ['P2Y1M3D'] - >>> valian.age('Valian/01a.xml',month=True) - [25] - -Printing MLU. The criteria for the MLU computation is broadly based on -Brown (1973). - - >>> valian.MLU() - [2.3574660633484..., 2.292682926829..., 3.492857142857..., 2.961783439490..., - 2.0842696629213..., 3.169811320754..., 3.137404580152..., 3.0578034682080..., - 4.090163934426..., 3.488372093023..., 2.8773584905660..., 3.4792899408284..., - 4.0111940298507..., 3.456790123456..., 4.487603305785..., 4.007936507936..., - 5.25, 5.154696132596..., ...] - - >>> valian.MLU('Valian/01a.xml') - [2.35746606334...] - - -Basic stuff -============================== - -Count the number of words and sentences of each file. - - >>> valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml') - >>> for this_file in valian.fileids()[:6]: - ... print(valian.corpus(this_file)[0]['Corpus'], valian.corpus(this_file)[0]['Id']) - ... print("num of words: %i" % len(valian.words(this_file))) - ... 
print("num of sents: %i" % len(valian.sents(this_file))) - valian 01a - num of words: 3606 - num of sents: 1027 - valian 01b - num of words: 4376 - num of sents: 1274 - valian 02a - num of words: 2673 - num of sents: 801 - valian 02b - num of words: 5020 - num of sents: 1583 - valian 03a - num of words: 2743 - num of sents: 988 - valian 03b - num of words: 4409 - num of sents: 1397 diff --git a/pipeline/nltk/test/childes_fixt.py b/pipeline/nltk/test/childes_fixt.py deleted file mode 100644 index 0c3b84fd5f089d55e30f10f8233ec7ce2cb5f1b7..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/childes_fixt.py +++ /dev/null @@ -1,13 +0,0 @@ -def setup_module(): - import pytest - - import nltk.data - - try: - nltk.data.find("corpora/childes/data-xml/Eng-USA-MOR/") - except LookupError as e: - pytest.skip( - "The CHILDES corpus is not found. " - "It should be manually downloaded and saved/unpacked " - "to [NLTK_Data_Dir]/corpora/childes/" - ) diff --git a/pipeline/nltk/test/chunk.doctest b/pipeline/nltk/test/chunk.doctest deleted file mode 100644 index d67824cb378d99ffaf647bbc7cfc0e481f2cc26b..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/chunk.doctest +++ /dev/null @@ -1,372 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -========== - Chunking -========== - - >>> from nltk.chunk import * - >>> from nltk.chunk.util import * - >>> from nltk.chunk.regexp import * - >>> from nltk import Tree - - >>> tagged_text = "[ The/DT cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] [ the/DT dog/NN ] chewed/VBD ./." - >>> gold_chunked_text = tagstr2tree(tagged_text) - >>> unchunked_text = gold_chunked_text.flatten() - -Chunking uses a special regexp syntax for rules that delimit the chunks. These -rules must be converted to 'regular' regular expressions before a sentence can -be chunked. - - >>> tag_pattern = "
    ?*" - >>> regexp_pattern = tag_pattern2re_pattern(tag_pattern) - >>> regexp_pattern - '(<(DT)>)?(<(JJ)>)*(<(NN[^\\{\\}<>]*)>)' - -Construct some new chunking rules. - - >>> chunk_rule = ChunkRule(r"<.*>+", "Chunk everything") - >>> strip_rule = StripRule(r"", "Strip on verbs/prepositions") - >>> split_rule = SplitRule("
    ", "
    ", - ... "Split successive determiner/noun pairs") - - -Create and score a series of chunk parsers, successively more complex. - - >>> chunk_parser = RegexpChunkParser([chunk_rule], chunk_label='NP') - >>> chunked_text = chunk_parser.parse(unchunked_text) - >>> print(chunked_text) - (S - (NP - The/DT - cat/NN - sat/VBD - on/IN - the/DT - mat/NN - the/DT - dog/NN - chewed/VBD - ./.)) - - >>> chunkscore = ChunkScore() - >>> chunkscore.score(gold_chunked_text, chunked_text) - >>> print(chunkscore.precision()) - 0.0 - - >>> print(chunkscore.recall()) - 0.0 - - >>> print(chunkscore.f_measure()) - 0 - - >>> for chunk in sorted(chunkscore.missed()): print(chunk) - (NP The/DT cat/NN) - (NP the/DT dog/NN) - (NP the/DT mat/NN) - - >>> for chunk in chunkscore.incorrect(): print(chunk) - (NP - The/DT - cat/NN - sat/VBD - on/IN - the/DT - mat/NN - the/DT - dog/NN - chewed/VBD - ./.) - - >>> chunk_parser = RegexpChunkParser([chunk_rule, strip_rule], - ... chunk_label='NP') - >>> chunked_text = chunk_parser.parse(unchunked_text) - >>> print(chunked_text) - (S - (NP The/DT cat/NN) - sat/VBD - on/IN - (NP the/DT mat/NN the/DT dog/NN) - chewed/VBD - ./.) - >>> assert chunked_text == chunk_parser.parse(list(unchunked_text)) - - >>> chunkscore = ChunkScore() - >>> chunkscore.score(gold_chunked_text, chunked_text) - >>> chunkscore.precision() - 0.5 - - >>> print(chunkscore.recall()) - 0.33333333... - - >>> print(chunkscore.f_measure()) - 0.4 - - >>> for chunk in sorted(chunkscore.missed()): print(chunk) - (NP the/DT dog/NN) - (NP the/DT mat/NN) - - >>> for chunk in chunkscore.incorrect(): print(chunk) - (NP the/DT mat/NN the/DT dog/NN) - - >>> chunk_parser = RegexpChunkParser([chunk_rule, strip_rule, split_rule], - ... chunk_label='NP') - >>> chunked_text = chunk_parser.parse(unchunked_text, trace=True) - # Input: -
<DT>  <NN>  <VBD>  <IN>  <DT>  <NN>  <DT>  <NN>  <VBD>  <.> - # Chunk everything: - {<DT>  <NN>  <VBD>  <IN>  <DT>  <NN>  <DT>  <NN>  <VBD>  <.>} - # Strip on verbs/prepositions: - {<DT>  <NN>} <VBD>  <IN> {<DT>  <NN>  <DT>  <NN>} <VBD>  <.> - # Split successive determiner/noun pairs: - {<DT>  <NN>} <VBD>  <IN> {<DT>  <NN>}{<DT>  <NN>} <VBD>  <.> - >>> print(chunked_text) - (S - (NP The/DT cat/NN) - sat/VBD - on/IN - (NP the/DT mat/NN) - (NP the/DT dog/NN) - chewed/VBD - ./.) - - >>> chunkscore = ChunkScore() - >>> chunkscore.score(gold_chunked_text, chunked_text) - >>> chunkscore.precision() - 1.0 - - >>> chunkscore.recall() - 1.0 - - >>> chunkscore.f_measure() - 1.0 - - >>> chunkscore.missed() - [] - - >>> chunkscore.incorrect() - [] - - >>> chunk_parser.rules() - [<ChunkRule: '<.*>+'>, <StripRule: '<VBD|IN|\.>'>, - <SplitRule: '<DT><NN>', '<DT><NN>
    '>] - -Printing parsers: - - >>> print(repr(chunk_parser)) - - >>> print(chunk_parser) - RegexpChunkParser with 3 rules: - Chunk everything - +'> - Strip on verbs/prepositions - '> - Split successive determiner/noun pairs - ', '
    '> - -Regression Tests -~~~~~~~~~~~~~~~~ -ChunkParserI ------------- -`ChunkParserI` is an abstract interface -- it is not meant to be -instantiated directly. - - >>> ChunkParserI().parse([]) - Traceback (most recent call last): - . . . - NotImplementedError - - -ChunkString ------------ -ChunkString can be built from a tree of tagged tuples, a tree of -trees, or a mixed list of both: - - >>> t1 = Tree('S', [('w%d' % i, 't%d' % i) for i in range(10)]) - >>> t2 = Tree('S', [Tree('t0', []), Tree('t1', ['c1'])]) - >>> t3 = Tree('S', [('w0', 't0'), Tree('t1', ['c1'])]) - >>> ChunkString(t1) - '> - >>> ChunkString(t2) - '> - >>> ChunkString(t3) - '> - -Other values generate an error: - - >>> ChunkString(Tree('S', ['x'])) - Traceback (most recent call last): - . . . - ValueError: chunk structures must contain tagged tokens or trees - -The `str()` for a chunk string adds spaces to it, which makes it line -up with `str()` output for other chunk strings over the same -underlying input. - - >>> cs = ChunkString(t1) - >>> print(cs) - - >>> cs.xform('', '{}') - >>> print(cs) - {} - -The `_verify()` method makes sure that our transforms don't corrupt -the chunk string. By setting debug_level=2, `_verify()` will be -called at the end of every call to `xform`. - - >>> cs = ChunkString(t1, debug_level=3) - - >>> # tag not marked with <...>: - >>> cs.xform('', 't3') - Traceback (most recent call last): - . . . - ValueError: Transformation generated invalid chunkstring: - t3 - - >>> # brackets not balanced: - >>> cs.xform('', '{') - Traceback (most recent call last): - . . . - ValueError: Transformation generated invalid chunkstring: - { - - >>> # nested brackets: - >>> cs.xform('', '{{}}') - Traceback (most recent call last): - . . . - ValueError: Transformation generated invalid chunkstring: - {{}} - - >>> # modified tags: - >>> cs.xform('', '') - Traceback (most recent call last): - . . . - ValueError: Transformation generated invalid chunkstring: tag changed - - >>> # added tags: - >>> cs.xform('', '') - Traceback (most recent call last): - . . . - ValueError: Transformation generated invalid chunkstring: tag changed - -Chunking Rules --------------- - -Test the different rule constructors & __repr__ methods: - - >>> r1 = RegexpChunkRule(''+ChunkString.IN_STRIP_PATTERN, - ... '{}', 'chunk and ') - >>> r2 = RegexpChunkRule(re.compile(''+ChunkString.IN_STRIP_PATTERN), - ... '{}', 'chunk and ') - >>> r3 = ChunkRule('', 'chunk and ') - >>> r4 = StripRule('', 'strip and ') - >>> r5 = UnChunkRule('', 'unchunk and ') - >>> r6 = MergeRule('', '', 'merge w/ ') - >>> r7 = SplitRule('', '', 'split from ') - >>> r8 = ExpandLeftRule('', '', 'expand left ') - >>> r9 = ExpandRightRule('', '', 'expand right ') - >>> for rule in r1, r2, r3, r4, r5, r6, r7, r8, r9: - ... print(rule) - (?=[^\\}]*(\\{|$))'->'{}'> - (?=[^\\}]*(\\{|$))'->'{}'> - '> - '> - '> - ', ''> - ', ''> - ', ''> - ', ''> - -`tag_pattern2re_pattern()` complains if the tag pattern looks problematic: - - >>> tag_pattern2re_pattern('{}') - Traceback (most recent call last): - . . . - ValueError: Bad tag pattern: '{}' - -RegexpChunkParser ------------------ - -A warning is printed when parsing an empty sentence: - - >>> parser = RegexpChunkParser([ChunkRule('', '')]) - >>> parser.parse(Tree('S', [])) - Warning: parsing empty text - Tree('S', []) - -RegexpParser ------------- - - >>> parser = RegexpParser(''' - ... NP: {
<DT>? <JJ>* <NN>*} # NP - ... P: {<IN>} # Preposition - ... V: {<V.*>} # Verb - ... PP: {<P> <NP>} # PP -> P NP - ... VP: {<V> <NP|PP>*} # VP -> V (NP|PP)* - ... ''') - >>> print(repr(parser)) - <chunk.RegexpParser with 5 stages> - >>> print(parser) - chunk.RegexpParser with 5 stages: - RegexpChunkParser with 1 rules: - NP <ChunkRule: '<DT>? <JJ>* <NN>*'> - RegexpChunkParser with 1 rules: - Preposition <ChunkRule: '<IN>'> - RegexpChunkParser with 1 rules: - Verb <ChunkRule: '<V.*>'> - RegexpChunkParser with 1 rules: - PP -> P NP <ChunkRule: '<P> <NP>'> - RegexpChunkParser with 1 rules: - VP -> V (NP|PP)* <ChunkRule: '<V> <NP|PP>*'> - >>> print(parser.parse(unchunked_text, trace=True)) - # Input: - <DT>  <NN>  <VBD>  <IN>  <DT>  <NN>  <DT>  <NN>  <VBD>  <.> - # NP: - {<DT>  <NN>} <VBD>  <IN> {<DT>  <NN>}{<DT>  <NN>} <VBD>  <.> - # Input: - <NP>  <VBD>  <IN>  <NP>  <NP>  <VBD>  <.> - # Preposition: - <NP>  <VBD> {<IN>} <NP>  <NP>  <VBD>  <.> - # Input: - <NP>  <VBD>  <P>  <NP>  <NP>  <VBD>  <.> - # Verb: - <NP> {<VBD>} <P>  <NP>  <NP> {<VBD>} <.> - # Input: - <NP>  <V>  <P>  <NP>  <NP>  <V>  <.> - # PP -> P NP: - <NP>  <V> {<P>  <NP>
    } <.> - # Input: - <.> - # VP -> V (NP|PP)*: - { }{} <.> - (S - (NP The/DT cat/NN) - (VP - (V sat/VBD) - (PP (P on/IN) (NP the/DT mat/NN)) - (NP the/DT dog/NN)) - (VP (V chewed/VBD)) - ./.) - -Test parsing of other rule types: - - >>> print(RegexpParser(''' - ... X: - ... }{ # strip rule - ... }{ # split rule - ... {} # merge rule - ... {} # chunk rule w/ context - ... ''')) - chunk.RegexpParser with 1 stages: - RegexpChunkParser with 4 rules: - strip rule '> - split rule ', ''> - merge rule ', ''> - chunk rule w/ context ', '', ''> - -Illegal patterns give an error message: - - >>> print(RegexpParser('X: {} {}')) - Traceback (most recent call last): - . . . - ValueError: Illegal chunk pattern: {} {} diff --git a/pipeline/nltk/test/classify.doctest b/pipeline/nltk/test/classify.doctest deleted file mode 100644 index ef1bf7c9563488b6d85bb9098f540c1ce11af34b..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/classify.doctest +++ /dev/null @@ -1,202 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -============= - Classifiers -============= - - >>> from nltk.test.classify_fixt import setup_module - >>> setup_module() - -Classifiers label tokens with category labels (or *class labels*). -Typically, labels are represented with strings (such as ``"health"`` -or ``"sports"``. In NLTK, classifiers are defined using classes that -implement the `ClassifierI` interface, which supports the following operations: - -- self.classify(featureset) -- self.classify_many(featuresets) -- self.labels() -- self.prob_classify(featureset) -- self.prob_classify_many(featuresets) - -NLTK defines several classifier classes: - -- `ConditionalExponentialClassifier` -- `DecisionTreeClassifier` -- `MaxentClassifier` -- `NaiveBayesClassifier` -- `WekaClassifier` - -Classifiers are typically created by training them on a training -corpus. - - -Regression Tests -~~~~~~~~~~~~~~~~ - -We define a very simple training corpus with 3 binary features: ['a', -'b', 'c'], and are two labels: ['x', 'y']. We use a simple feature set so -that the correct answers can be calculated analytically (although we -haven't done this yet for all tests). - - >>> import nltk - >>> train = [ - ... (dict(a=1,b=1,c=1), 'y'), - ... (dict(a=1,b=1,c=1), 'x'), - ... (dict(a=1,b=1,c=0), 'y'), - ... (dict(a=0,b=1,c=1), 'x'), - ... (dict(a=0,b=1,c=1), 'y'), - ... (dict(a=0,b=0,c=1), 'y'), - ... (dict(a=0,b=1,c=0), 'x'), - ... (dict(a=0,b=0,c=0), 'x'), - ... (dict(a=0,b=1,c=1), 'y'), - ... (dict(a=None,b=1,c=0), 'x'), - ... ] - >>> test = [ - ... (dict(a=1,b=0,c=1)), # unseen - ... (dict(a=1,b=0,c=0)), # unseen - ... (dict(a=0,b=1,c=1)), # seen 3 times, labels=y,y,x - ... (dict(a=0,b=1,c=0)), # seen 1 time, label=x - ... ] - -Test the Naive Bayes classifier: - - >>> classifier = nltk.classify.NaiveBayesClassifier.train(train) - >>> sorted(classifier.labels()) - ['x', 'y'] - >>> classifier.classify_many(test) - ['y', 'x', 'y', 'x'] - >>> for pdist in classifier.prob_classify_many(test): - ... print('%.4f %.4f' % (pdist.prob('x'), pdist.prob('y'))) - 0.2500 0.7500 - 0.5833 0.4167 - 0.3571 0.6429 - 0.7000 0.3000 - >>> classifier.show_most_informative_features() - Most Informative Features - c = 0 x : y = 2.3 : 1.0 - c = 1 y : x = 1.8 : 1.0 - a = 1 y : x = 1.7 : 1.0 - a = 0 x : y = 1.0 : 1.0 - b = 0 x : y = 1.0 : 1.0 - b = 1 x : y = 1.0 : 1.0 - -Test the Decision Tree classifier (without None): - - >>> classifier = nltk.classify.DecisionTreeClassifier.train( - ... 
train[:-1], entropy_cutoff=0, - ... support_cutoff=0) - >>> sorted(classifier.labels()) - ['x', 'y'] - >>> print(classifier) - c=0? .................................................. x - a=0? ................................................ x - a=1? ................................................ y - c=1? .................................................. y - - >>> classifier.classify_many(test) - ['y', 'y', 'y', 'x'] - >>> for pdist in classifier.prob_classify_many(test): - ... print('%.4f %.4f' % (pdist.prob('x'), pdist.prob('y'))) - Traceback (most recent call last): - . . . - NotImplementedError - - -Test the Decision Tree classifier (with None): - - >>> classifier = nltk.classify.DecisionTreeClassifier.train( - ... train, entropy_cutoff=0, - ... support_cutoff=0) - >>> sorted(classifier.labels()) - ['x', 'y'] - >>> print(classifier) - c=0? .................................................. x - a=0? ................................................ x - a=1? ................................................ y - a=None? ............................................. x - c=1? .................................................. y - - - -Test SklearnClassifier, which requires the scikit-learn package. - - >>> from nltk.classify import SklearnClassifier - >>> from sklearn.naive_bayes import BernoulliNB - >>> from sklearn.svm import SVC - >>> train_data = [({"a": 4, "b": 1, "c": 0}, "ham"), - ... ({"a": 5, "b": 2, "c": 1}, "ham"), - ... ({"a": 0, "b": 3, "c": 4}, "spam"), - ... ({"a": 5, "b": 1, "c": 1}, "ham"), - ... ({"a": 1, "b": 4, "c": 3}, "spam")] - >>> classif = SklearnClassifier(BernoulliNB()).train(train_data) - >>> test_data = [{"a": 3, "b": 2, "c": 1}, - ... {"a": 0, "b": 3, "c": 7}] - >>> classif.classify_many(test_data) - ['ham', 'spam'] - >>> classif = SklearnClassifier(SVC(), sparse=False).train(train_data) - >>> classif.classify_many(test_data) - ['ham', 'spam'] - -Test the Maximum Entropy classifier training algorithms; they should all -generate the same results. - - >>> def print_maxent_test_header(): - ... print(' '*11+''.join([' test[%s] ' % i - ... for i in range(len(test))])) - ... print(' '*11+' p(x) p(y)'*len(test)) - ... print('-'*(11+15*len(test))) - - >>> def test_maxent(algorithm): - ... print('%11s' % algorithm, end=' ') - ... try: - ... classifier = nltk.classify.MaxentClassifier.train( - ... train, algorithm, trace=0, max_iter=1000) - ... except Exception as e: - ... print('Error: %r' % e) - ... return - ... - ... for featureset in test: - ... pdist = classifier.prob_classify(featureset) - ... print('%8.2f%6.2f' % (pdist.prob('x'), pdist.prob('y')), end=' ') - ... print() - - >>> print_maxent_test_header(); test_maxent('GIS'); test_maxent('IIS') - test[0] test[1] test[2] test[3] - p(x) p(y) p(x) p(y) p(x) p(y) p(x) p(y) - ----------------------------------------------------------------------- - GIS 0.16 0.84 0.46 0.54 0.41 0.59 0.76 0.24 - IIS 0.16 0.84 0.46 0.54 0.41 0.59 0.76 0.24 - - >>> test_maxent('MEGAM'); test_maxent('TADM') # doctest: +SKIP - MEGAM 0.16 0.84 0.46 0.54 0.41 0.59 0.76 0.24 - TADM 0.16 0.84 0.46 0.54 0.41 0.59 0.76 0.24 - - - -Regression tests for TypedMaxentFeatureEncoding -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - >>> from nltk.classify import maxent - >>> train = [ - ... ({'a': 1, 'b': 1, 'c': 1}, 'y'), - ... ({'a': 5, 'b': 5, 'c': 5}, 'x'), - ... ({'a': 0.9, 'b': 0.9, 'c': 0.9}, 'y'), - ... ({'a': 5.5, 'b': 5.4, 'c': 5.3}, 'x'), - ... ({'a': 0.8, 'b': 1.2, 'c': 1}, 'y'), - ... ({'a': 5.1, 'b': 4.9, 'c': 5.2}, 'x') - ... 
] - - >>> test = [ - ... {'a': 1, 'b': 0.8, 'c': 1.2}, - ... {'a': 5.2, 'b': 5.1, 'c': 5} - ... ] - - >>> encoding = maxent.TypedMaxentFeatureEncoding.train( - ... train, count_cutoff=3, alwayson_features=True) - - >>> classifier = maxent.MaxentClassifier.train( - ... train, bernoulli=False, encoding=encoding, trace=0) - - >>> classifier.classify_many(test) - ['y', 'x'] diff --git a/pipeline/nltk/test/classify_fixt.py b/pipeline/nltk/test/classify_fixt.py deleted file mode 100644 index 17b037281aff04a7d9a1faf56ccd9b055e1a1071..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/classify_fixt.py +++ /dev/null @@ -1,5 +0,0 @@ -# most of classify.doctest requires numpy -def setup_module(): - import pytest - - pytest.importorskip("numpy") diff --git a/pipeline/nltk/test/collections.doctest b/pipeline/nltk/test/collections.doctest deleted file mode 100644 index 6dd98358a31a69c881c217cd5cbdbd12a0ee3d21..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/collections.doctest +++ /dev/null @@ -1,31 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -=========== -Collections -=========== - - >>> import nltk - >>> from nltk.collections import * - -Trie ----- - -Trie can be pickled: - - >>> import pickle - >>> trie = nltk.collections.Trie(['a']) - >>> s = pickle.dumps(trie) - >>> pickle.loads(s) - {'a': {True: None}} - -LazyIteratorList ----------------- - -Fetching the length of a LazyIteratorList object does not throw a StopIteration exception: - - >>> lil = LazyIteratorList(i for i in range(1, 11)) - >>> lil[-1] - 10 - >>> len(lil) - 10 diff --git a/pipeline/nltk/test/collocations.doctest b/pipeline/nltk/test/collocations.doctest deleted file mode 100644 index 3a3471e27b300396c4664dcc0e03a48771d4306d..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/collocations.doctest +++ /dev/null @@ -1,307 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -============== - Collocations -============== - -Overview -~~~~~~~~ - -Collocations are expressions of multiple words which commonly co-occur. For -example, the top ten bigram collocations in Genesis are listed below, as -measured using Pointwise Mutual Information. - - >>> import nltk - >>> from nltk.collocations import * - >>> bigram_measures = nltk.collocations.BigramAssocMeasures() - >>> trigram_measures = nltk.collocations.TrigramAssocMeasures() - >>> fourgram_measures = nltk.collocations.QuadgramAssocMeasures() - >>> finder = BigramCollocationFinder.from_words( - ... nltk.corpus.genesis.words('english-web.txt')) - >>> finder.nbest(bigram_measures.pmi, 10) - [('Allon', 'Bacuth'), ('Ashteroth', 'Karnaim'), ('Ben', 'Ammi'), - ('En', 'Mishpat'), ('Jegar', 'Sahadutha'), ('Salt', 'Sea'), - ('Whoever', 'sheds'), ('appoint', 'overseers'), ('aromatic', 'resin'), - ('cutting', 'instrument')] - -While these words are highly collocated, the expressions are also very -infrequent. Therefore it is useful to apply filters, such as ignoring all -bigrams which occur less than three times in the corpus: - - >>> finder.apply_freq_filter(3) - >>> finder.nbest(bigram_measures.pmi, 10) - [('Beer', 'Lahai'), ('Lahai', 'Roi'), ('gray', 'hairs'), - ('ewe', 'lambs'), ('Most', 'High'), ('many', 'colors'), - ('burnt', 'offering'), ('Paddan', 'Aram'), ('east', 'wind'), - ('living', 'creature')] - -We may similarly find collocations among tagged words: - - >>> finder = BigramCollocationFinder.from_words( - ... 
nltk.corpus.brown.tagged_words('ca01', tagset='universal')) - >>> finder.nbest(bigram_measures.pmi, 5) - [(('1,119', 'NUM'), ('votes', 'NOUN')), - (('1962', 'NUM'), ("governor's", 'NOUN')), - (('637', 'NUM'), ('E.', 'NOUN')), - (('Alpharetta', 'NOUN'), ('prison', 'NOUN')), - (('Bar', 'NOUN'), ('Association', 'NOUN'))] - -Or tags alone: - - >>> finder = BigramCollocationFinder.from_words(t for w, t in - ... nltk.corpus.brown.tagged_words('ca01', tagset='universal')) - >>> finder.nbest(bigram_measures.pmi, 10) - [('PRT', 'VERB'), ('PRON', 'VERB'), ('ADP', 'DET'), ('.', 'PRON'), ('DET', 'ADJ'), - ('CONJ', 'PRON'), ('ADP', 'NUM'), ('NUM', '.'), ('ADV', 'ADV'), ('VERB', 'ADV')] - -Or spanning intervening words: - - >>> finder = BigramCollocationFinder.from_words( - ... nltk.corpus.genesis.words('english-web.txt'), - ... window_size = 20) - >>> finder.apply_freq_filter(2) - >>> ignored_words = nltk.corpus.stopwords.words('english') - >>> finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words) - >>> finder.nbest(bigram_measures.likelihood_ratio, 10) - [('chief', 'chief'), ('became', 'father'), ('years', 'became'), - ('hundred', 'years'), ('lived', 'became'), ('king', 'king'), - ('lived', 'years'), ('became', 'became'), ('chief', 'chiefs'), - ('hundred', 'became')] - -Finders -~~~~~~~ - -The collocations package provides collocation finders which by default -consider all ngrams in a text as candidate collocations: - - >>> text = "I do not like green eggs and ham, I do not like them Sam I am!" - >>> tokens = nltk.wordpunct_tokenize(text) - >>> finder = BigramCollocationFinder.from_words(tokens) - >>> scored = finder.score_ngrams(bigram_measures.raw_freq) - >>> sorted(bigram for bigram, score in scored) - [(',', 'I'), ('I', 'am'), ('I', 'do'), ('Sam', 'I'), ('am', '!'), - ('and', 'ham'), ('do', 'not'), ('eggs', 'and'), ('green', 'eggs'), - ('ham', ','), ('like', 'green'), ('like', 'them'), ('not', 'like'), - ('them', 'Sam')] - -We could otherwise construct the collocation finder from manually-derived -FreqDists: - - >>> word_fd = nltk.FreqDist(tokens) - >>> bigram_fd = nltk.FreqDist(nltk.bigrams(tokens)) - >>> finder = BigramCollocationFinder(word_fd, bigram_fd) - >>> scored == finder.score_ngrams(bigram_measures.raw_freq) - True - -A similar interface is provided for trigrams: - - >>> finder = TrigramCollocationFinder.from_words(tokens) - >>> scored = finder.score_ngrams(trigram_measures.raw_freq) - >>> set(trigram for trigram, score in scored) == set(nltk.trigrams(tokens)) - True - -We may want to select only the top n results: - - >>> sorted(finder.nbest(trigram_measures.raw_freq, 2)) - [('I', 'do', 'not'), ('do', 'not', 'like')] - -Alternatively, we can select those above a minimum score value: - - >>> sorted(finder.above_score(trigram_measures.raw_freq, - ... 
1.0 / len(tuple(nltk.trigrams(tokens))))) - [('I', 'do', 'not'), ('do', 'not', 'like')] - -Now spanning intervening words: - - >>> finder = TrigramCollocationFinder.from_words(tokens) - >>> finder = TrigramCollocationFinder.from_words(tokens, window_size=4) - >>> sorted(finder.nbest(trigram_measures.raw_freq, 4)) - [('I', 'do', 'like'), ('I', 'do', 'not'), ('I', 'not', 'like'), ('do', 'not', 'like')] - -A closer look at the finder's ngram frequencies: - - >>> sorted(finder.ngram_fd.items(), key=lambda t: (-t[1], t[0]))[:10] - [(('I', 'do', 'like'), 2), (('I', 'do', 'not'), 2), (('I', 'not', 'like'), 2), - (('do', 'not', 'like'), 2), ((',', 'I', 'do'), 1), ((',', 'I', 'not'), 1), - ((',', 'do', 'not'), 1), (('I', 'am', '!'), 1), (('Sam', 'I', '!'), 1), - (('Sam', 'I', 'am'), 1)] - -A similar interface is provided for fourgrams: - - >>> finder_4grams = QuadgramCollocationFinder.from_words(tokens) - >>> scored_4grams = finder_4grams.score_ngrams(fourgram_measures.raw_freq) - >>> set(fourgram for fourgram, score in scored_4grams) == set(nltk.ngrams(tokens, n=4)) - True - -Filtering candidates -~~~~~~~~~~~~~~~~~~~~ - -All the ngrams in a text are often too many to be useful when finding -collocations. It is generally useful to remove some words or punctuation, -and to require a minimum frequency for candidate collocations. - -Given our sample text above, if we remove all trigrams containing personal -pronouns from candidature, score_ngrams should return 6 less results, and -'do not like' will be the only candidate which occurs more than once: - - >>> finder = TrigramCollocationFinder.from_words(tokens) - >>> len(finder.score_ngrams(trigram_measures.raw_freq)) - 14 - >>> finder.apply_word_filter(lambda w: w in ('I', 'me')) - >>> len(finder.score_ngrams(trigram_measures.raw_freq)) - 8 - >>> sorted(finder.above_score(trigram_measures.raw_freq, - ... 1.0 / len(tuple(nltk.trigrams(tokens))))) - [('do', 'not', 'like')] - -Sometimes a filter is a function on the whole ngram, rather than each word, -such as if we may permit 'and' to appear in the middle of a trigram, but -not on either edge: - - >>> finder.apply_ngram_filter(lambda w1, w2, w3: 'and' in (w1, w3)) - >>> len(finder.score_ngrams(trigram_measures.raw_freq)) - 6 - -Finally, it is often important to remove low frequency candidates, as we -lack sufficient evidence about their significance as collocations: - - >>> finder.apply_freq_filter(2) - >>> len(finder.score_ngrams(trigram_measures.raw_freq)) - 1 - -Association measures -~~~~~~~~~~~~~~~~~~~~ - -A number of measures are available to score collocations or other associations. -The arguments to measure functions are marginals of a contingency table, in the -bigram case (n_ii, (n_ix, n_xi), n_xx):: - - w1 ~w1 - ------ ------ - w2 | n_ii | n_oi | = n_xi - ------ ------ - ~w2 | n_io | n_oo | - ------ ------ - = n_ix TOTAL = n_xx - -We test their calculation using some known values presented in Manning and -Schutze's text and other papers. 
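As a compact illustration of the finder/filter/scoring flow covered above, the sketch below builds a BigramCollocationFinder over the doctest's small in-memory token list, so it runs without downloading any corpus; the exact ranking it prints depends on the association measure chosen.

    import nltk
    from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

    bigram_measures = BigramAssocMeasures()
    text = "I do not like green eggs and ham, I do not like them Sam I am!"
    tokens = nltk.wordpunct_tokenize(text)

    finder = BigramCollocationFinder.from_words(tokens)
    finder.apply_word_filter(lambda w: not w.isalpha())    # drop punctuation tokens
    finder.apply_freq_filter(2)                            # keep bigrams seen at least twice
    print(finder.nbest(bigram_measures.pmi, 3))            # top remaining bigrams by PMI
    print(finder.score_ngrams(bigram_measures.raw_freq))   # (bigram, score) pairs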
- -Student's t: examples from Manning and Schutze 5.3.2 - - >>> print('%0.4f' % bigram_measures.student_t(8, (15828, 4675), 14307668)) - 0.9999 - >>> print('%0.4f' % bigram_measures.student_t(20, (42, 20), 14307668)) - 4.4721 - -Chi-square: examples from Manning and Schutze 5.3.3 - - >>> print('%0.2f' % bigram_measures.chi_sq(8, (15828, 4675), 14307668)) - 1.55 - >>> print('%0.0f' % bigram_measures.chi_sq(59, (67, 65), 571007)) - 456400 - -Likelihood ratios: examples from Dunning, CL, 1993 - - >>> print('%0.2f' % bigram_measures.likelihood_ratio(110, (2552, 221), 31777)) - 270.72 - >>> print('%0.2f' % bigram_measures.likelihood_ratio(8, (13, 32), 31777)) - 95.29 - -Pointwise Mutual Information: examples from Manning and Schutze 5.4 - - >>> print('%0.2f' % bigram_measures.pmi(20, (42, 20), 14307668)) - 18.38 - >>> print('%0.2f' % bigram_measures.pmi(20, (15019, 15629), 14307668)) - 0.29 - -TODO: Find authoritative results for trigrams. - -Using contingency table values -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -While frequency counts make marginals readily available for collocation -finding, it is common to find published contingency table values. The -collocations package therefore provides a wrapper, ContingencyMeasures, which -wraps an association measures class, providing association measures which -take contingency values as arguments, (n_ii, n_io, n_oi, n_oo) in the -bigram case. - - >>> from nltk.metrics import ContingencyMeasures - >>> cont_bigram_measures = ContingencyMeasures(bigram_measures) - >>> print('%0.2f' % cont_bigram_measures.likelihood_ratio(8, 5, 24, 31740)) - 95.29 - >>> print('%0.2f' % cont_bigram_measures.chi_sq(8, 15820, 4667, 14287173)) - 1.55 - -Ranking and correlation -~~~~~~~~~~~~~~~~~~~~~~~ - -It is useful to consider the results of finding collocations as a ranking, and -the rankings output using different association measures can be compared using -the Spearman correlation coefficient. - -Ranks can be assigned to a sorted list of results trivially by assigning -strictly increasing ranks to each result: - - >>> from nltk.metrics.spearman import * - >>> results_list = ['item1', 'item2', 'item3', 'item4', 'item5'] - >>> print(list(ranks_from_sequence(results_list))) - [('item1', 0), ('item2', 1), ('item3', 2), ('item4', 3), ('item5', 4)] - -If scores are available for each result, we may allow sufficiently similar -results (differing by no more than rank_gap) to be assigned the same rank: - - >>> results_scored = [('item1', 50.0), ('item2', 40.0), ('item3', 38.0), - ... ('item4', 35.0), ('item5', 14.0)] - >>> print(list(ranks_from_scores(results_scored, rank_gap=5))) - [('item1', 0), ('item2', 1), ('item3', 1), ('item4', 1), ('item5', 4)] - -The Spearman correlation coefficient gives a number from -1.0 to 1.0 comparing -two rankings. A coefficient of 1.0 indicates identical rankings; -1.0 indicates -exact opposite rankings. - - >>> print('%0.1f' % spearman_correlation( - ... ranks_from_sequence(results_list), - ... ranks_from_sequence(results_list))) - 1.0 - >>> print('%0.1f' % spearman_correlation( - ... ranks_from_sequence(reversed(results_list)), - ... ranks_from_sequence(results_list))) - -1.0 - >>> results_list2 = ['item2', 'item3', 'item1', 'item5', 'item4'] - >>> print('%0.1f' % spearman_correlation( - ... ranks_from_sequence(results_list), - ... ranks_from_sequence(results_list2))) - 0.6 - >>> print('%0.1f' % spearman_correlation( - ... ranks_from_sequence(reversed(results_list)), - ... 
ranks_from_sequence(results_list2))) - -0.6 - -Keywords -~~~~~~~~ - -Bigram association metrics can also be used to perform keyword analysis. . For example, this finds the keywords -associated with the "romance" section of the Brown corpus as measured by likelihood ratio: - - >>> romance = nltk.FreqDist(w.lower() for w in nltk.corpus.brown.words(categories='romance') if w.isalpha()) - >>> freq = nltk.FreqDist(w.lower() for w in nltk.corpus.brown.words() if w.isalpha()) - - >>> key = nltk.FreqDist() - >>> for w in romance: - ... key[w] = bigram_measures.likelihood_ratio(romance[w], (freq[w], romance.N()), freq.N()) - - >>> for k,v in key.most_common(10): - ... print(f'{k:10s} {v:9.3f}') - she 1163.325 - i 995.961 - her 930.528 - you 513.149 - of 501.891 - is 463.386 - had 421.615 - he 411.000 - the 347.632 - said 300.811 diff --git a/pipeline/nltk/test/concordance.doctest b/pipeline/nltk/test/concordance.doctest deleted file mode 100644 index 8dbd81a01818b99681be51491bd3eaadd0c86e38..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/concordance.doctest +++ /dev/null @@ -1,75 +0,0 @@ -.. Copyright (C) 2001-2016 NLTK Project -.. For license information, see LICENSE.TXT - -================================== -Concordance Example -================================== - -A concordance view shows us every occurrence of a given -word, together with some context. Here we look up the word monstrous -in Moby Dick by entering text1 followed by a period, then the term -concordance, and then placing "monstrous" in parentheses: - ->>> from nltk.corpus import gutenberg ->>> from nltk.text import Text ->>> corpus = gutenberg.words('melville-moby_dick.txt') ->>> text = Text(corpus) - ->>> text.concordance("monstrous") -Displaying 11 of 11 matches: -ong the former , one was of a most monstrous size . ... This came towards us , -ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r -ll over with a heathenish array of monstrous clubs and spears . Some were thick -d as you gazed , and wondered what monstrous cannibal and savage could ever hav -that has survived the flood ; most monstrous and most mountainous ! That Himmal -they might scout at Moby Dick as a monstrous fable , or still worse and more de -th of Radney .'" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l -ing Scenes . In connexion with the monstrous pictures of whales , I am strongly -ere to enter upon those still more monstrous stories of them which are to be fo -ght have been rummaged out of this monstrous cabinet there is no telling . But -of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u - ->>> text.concordance("monstrous") -Displaying 11 of 11 matches: -ong the former , one was of a most monstrous size . ... This came towards us , -ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r -ll over with a heathenish array of monstrous clubs and spears . Some were thick -... - -We can also search for a multi-word phrase by passing a list of strings: - ->>> text.concordance(["monstrous", "size"]) -Displaying 2 of 2 matches: -the former , one was of a most monstrous size . ... This came towards us , op -Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead upo - -================================= -Concordance List -================================= - -Often we need to store the results of concordance for further usage. 
-To do so, call the concordance function with the stdout argument set -to false: - ->>> from nltk.corpus import gutenberg ->>> from nltk.text import Text ->>> corpus = gutenberg.words('melville-moby_dick.txt') ->>> text = Text(corpus) ->>> con_list = text.concordance_list("monstrous") ->>> con_list[2].line -'ll over with a heathenish array of monstrous clubs and spears . Some were thick' ->>> len(con_list) -11 - -================================= -Patching Issue #2088 -================================= - -Patching https://github.com/nltk/nltk/issues/2088 -The left slice of the left context should be clip to 0 if the `i-context` < 0. - ->>> from nltk import Text, word_tokenize ->>> jane_eyre = 'Chapter 1\nTHERE was no possibility of taking a walk that day. We had been wandering, indeed, in the leafless shrubbery an hour in the morning; but since dinner (Mrs. Reed, when there was no company, dined early) the cold winter wind had brought with it clouds so sombre, and a rain so penetrating, that further outdoor exercise was now out of the question.' ->>> text = Text(word_tokenize(jane_eyre)) ->>> text.concordance_list('taking')[0].left -['Chapter', '1', 'THERE', 'was', 'no', 'possibility', 'of'] diff --git a/pipeline/nltk/test/conftest.py b/pipeline/nltk/test/conftest.py deleted file mode 100644 index d5e89a36725cb1da9ec3865c215c357ef98cabbe..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/conftest.py +++ /dev/null @@ -1,33 +0,0 @@ -import pytest - -from nltk.corpus.reader import CorpusReader - - -@pytest.fixture(autouse=True) -def mock_plot(mocker): - """Disable matplotlib plotting in test code""" - - try: - import matplotlib.pyplot as plt - - mocker.patch.object(plt, "gca") - mocker.patch.object(plt, "show") - except ImportError: - pass - - -@pytest.fixture(scope="module", autouse=True) -def teardown_loaded_corpora(): - """ - After each test session ends (either doctest or unit test), - unload any loaded corpora - """ - - yield # first, wait for the test to end - - import nltk.corpus - - for name in dir(nltk.corpus): - obj = getattr(nltk.corpus, name, None) - if isinstance(obj, CorpusReader) and hasattr(obj, "_unload"): - obj._unload() diff --git a/pipeline/nltk/test/corpus.doctest b/pipeline/nltk/test/corpus.doctest deleted file mode 100644 index 4e650d850bbe5327266f159db637f563867ef2b3..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/corpus.doctest +++ /dev/null @@ -1,2196 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -================ - Corpus Readers -================ - -The `nltk.corpus` package defines a collection of *corpus reader* -classes, which can be used to access the contents of a diverse set of -corpora. The list of available corpora is given at: - -https://www.nltk.org/nltk_data/ - -Each corpus reader class is specialized to handle a specific -corpus format. In addition, the `nltk.corpus` package automatically -creates a set of corpus reader instances that can be used to access -the corpora in the NLTK data package. -Section `Corpus Reader Objects`_ ("Corpus Reader Objects") describes -the corpus reader instances that can be used to read the corpora in -the NLTK data package. Section `Corpus Reader Classes`_ ("Corpus -Reader Classes") describes the corpus reader classes themselves, and -discusses the issues involved in creating new corpus reader objects -and new corpus reader classes. 
Section `Regression Tests`_ -("Regression Tests") contains regression tests for the corpus readers -and associated functions and classes. - -.. contents:: **Table of Contents** - :depth: 4 - :backlinks: none - ---------------------- -Corpus Reader Objects ---------------------- - -Overview -======== - -NLTK includes a diverse set of corpora which can be -read using the ``nltk.corpus`` package. Each corpus is accessed by -means of a "corpus reader" object from ``nltk.corpus``: - - >>> import nltk.corpus - >>> # The Brown corpus: - >>> print(str(nltk.corpus.brown).replace('\\\\','/')) - - >>> # The Penn Treebank Corpus: - >>> print(str(nltk.corpus.treebank).replace('\\\\','/')) - - >>> # The Name Genders Corpus: - >>> print(str(nltk.corpus.names).replace('\\\\','/')) - - >>> # The Inaugural Address Corpus: - >>> print(str(nltk.corpus.inaugural).replace('\\\\','/')) - - -Most corpora consist of a set of files, each containing a document (or -other pieces of text). A list of identifiers for these files is -accessed via the ``fileids()`` method of the corpus reader: - - >>> nltk.corpus.treebank.fileids() - ['wsj_0001.mrg', 'wsj_0002.mrg', 'wsj_0003.mrg', 'wsj_0004.mrg', ...] - >>> nltk.corpus.inaugural.fileids() - ['1789-Washington.txt', '1793-Washington.txt', '1797-Adams.txt', ...] - -Each corpus reader provides a variety of methods to read data from the -corpus, depending on the format of the corpus. For example, plaintext -corpora support methods to read the corpus as raw text, a list of -words, a list of sentences, or a list of paragraphs. - - >>> from nltk.corpus import inaugural - >>> inaugural.raw('1789-Washington.txt') - 'Fellow-Citizens of the Senate ...' - >>> inaugural.words('1789-Washington.txt') - ['Fellow', '-', 'Citizens', 'of', 'the', ...] - >>> inaugural.sents('1789-Washington.txt') - [['Fellow', '-', 'Citizens'...], ['Among', 'the', 'vicissitudes'...]...] - >>> inaugural.paras('1789-Washington.txt') - [[['Fellow', '-', 'Citizens'...]], - [['Among', 'the', 'vicissitudes'...], - ['On', 'the', 'one', 'hand', ',', 'I'...]...]...] - -Each of these reader methods may be given a single document's item -name or a list of document item names. When given a list of document -item names, the reader methods will concatenate together the contents -of the individual documents. - - >>> l1 = len(inaugural.words('1789-Washington.txt')) - >>> l2 = len(inaugural.words('1793-Washington.txt')) - >>> l3 = len(inaugural.words(['1789-Washington.txt', '1793-Washington.txt'])) - >>> print('%s+%s == %s' % (l1, l2, l3)) - 1538+147 == 1685 - -If the reader methods are called without any arguments, they will -typically load all documents in the corpus. - - >>> len(inaugural.words()) - 152901 - -If a corpus contains a README file, it can be accessed with a ``readme()`` method: - - >>> inaugural.readme()[:32] - 'C-Span Inaugural Address Corpus\n' - -Plaintext Corpora -================= - -Here are the first few words from each of NLTK's plaintext corpora: - - >>> nltk.corpus.abc.words() - ['PM', 'denies', 'knowledge', 'of', 'AWB', ...] - >>> nltk.corpus.genesis.words() - ['In', 'the', 'beginning', 'God', 'created', ...] - >>> nltk.corpus.gutenberg.words(fileids='austen-emma.txt') - ['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ...] - >>> nltk.corpus.inaugural.words() - ['Fellow', '-', 'Citizens', 'of', 'the', ...] - >>> nltk.corpus.state_union.words() - ['PRESIDENT', 'HARRY', 'S', '.', 'TRUMAN', "'", ...] - >>> nltk.corpus.webtext.words() - ['Cookie', 'Manager', ':', '"', 'Don', "'", 't', ...] 
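The raw/words/sents/paras interface shown above is the generic plaintext corpus-reader API, and the same interface can be pointed at a local directory of plain-text files. A minimal sketch follows, assuming a hypothetical directory of .txt files; the path and file pattern below are placeholders, and sents() relies on NLTK's default punkt sentence tokenizer being available.

    from nltk.corpus import PlaintextCorpusReader

    corpus_root = '/path/to/my_corpus'          # placeholder directory of .txt files
    reader = PlaintextCorpusReader(corpus_root, r'.*\.txt')

    print(reader.fileids())                     # one id per matching file
    for fid in reader.fileids()[:2]:
        # same methods as the built-in plaintext corpora above
        print(fid, len(reader.words(fid)), len(reader.sents(fid)))
        print(reader.raw(fid)[:60])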
- -Tagged Corpora -============== - -In addition to the plaintext corpora, NLTK's data package also -contains a wide variety of annotated corpora. For example, the Brown -Corpus is annotated with part-of-speech tags, and defines additional -methods ``tagged_*()`` which words as `(word,tag)` tuples, rather -than just bare word strings. - - >>> from nltk.corpus import brown - >>> print(brown.words()) - ['The', 'Fulton', 'County', 'Grand', 'Jury', ...] - >>> print(brown.tagged_words()) - [('The', 'AT'), ('Fulton', 'NP-TL'), ...] - >>> print(brown.sents()) - [['The', 'Fulton', 'County'...], ['The', 'jury', 'further'...], ...] - >>> print(brown.tagged_sents()) - [[('The', 'AT'), ('Fulton', 'NP-TL')...], - [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR')...]...] - >>> print(brown.paras(categories='reviews')) - [[['It', 'is', 'not', 'news', 'that', 'Nathan', 'Milstein'...], - ['Certainly', 'not', 'in', 'Orchestra', 'Hall', 'where'...]], - [['There', 'was', 'about', 'that', 'song', 'something', ...], - ['Not', 'the', 'noblest', 'performance', 'we', 'have', ...], ...], ...] - >>> print(brown.tagged_paras(categories='reviews')) - [[[('It', 'PPS'), ('is', 'BEZ'), ('not', '*'), ...], - [('Certainly', 'RB'), ('not', '*'), ('in', 'IN'), ...]], - [[('There', 'EX'), ('was', 'BEDZ'), ('about', 'IN'), ...], - [('Not', '*'), ('the', 'AT'), ('noblest', 'JJT'), ...], ...], ...] - -Similarly, the Indian Language POS-Tagged Corpus includes samples of -Indian text annotated with part-of-speech tags: - - >>> from nltk.corpus import indian - >>> print(indian.words()) # doctest: +SKIP - ['\xe0\xa6\xae\xe0\xa6\xb9\xe0\xa6\xbf\...', - '\xe0\xa6\xb8\xe0\xa6\xa8\xe0\xa7\x8d\xe0...', ...] - >>> print(indian.tagged_words()) # doctest: +SKIP - [('\xe0\xa6\xae\xe0\xa6\xb9\xe0\xa6\xbf...', 'NN'), - ('\xe0\xa6\xb8\xe0\xa6\xa8\xe0\xa7\x8d\xe0...', 'NN'), ...] - -Several tagged corpora support access to a simplified, universal tagset, e.g. where all nouns -tags are collapsed to a single category ``NOUN``: - - >>> print(brown.tagged_sents(tagset='universal')) - [[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ...], - [('The', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ...]...] - >>> from nltk.corpus import conll2000, switchboard - >>> print(conll2000.tagged_words(tagset='universal')) - [('Confidence', 'NOUN'), ('in', 'ADP'), ...] - -Use ``nltk.app.pos_concordance()`` to access a GUI for searching tagged corpora. - -Chunked Corpora -=============== - -The CoNLL corpora also provide chunk structures, which are encoded as -flat trees. The CoNLL 2000 Corpus includes phrasal chunks; and the -CoNLL 2002 Corpus includes named entity chunks. - - >>> from nltk.corpus import conll2000, conll2002 - >>> print(conll2000.sents()) - [['Confidence', 'in', 'the', 'pound', 'is', 'widely', ...], - ['Chancellor', 'of', 'the', 'Exchequer', ...], ...] - >>> for tree in conll2000.chunked_sents()[:2]: - ... print(tree) - (S - (NP Confidence/NN) - (PP in/IN) - (NP the/DT pound/NN) - (VP is/VBZ widely/RB expected/VBN to/TO take/VB) - (NP another/DT sharp/JJ dive/NN) - if/IN - ...) - (S - Chancellor/NNP - (PP of/IN) - (NP the/DT Exchequer/NNP) - ...) - >>> print(conll2002.sents()) - [['Sao', 'Paulo', '(', 'Brasil', ')', ',', ...], ['-'], ...] - >>> for tree in conll2002.chunked_sents()[:2]: - ... print(tree) - (S - (LOC Sao/NC Paulo/VMI) - (/Fpa - (LOC Brasil/NC) - )/Fpt - ...) - (S -/Fg) - -.. 
note:: Since the CONLL corpora do not contain paragraph break - information, these readers do not support the ``para()`` method.) - -.. warning:: if you call the conll corpora reader methods without any - arguments, they will return the contents of the entire corpus, - *including* the 'test' portions of the corpus.) - -SemCor is a subset of the Brown corpus tagged with WordNet senses and -named entities. Both kinds of lexical items include multiword units, -which are encoded as chunks (senses and part-of-speech tags pertain -to the entire chunk). - - >>> from nltk.corpus import semcor - >>> semcor.words() - ['The', 'Fulton', 'County', 'Grand', 'Jury', ...] - >>> semcor.chunks() - [['The'], ['Fulton', 'County', 'Grand', 'Jury'], ...] - >>> semcor.sents() - [['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...], - ['The', 'jury', 'further', 'said', ...], ...] - >>> semcor.chunk_sents() - [[['The'], ['Fulton', 'County', 'Grand', 'Jury'], ['said'], ... - ['.']], [['The'], ['jury'], ['further'], ['said'], ... ['.']], ...] - >>> list(map(str, semcor.tagged_chunks(tag='both')[:3])) - ['(DT The)', "(Lemma('group.n.01.group') (NE (NNP Fulton County Grand Jury)))", "(Lemma('state.v.01.say') (VB said))"] - >>> [[str(c) for c in s] for s in semcor.tagged_sents(tag='both')[:2]] - [['(DT The)', "(Lemma('group.n.01.group') (NE (NNP Fulton County Grand Jury)))", ... - '(None .)'], ['(DT The)', ... '(None .)']] - - -The IEER corpus is another chunked corpus. This corpus is unusual in -that each corpus item contains multiple documents. (This reflects the -fact that each corpus file contains multiple documents.) The IEER -corpus defines the `parsed_docs` method, which returns the documents -in a given item as `IEERDocument` objects: - - >>> from nltk.corpus import ieer - >>> ieer.fileids() - ['APW_19980314', 'APW_19980424', 'APW_19980429', - 'NYT_19980315', 'NYT_19980403', 'NYT_19980407'] - >>> docs = ieer.parsed_docs('APW_19980314') - >>> print(docs[0]) - - >>> print(docs[0].docno) - APW19980314.0391 - >>> print(docs[0].doctype) - NEWS STORY - >>> print(docs[0].date_time) - 03/14/1998 10:36:00 - >>> print(docs[0].headline) - (DOCUMENT Kenyans protest tax hikes) - >>> print(docs[0].text) - (DOCUMENT - (LOCATION NAIROBI) - , - (LOCATION Kenya) - ( - (ORGANIZATION AP) - ) - _ - (CARDINAL Thousands) - of - laborers, - ... - on - (DATE Saturday) - ...) - -Parsed Corpora -============== - -The Treebank corpora provide a syntactic parse for each sentence. The -NLTK data package includes a 10% sample of the Penn Treebank (in -``treebank``), as well as the Sinica Treebank (in ``sinica_treebank``). - -Reading the Penn Treebank (Wall Street Journal sample): - - >>> from nltk.corpus import treebank - >>> print(treebank.fileids()) - ['wsj_0001.mrg', 'wsj_0002.mrg', 'wsj_0003.mrg', 'wsj_0004.mrg', ...] - >>> print(treebank.words('wsj_0003.mrg')) - ['A', 'form', 'of', 'asbestos', 'once', 'used', ...] - >>> print(treebank.tagged_words('wsj_0003.mrg')) - [('A', 'DT'), ('form', 'NN'), ('of', 'IN'), ...] - >>> print(treebank.parsed_sents('wsj_0003.mrg')[0]) - (S - (S-TPC-1 - (NP-SBJ - (NP (NP (DT A) (NN form)) (PP (IN of) (NP (NN asbestos)))) - (RRC ...)...)...) - ... - (VP (VBD reported) (SBAR (-NONE- 0) (S (-NONE- *T*-1)))) - (. .)) - -If you have access to a full installation of the Penn Treebank, NLTK -can be configured to load it as well. Download the ``ptb`` package, -and in the directory ``nltk_data/corpora/ptb`` place the ``BROWN`` -and ``WSJ`` directories of the Treebank installation (symlinks work -as well). 
Then use the ``ptb`` module instead of ``treebank``: - - >>> from nltk.corpus import ptb - >>> print(ptb.fileids()) # doctest: +SKIP - ['BROWN/CF/CF01.MRG', 'BROWN/CF/CF02.MRG', 'BROWN/CF/CF03.MRG', 'BROWN/CF/CF04.MRG', ...] - >>> print(ptb.words('WSJ/00/WSJ_0003.MRG')) # doctest: +SKIP - ['A', 'form', 'of', 'asbestos', 'once', 'used', '*', ...] - >>> print(ptb.tagged_words('WSJ/00/WSJ_0003.MRG')) # doctest: +SKIP - [('A', 'DT'), ('form', 'NN'), ('of', 'IN'), ...] - -...and so forth, like ``treebank`` but with extended fileids. Categories -specified in ``allcats.txt`` can be used to filter by genre; they consist -of ``news`` (for WSJ articles) and names of the Brown subcategories -(``fiction``, ``humor``, ``romance``, etc.): - - >>> ptb.categories() # doctest: +SKIP - ['adventure', 'belles_lettres', 'fiction', 'humor', 'lore', 'mystery', 'news', 'romance', 'science_fiction'] - >>> print(ptb.fileids('news')) # doctest: +SKIP - ['WSJ/00/WSJ_0001.MRG', 'WSJ/00/WSJ_0002.MRG', 'WSJ/00/WSJ_0003.MRG', ...] - >>> print(ptb.words(categories=['humor','fiction'])) # doctest: +SKIP - ['Thirty-three', 'Scotty', 'did', 'not', 'go', 'back', ...] - -As PropBank and NomBank depend on the (WSJ portion of the) Penn Treebank, -the modules ``propbank_ptb`` and ``nombank_ptb`` are provided for access -to a full PTB installation. - -Reading the Sinica Treebank: - - >>> from nltk.corpus import sinica_treebank - >>> print(sinica_treebank.sents()) # doctest: +SKIP - [['\xe4\xb8\x80'], ['\xe5\x8f\x8b\xe6\x83\x85'], ...] - >>> sinica_treebank.parsed_sents()[25] # doctest: +SKIP - Tree('S', - [Tree('NP', - [Tree('Nba', ['\xe5\x98\x89\xe7\x8f\x8d'])]), - Tree('V\xe2\x80\xa7\xe5\x9c\xb0', - [Tree('VA11', ['\xe4\xb8\x8d\xe5\x81\x9c']), - Tree('DE', ['\xe7\x9a\x84'])]), - Tree('VA4', ['\xe5\x93\xad\xe6\xb3\xa3'])]) - -Reading the CoNLL 2007 Dependency Treebanks: - - >>> from nltk.corpus import conll2007 - >>> conll2007.sents('esp.train')[0] # doctest: +SKIP - ['El', 'aumento', 'del', 'índice', 'de', 'desempleo', ...] - >>> conll2007.parsed_sents('esp.train')[0] # doctest: +SKIP - - >>> print(conll2007.parsed_sents('esp.train')[0].tree()) # doctest: +SKIP - (fortaleció - (aumento El (del (índice (de (desempleo estadounidense))))) - hoy - considerablemente - (al - (euro - (cotizaba - , - que - (a (15.35 las GMT)) - se - (en (mercado el (de divisas) (de Fráncfort))) - (a 0,9452_dólares) - (frente_a , (0,9349_dólares los (de (mañana esta))))))) - .) - -Word Lists and Lexicons -======================= - -The NLTK data package also includes a number of lexicons and word -lists. These are accessed just like text corpora. The following -examples illustrate the use of the wordlist corpora: - - >>> from nltk.corpus import names, stopwords, words - >>> words.fileids() - ['en', 'en-basic'] - >>> words.words('en') - ['A', 'a', 'aa', 'aal', 'aalii', 'aam', 'Aani', 'aardvark', 'aardwolf', ...] - - >>> stopwords.fileids() # doctest: +SKIP - ['arabic', 'azerbaijani', 'bengali', 'danish', 'dutch', 'english', 'finnish', 'french', ...] - >>> sorted(stopwords.words('portuguese')) - ['a', 'ao', 'aos', 'aquela', 'aquelas', 'aquele', 'aqueles', ...] - >>> names.fileids() - ['female.txt', 'male.txt'] - >>> names.words('male.txt') - ['Aamir', 'Aaron', 'Abbey', 'Abbie', 'Abbot', 'Abbott', ...] - >>> names.words('female.txt') - ['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi', 'Abbie', ...] - -The CMU Pronunciation Dictionary corpus contains pronunciation -transcriptions for over 100,000 words. 
It can be accessed as a list -of entries (where each entry consists of a word, an identifier, and a -transcription) or as a dictionary from words to lists of -transcriptions. Transcriptions are encoded as tuples of phoneme -strings. - - >>> from nltk.corpus import cmudict - >>> print(cmudict.entries()[653:659]) - [('acetate', ['AE1', 'S', 'AH0', 'T', 'EY2', 'T']), - ('acetic', ['AH0', 'S', 'EH1', 'T', 'IH0', 'K']), - ('acetic', ['AH0', 'S', 'IY1', 'T', 'IH0', 'K']), - ('aceto', ['AA0', 'S', 'EH1', 'T', 'OW0']), - ('acetochlor', ['AA0', 'S', 'EH1', 'T', 'OW0', 'K', 'L', 'AO2', 'R']), - ('acetone', ['AE1', 'S', 'AH0', 'T', 'OW2', 'N'])] - >>> # Load the entire cmudict corpus into a Python dictionary: - >>> transcr = cmudict.dict() - >>> print([transcr[w][0] for w in 'Natural Language Tool Kit'.lower().split()]) - [['N', 'AE1', 'CH', 'ER0', 'AH0', 'L'], - ['L', 'AE1', 'NG', 'G', 'W', 'AH0', 'JH'], - ['T', 'UW1', 'L'], - ['K', 'IH1', 'T']] - - -WordNet -======= - -Please see the separate WordNet howto. - -FrameNet -======== - -Please see the separate FrameNet howto. - -PropBank -======== - -Please see the separate PropBank howto. - -SentiWordNet -============ - -Please see the separate SentiWordNet howto. - -Categorized Corpora -=================== - -Several corpora included with NLTK contain documents that have been categorized for -topic, genre, polarity, etc. In addition to the standard corpus interface, these -corpora provide access to the list of categories and the mapping between the documents -and their categories (in both directions). Access the categories using the ``categories()`` -method, e.g.: - - >>> from nltk.corpus import brown, movie_reviews, reuters - >>> brown.categories() - ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', - 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction'] - >>> movie_reviews.categories() - ['neg', 'pos'] - >>> reuters.categories() - ['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', - 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', - 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', ...] - -This method has an optional argument that specifies a document or a list -of documents, allowing us to map from (one or more) documents to (one or more) categories: - - >>> brown.categories('ca01') - ['news'] - >>> brown.categories(['ca01','cb01']) - ['editorial', 'news'] - >>> reuters.categories('training/9865') - ['barley', 'corn', 'grain', 'wheat'] - >>> reuters.categories(['training/9865', 'training/9880']) - ['barley', 'corn', 'grain', 'money-fx', 'wheat'] - -We can go back the other way using the optional argument of the ``fileids()`` method: - - >>> reuters.fileids('barley') - ['test/15618', 'test/15649', 'test/15676', 'test/15728', 'test/15871', ...] - -Both the ``categories()`` and ``fileids()`` methods return a sorted list containing -no duplicates. - -In addition to mapping between categories and documents, these corpora permit -direct access to their contents via the categories. Instead of accessing a subset -of a corpus by specifying one or more fileids, we can identify one or more categories, e.g.: - - >>> brown.tagged_words(categories='news') - [('The', 'AT'), ('Fulton', 'NP-TL'), ...] 
- >>> brown.sents(categories=['editorial','reviews']) - [['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', - 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', - 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', - 'the', 'day', 'it', 'convened', '.'], ...] - -Note that it is an error to specify both documents and categories. - -In the context of a text categorization system, we can easily test if the -category assigned to a document is correct as follows: - - >>> def classify(doc): return 'news' # Trivial classifier - >>> doc = 'ca01' - >>> classify(doc) in brown.categories(doc) - True - - -Other Corpora -============= - -comparative_sentences ---------------------- -A list of sentences from various sources, especially reviews and articles. Each -line contains one sentence; sentences were separated by using a sentence tokenizer. -Comparative sentences have been annotated with their type, entities, features and -keywords. - - >>> from nltk.corpus import comparative_sentences - >>> comparison = comparative_sentences.comparisons()[0] - >>> comparison.text - ['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly', - 'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve", - 'had', '.'] - >>> comparison.entity_2 - 'models' - >>> (comparison.feature, comparison.keyword) - ('rewind', 'more') - >>> len(comparative_sentences.comparisons()) - 853 - -opinion_lexicon ---------------- -A list of positive and negative opinion words or sentiment words for English. - - >>> from nltk.corpus import opinion_lexicon - >>> opinion_lexicon.words()[:4] - ['2-faced', '2-faces', 'abnormal', 'abolish'] - -The OpinionLexiconCorpusReader also provides shortcuts to retrieve positive/negative -words: - - >>> opinion_lexicon.negative()[:4] - ['2-faced', '2-faces', 'abnormal', 'abolish'] - -Note that words from `words()` method in opinion_lexicon are sorted by file id, -not alphabetically: - - >>> opinion_lexicon.words()[0:10] - ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably', - 'abominate', 'abomination', 'abort', 'aborted'] - >>> sorted(opinion_lexicon.words())[0:10] - ['2-faced', '2-faces', 'a+', 'abnormal', 'abolish', 'abominable', 'abominably', - 'abominate', 'abomination', 'abort'] - -ppattach --------- -The Prepositional Phrase Attachment corpus is a corpus of -prepositional phrase attachment decisions. Each instance in the -corpus is encoded as a ``PPAttachment`` object: - - >>> from nltk.corpus import ppattach - >>> ppattach.attachments('training') - [PPAttachment(sent='0', verb='join', noun1='board', - prep='as', noun2='director', attachment='V'), - PPAttachment(sent='1', verb='is', noun1='chairman', - prep='of', noun2='N.V.', attachment='N'), - ...] - >>> inst = ppattach.attachments('training')[0] - >>> (inst.sent, inst.verb, inst.noun1, inst.prep, inst.noun2) - ('0', 'join', 'board', 'as', 'director') - >>> inst.attachment - 'V' - -product_reviews_1 and product_reviews_2 ---------------------------------------- -These two datasets respectively contain annotated customer reviews of 5 and 9 -products from amazon.com. 
- - >>> from nltk.corpus import product_reviews_1 - >>> camera_reviews = product_reviews_1.reviews('Canon_G3.txt') - >>> review = camera_reviews[0] - >>> review.sents()[0] - ['i', 'recently', 'purchased', 'the', 'canon', 'powershot', 'g3', 'and', 'am', - 'extremely', 'satisfied', 'with', 'the', 'purchase', '.'] - >>> review.features() - [('canon powershot g3', '+3'), ('use', '+2'), ('picture', '+2'), - ('picture quality', '+1'), ('picture quality', '+1'), ('camera', '+2'), - ('use', '+2'), ('feature', '+1'), ('picture quality', '+3'), ('use', '+1'), - ('option', '+1')] - -It is also possible to reach the same information directly from the stream: - - >>> product_reviews_1.features('Canon_G3.txt') - [('canon powershot g3', '+3'), ('use', '+2'), ...] - -We can compute stats for specific product features: - - >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture']) - >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture']) - >>> mean = tot / n_reviews - >>> print(n_reviews, tot, mean) - 15 24 1.6 - -pros_cons ---------- -A list of pros/cons sentences for determining context (aspect) dependent -sentiment words, which are then applied to sentiment analysis of comparative -sentences. - - >>> from nltk.corpus import pros_cons - >>> pros_cons.sents(categories='Cons') - [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy', - 'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'], - ...] - >>> pros_cons.words('IntegratedPros.txt') - ['Easy', 'to', 'use', ',', 'economical', '!', ...] - -semcor ------- -The Brown Corpus, annotated with WordNet senses. - - >>> from nltk.corpus import semcor - >>> semcor.words('brown2/tagfiles/br-n12.xml') - ['When', 'several', 'minutes', 'had', 'passed', ...] - -senseval --------- -The Senseval 2 corpus is a word sense disambiguation corpus. Each -item in the corpus corresponds to a single ambiguous word. For each -of these words, the corpus contains a list of instances, corresponding -to occurrences of that word. Each instance provides the word; a list -of word senses that apply to the word occurrence; and the word's -context. - - >>> from nltk.corpus import senseval - >>> senseval.fileids() - ['hard.pos', 'interest.pos', 'line.pos', 'serve.pos'] - >>> senseval.instances('hard.pos') - ... - [SensevalInstance(word='hard-a', - position=20, - context=[('``', '``'), ('he', 'PRP'), ...('hard', 'JJ'), ...], - senses=('HARD1',)), - SensevalInstance(word='hard-a', - position=10, - context=[('clever', 'NNP'), ...('hard', 'JJ'), ('time', 'NN'), ...], - senses=('HARD1',)), ...] - -The following code looks at instances of the word 'interest', and -displays their local context (2 words on each side) and word sense(s): - - >>> for inst in senseval.instances('interest.pos')[:10]: - ... p = inst.position - ... left = ' '.join(w for (w,t) in inst.context[p-2:p]) - ... word = ' '.join(w for (w,t) in inst.context[p:p+1]) - ... right = ' '.join(w for (w,t) in inst.context[p+1:p+3]) - ... senses = ' '.join(inst.senses) - ... print('%20s |%10s | %-15s -> %s' % (left, word, right, senses)) - declines in | interest | rates . -> interest_6 - indicate declining | interest | rates because -> interest_6 - in short-term | interest | rates . -> interest_6 - 4 % | interest | in this -> interest_5 - company with | interests | in the -> interest_5 - , plus | interest | . 
-> interest_6 - set the | interest | rate on -> interest_6 - 's own | interest | , prompted -> interest_4 - principal and | interest | is the -> interest_6 - increase its | interest | to 70 -> interest_5 - -sentence_polarity ------------------ -The Sentence Polarity dataset contains 5331 positive and 5331 negative processed -sentences. - - >>> from nltk.corpus import sentence_polarity - >>> sentence_polarity.sents() - [['simplistic', ',', 'silly', 'and', 'tedious', '.'], ["it's", 'so', 'laddish', - 'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find', - 'it', 'funny', '.'], ...] - >>> sentence_polarity.categories() - ['neg', 'pos'] - >>> sentence_polarity.sents()[1] - ["it's", 'so', 'laddish', 'and', 'juvenile', ',', 'only', 'teenage', 'boys', - 'could', 'possibly', 'find', 'it', 'funny', '.'] - -shakespeare ------------ -The Shakespeare corpus contains a set of Shakespeare plays, formatted -as XML files. These corpora are returned as ElementTree objects: - - >>> from nltk.corpus import shakespeare - >>> from xml.etree import ElementTree - >>> shakespeare.fileids() - ['a_and_c.xml', 'dream.xml', 'hamlet.xml', 'j_caesar.xml', ...] - >>> play = shakespeare.xml('dream.xml') - >>> print(play) - - >>> print('%s: %s' % (play[0].tag, play[0].text)) - TITLE: A Midsummer Night's Dream - >>> personae = [persona.text for persona in - ... play.findall('PERSONAE/PERSONA')] - >>> print(personae) - ['THESEUS, Duke of Athens.', 'EGEUS, father to Hermia.', ...] - >>> # Find and print speakers not listed as personae - >>> names = [persona.split(',')[0] for persona in personae] - >>> speakers = set(speaker.text for speaker in - ... play.findall('*/*/*/SPEAKER')) - >>> print(sorted(speakers.difference(names))) - ['ALL', 'COBWEB', 'DEMETRIUS', 'Fairy', 'HERNIA', 'LYSANDER', - 'Lion', 'MOTH', 'MUSTARDSEED', 'Moonshine', 'PEASEBLOSSOM', - 'Prologue', 'Pyramus', 'Thisbe', 'Wall'] - -subjectivity ------------- -The Subjectivity Dataset contains 5000 subjective and 5000 objective processed -sentences. - - >>> from nltk.corpus import subjectivity - >>> subjectivity.categories() - ['obj', 'subj'] - >>> subjectivity.sents()[23] - ['television', 'made', 'him', 'famous', ',', 'but', 'his', 'biggest', 'hits', - 'happened', 'off', 'screen', '.'] - >>> subjectivity.words(categories='subj') - ['smart', 'and', 'alert', ',', 'thirteen', ...] - -toolbox -------- -The Toolbox corpus distributed with NLTK contains a sample lexicon and -several sample texts from the Rotokas language. The Toolbox corpus -reader returns Toolbox files as XML ElementTree objects. The -following example loads the Rotokas dictionary, and figures out the -distribution of part-of-speech tags for reduplicated words. - -.. doctest: +SKIP - - >>> from nltk.corpus import toolbox - >>> from nltk.probability import FreqDist - >>> from xml.etree import ElementTree - >>> import re - >>> rotokas = toolbox.xml('rotokas.dic') - >>> redup_pos_freqdist = FreqDist() - >>> # Note: we skip over the first record, which is actually - >>> # the header. - >>> for record in rotokas[1:]: - ... lexeme = record.find('lx').text - ... if re.match(r'(.*)\1$', lexeme): - ... redup_pos_freqdist[record.find('ps').text] += 1 - >>> for item, count in redup_pos_freqdist.most_common(): - ... print(item, count) - V 41 - N 14 - ??? 4 - -This example displays some records from a Rotokas text: - -.. doctest: +SKIP - - >>> river = toolbox.xml('rotokas/river.txt', key='ref') - >>> for record in river.findall('record')[:3]: - ... for piece in record: - ... 
if len(piece.text) > 60: - ... print('%-6s %s...' % (piece.tag, piece.text[:57])) - ... else: - ... print('%-6s %s' % (piece.tag, piece.text)) - ref Paragraph 1 - t ``Viapau oisio ra ovaupasi ... - m viapau oisio ra ovau -pa -si ... - g NEG this way/like this and forget -PROG -2/3.DL... - p NEG ??? CONJ V.I -SUFF.V.3 -SUFF.V... - f ``No ken lus tingting wanema samting papa i bin tok,'' Na... - fe ``Don't forget what Dad said,'' yelled Naomi. - ref 2 - t Osa Ira ora Reviti viapau uvupasiva. - m osa Ira ora Reviti viapau uvu -pa -si ... - g as/like name and name NEG hear/smell -PROG -2/3... - p CONJ N.PN CONJ N.PN NEG V.T -SUFF.V.3 -SUF... - f Tasol Ila na David no bin harim toktok. - fe But Ila and David took no notice. - ref 3 - t Ikaupaoro rokosiva ... - m ikau -pa -oro roko -si -va ... - g run/hurry -PROG -SIM go down -2/3.DL.M -RP ... - p V.T -SUFF.V.3 -SUFF.V.4 ADV -SUFF.V.4 -SUFF.VT.... - f Tupela i bin hariap i go long wara . - fe They raced to the river. - -timit ------ -The NLTK data package includes a fragment of the TIMIT -Acoustic-Phonetic Continuous Speech Corpus. This corpus is broken -down into small speech samples, each of which is available as a wave -file, a phonetic transcription, and a tokenized word list. - - >>> from nltk.corpus import timit - >>> print(timit.utteranceids()) - ['dr1-fvmh0/sa1', 'dr1-fvmh0/sa2', 'dr1-fvmh0/si1466', - 'dr1-fvmh0/si2096', 'dr1-fvmh0/si836', 'dr1-fvmh0/sx116', - 'dr1-fvmh0/sx206', 'dr1-fvmh0/sx26', 'dr1-fvmh0/sx296', ...] - - >>> item = timit.utteranceids()[5] - >>> print(timit.phones(item)) - ['h#', 'k', 'l', 'ae', 's', 'pcl', 'p', 'dh', 'ax', - 's', 'kcl', 'k', 'r', 'ux', 'ix', 'nx', 'y', 'ax', - 'l', 'eh', 'f', 'tcl', 't', 'hh', 'ae', 'n', 'dcl', - 'd', 'h#'] - >>> print(timit.words(item)) - ['clasp', 'the', 'screw', 'in', 'your', 'left', 'hand'] - >>> timit.play(item) # doctest: +SKIP - -The corpus reader can combine the word segmentation information with -the phonemes to produce a single tree structure: - - >>> for tree in timit.phone_trees(item): - ... print(tree) - (S - h# - (clasp k l ae s pcl p) - (the dh ax) - (screw s kcl k r ux) - (in ix nx) - (your y ax) - (left l eh f tcl t) - (hand hh ae n dcl d) - h#) - -The start time and stop time of each phoneme, word, and sentence are -also available: - - >>> print(timit.phone_times(item)) - [('h#', 0, 2190), ('k', 2190, 3430), ('l', 3430, 4326), ...] - >>> print(timit.word_times(item)) - [('clasp', 2190, 8804), ('the', 8804, 9734), ...] - >>> print(timit.sent_times(item)) - [('Clasp the screw in your left hand.', 0, 32154)] - -We can use these times to play selected pieces of a speech sample: - - >>> timit.play(item, 2190, 8804) # 'clasp' # doctest: +SKIP - -The corpus reader can also be queried for information about the -speaker and sentence identifier for a given speech sample: - - >>> print(timit.spkrid(item)) - dr1-fvmh0 - >>> print(timit.sentid(item)) - sx116 - >>> print(timit.spkrinfo(timit.spkrid(item))) - SpeakerInfo(id='VMH0', - sex='F', - dr='1', - use='TRN', - recdate='03/11/86', - birthdate='01/08/60', - ht='5\'05"', - race='WHT', - edu='BS', - comments='BEST NEW ENGLAND ACCENT SO FAR') - - >>> # List the speech samples from the same speaker: - >>> timit.utteranceids(spkrid=timit.spkrid(item)) - ['dr1-fvmh0/sa1', 'dr1-fvmh0/sa2', 'dr1-fvmh0/si1466', ...] - -twitter_samples ---------------- - -Twitter is well-known microblog service that allows public data to be -collected via APIs. 
NLTK's twitter corpus currently contains a sample of 20k Tweets -retrieved from the Twitter Streaming API. - - >>> from nltk.corpus import twitter_samples - >>> twitter_samples.fileids() - ['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json'] - -We follow standard practice in storing full Tweets as line-separated -JSON. These data structures can be accessed via `tweets.docs()`. However, in general it -is more practical to focus just on the text field of the Tweets, which -are accessed via the `strings()` method. - - >>> twitter_samples.strings('tweets.20150430-223406.json')[:5] - ['RT @KirkKus: Indirect cost of the UK being in the EU is estimated to be costing Britain \xa3170 billion per year! #BetterOffOut #UKIP', ...] - -The default tokenizer for Tweets is specialised for 'casual' text, and -the `tokenized()` method returns a list of lists of tokens. - - >>> twitter_samples.tokenized('tweets.20150430-223406.json')[:5] - [['RT', '@KirkKus', ':', 'Indirect', 'cost', 'of', 'the', 'UK', 'being', 'in', ...], - ['VIDEO', ':', 'Sturgeon', 'on', 'post-election', 'deals', 'http://t.co/BTJwrpbmOY'], ...] - -rte ---- -The RTE (Recognizing Textual Entailment) corpus was derived from the -RTE1, RTE2 and RTE3 datasets (dev and test data), and consists of a -list of XML-formatted 'text'/'hypothesis' pairs. - - >>> from nltk.corpus import rte - >>> print(rte.fileids()) - ['rte1_dev.xml', 'rte1_test.xml', 'rte2_dev.xml', ..., 'rte3_test.xml'] - >>> rtepairs = rte.pairs(['rte2_test.xml', 'rte3_test.xml']) - >>> print(rtepairs) - [, , , ...] - -In the gold standard test sets, each pair is labeled according to -whether or not the text 'entails' the hypothesis; the -entailment value is mapped to an integer 1 (True) or 0 (False). - - >>> rtepairs[5] - - >>> rtepairs[5].text - 'His wife Strida won a seat in parliament after forging an alliance - with the main anti-Syrian coalition in the recent election.' - >>> rtepairs[5].hyp - 'Strida elected to parliament.' - >>> rtepairs[5].value - 1 - -The RTE corpus also supports an ``xml()`` method which produces ElementTrees. - - >>> xmltree = rte.xml('rte3_dev.xml') - >>> xmltree # doctest: +SKIP - - >>> xmltree[7].findtext('t') - "Mrs. Bush's approval ratings have remained very high, above 80%, - even as her husband's have recently dropped below 50%." - -verbnet -------- -The VerbNet corpus is a lexicon that divides verbs into classes, based -on their syntax-semantics linking behavior. The basic elements in the -lexicon are verb lemmas, such as 'abandon' and 'accept', and verb -classes, which have identifiers such as 'remove-10.1' and -'admire-31.2-1'. These class identifiers consist of a representative -verb selected from the class, followed by a numerical identifier. 
The -list of verb lemmas, and the list of class identifiers, can be -retrieved with the following methods: - - >>> from nltk.corpus import verbnet - >>> verbnet.lemmas()[20:25] - ['accelerate', 'accept', 'acclaim', 'accompany', 'accrue'] - >>> verbnet.classids()[:5] - ['accompany-51.7', 'admire-31.2', 'admire-31.2-1', 'admit-65', 'adopt-93'] - -The `classids()` method may also be used to retrieve the classes that -a given lemma belongs to: - - >>> verbnet.classids('accept') - ['approve-77', 'characterize-29.2-1-1', 'obtain-13.5.2'] - -The `classids()` method may additionally be used to retrieve all classes -within verbnet if nothing is passed: - - >>> verbnet.classids() - ['accompany-51.7', 'admire-31.2', 'admire-31.2-1', 'admit-65', 'adopt-93', 'advise-37.9', 'advise-37.9-1', 'allow-64', 'amalgamate-22.2', 'amalgamate-22.2-1', 'amalgamate-22.2-1-1', 'amalgamate-22.2-2', 'amalgamate-22.2-2-1', 'amalgamate-22.2-3', 'amalgamate-22.2-3-1', 'amalgamate-22.2-3-1-1', 'amalgamate-22.2-3-2', 'amuse-31.1', 'animal_sounds-38', 'appeal-31.4', 'appeal-31.4-1', 'appeal-31.4-2', 'appeal-31.4-3', 'appear-48.1.1', 'appoint-29.1', 'approve-77', 'assessment-34', 'assuming_position-50', 'avoid-52', 'banish-10.2', 'battle-36.4', 'battle-36.4-1', 'begin-55.1', 'begin-55.1-1', 'being_dressed-41.3.3', 'bend-45.2', 'berry-13.7', 'bill-54.5', 'body_internal_motion-49', 'body_internal_states-40.6', 'braid-41.2.2', 'break-45.1', 'breathe-40.1.2', 'breathe-40.1.2-1', 'bring-11.3', 'bring-11.3-1', 'build-26.1', 'build-26.1-1', 'bulge-47.5.3', 'bump-18.4', 'bump-18.4-1', 'butter-9.9', 'calibratable_cos-45.6', 'calibratable_cos-45.6-1', 'calve-28', 'captain-29.8', 'captain-29.8-1', 'captain-29.8-1-1', 'care-88', 'care-88-1', 'carry-11.4', 'carry-11.4-1', 'carry-11.4-1-1', 'carve-21.2', 'carve-21.2-1', 'carve-21.2-2', 'change_bodily_state-40.8.4', 'characterize-29.2', 'characterize-29.2-1', 'characterize-29.2-1-1', 'characterize-29.2-1-2', 'chase-51.6', 'cheat-10.6', 'cheat-10.6-1', 'cheat-10.6-1-1', 'chew-39.2', 'chew-39.2-1', 'chew-39.2-2', 'chit_chat-37.6', 'clear-10.3', 'clear-10.3-1', 'cling-22.5', 'coil-9.6', 'coil-9.6-1', 'coloring-24', 'complain-37.8', 'complete-55.2', 'concealment-16', 'concealment-16-1', 'confess-37.10', 'confine-92', 'confine-92-1', 'conjecture-29.5', 'conjecture-29.5-1', 'conjecture-29.5-2', 'consider-29.9', 'consider-29.9-1', 'consider-29.9-1-1', 'consider-29.9-1-1-1', 'consider-29.9-2', 'conspire-71', 'consume-66', 'consume-66-1', 'contiguous_location-47.8', 'contiguous_location-47.8-1', 'contiguous_location-47.8-2', 'continue-55.3', 'contribute-13.2', 'contribute-13.2-1', 'contribute-13.2-1-1', 'contribute-13.2-1-1-1', 'contribute-13.2-2', 'contribute-13.2-2-1', 'convert-26.6.2', 'convert-26.6.2-1', 'cooking-45.3', 'cooperate-73', 'cooperate-73-1', 'cooperate-73-2', 'cooperate-73-3', 'cope-83', 'cope-83-1', 'cope-83-1-1', 'correlate-86', 'correspond-36.1', 'correspond-36.1-1', 'correspond-36.1-1-1', 'cost-54.2', 'crane-40.3.2', 'create-26.4', 'create-26.4-1', 'curtsey-40.3.3', 'cut-21.1', 'cut-21.1-1', 'debone-10.8', 'declare-29.4', 'declare-29.4-1', 'declare-29.4-1-1', 'declare-29.4-1-1-1', 'declare-29.4-1-1-2', 'declare-29.4-1-1-3', 'declare-29.4-2', 'dedicate-79', 'defend-85', 'destroy-44', 'devour-39.4', 'devour-39.4-1', 'devour-39.4-2', 'differ-23.4', 'dine-39.5', 'disappearance-48.2', 'disassemble-23.3', 'discover-84', 'discover-84-1', 'discover-84-1-1', 'dress-41.1.1', 'dressing_well-41.3.2', 'drive-11.5', 'drive-11.5-1', 'dub-29.3', 'dub-29.3-1', 'eat-39.1', 'eat-39.1-1', 
'eat-39.1-2', 'enforce-63', 'engender-27', 'entity_specific_cos-45.5', 'entity_specific_modes_being-47.2', 'equip-13.4.2', 'equip-13.4.2-1', 'equip-13.4.2-1-1', 'escape-51.1', 'escape-51.1-1', 'escape-51.1-2', 'escape-51.1-2-1', 'exceed-90', 'exchange-13.6', 'exchange-13.6-1', 'exchange-13.6-1-1', 'exhale-40.1.3', 'exhale-40.1.3-1', 'exhale-40.1.3-2', 'exist-47.1', 'exist-47.1-1', 'exist-47.1-1-1', 'feeding-39.7', 'ferret-35.6', 'fill-9.8', 'fill-9.8-1', 'fit-54.3', 'flinch-40.5', 'floss-41.2.1', 'focus-87', 'forbid-67', 'force-59', 'force-59-1', 'free-80', 'free-80-1', 'fulfilling-13.4.1', 'fulfilling-13.4.1-1', 'fulfilling-13.4.1-2', 'funnel-9.3', 'funnel-9.3-1', 'funnel-9.3-2', 'funnel-9.3-2-1', 'future_having-13.3', 'get-13.5.1', 'get-13.5.1-1', 'give-13.1', 'give-13.1-1', 'gobble-39.3', 'gobble-39.3-1', 'gobble-39.3-2', 'gorge-39.6', 'groom-41.1.2', 'grow-26.2', 'help-72', 'help-72-1', 'herd-47.5.2', 'hiccup-40.1.1', 'hit-18.1', 'hit-18.1-1', 'hold-15.1', 'hold-15.1-1', 'hunt-35.1', 'hurt-40.8.3', 'hurt-40.8.3-1', 'hurt-40.8.3-1-1', 'hurt-40.8.3-2', 'illustrate-25.3', 'image_impression-25.1', 'indicate-78', 'indicate-78-1', 'indicate-78-1-1', 'inquire-37.1.2', 'instr_communication-37.4', 'investigate-35.4', 'judgement-33', 'keep-15.2', 'knead-26.5', 'learn-14', 'learn-14-1', 'learn-14-2', 'learn-14-2-1', 'leave-51.2', 'leave-51.2-1', 'lecture-37.11', 'lecture-37.11-1', 'lecture-37.11-1-1', 'lecture-37.11-2', 'light_emission-43.1', 'limit-76', 'linger-53.1', 'linger-53.1-1', 'lodge-46', 'long-32.2', 'long-32.2-1', 'long-32.2-2', 'manner_speaking-37.3', 'marry-36.2', 'marvel-31.3', 'marvel-31.3-1', 'marvel-31.3-2', 'marvel-31.3-3', 'marvel-31.3-4', 'marvel-31.3-5', 'marvel-31.3-6', 'marvel-31.3-7', 'marvel-31.3-8', 'marvel-31.3-9', 'masquerade-29.6', 'masquerade-29.6-1', 'masquerade-29.6-2', 'matter-91', 'meander-47.7', 'meet-36.3', 'meet-36.3-1', 'meet-36.3-2', 'mine-10.9', 'mix-22.1', 'mix-22.1-1', 'mix-22.1-1-1', 'mix-22.1-2', 'mix-22.1-2-1', 'modes_of_being_with_motion-47.3', 'murder-42.1', 'murder-42.1-1', 'neglect-75', 'neglect-75-1', 'neglect-75-1-1', 'neglect-75-2', 'nonvehicle-51.4.2', 'nonverbal_expression-40.2', 'obtain-13.5.2', 'obtain-13.5.2-1', 'occurrence-48.3', 'order-60', 'order-60-1', 'orphan-29.7', 'other_cos-45.4', 'pain-40.8.1', 'pay-68', 'peer-30.3', 'pelt-17.2', 'performance-26.7', 'performance-26.7-1', 'performance-26.7-1-1', 'performance-26.7-2', 'performance-26.7-2-1', 'pit-10.7', 'pocket-9.10', 'pocket-9.10-1', 'poison-42.2', 'poke-19', 'pour-9.5', 'preparing-26.3', 'preparing-26.3-1', 'preparing-26.3-2', 'price-54.4', 'push-12', 'push-12-1', 'push-12-1-1', 'put-9.1', 'put-9.1-1', 'put-9.1-2', 'put_direction-9.4', 'put_spatial-9.2', 'put_spatial-9.2-1', 'reach-51.8', 'reflexive_appearance-48.1.2', 'refrain-69', 'register-54.1', 'rely-70', 'remove-10.1', 'risk-94', 'risk-94-1', 'roll-51.3.1', 'rummage-35.5', 'run-51.3.2', 'rush-53.2', 'say-37.7', 'say-37.7-1', 'say-37.7-1-1', 'say-37.7-2', 'scribble-25.2', 'search-35.2', 'see-30.1', 'see-30.1-1', 'see-30.1-1-1', 'send-11.1', 'send-11.1-1', 'separate-23.1', 'separate-23.1-1', 'separate-23.1-2', 'settle-89', 'shake-22.3', 'shake-22.3-1', 'shake-22.3-1-1', 'shake-22.3-2', 'shake-22.3-2-1', 'sight-30.2', 'simple_dressing-41.3.1', 'slide-11.2', 'slide-11.2-1-1', 'smell_emission-43.3', 'snooze-40.4', 'sound_emission-43.2', 'sound_existence-47.4', 'spank-18.3', 'spatial_configuration-47.6', 'split-23.2', 'spray-9.7', 'spray-9.7-1', 'spray-9.7-1-1', 'spray-9.7-2', 'stalk-35.3', 'steal-10.5', 'stimulus_subject-30.4', 
'stop-55.4', 'stop-55.4-1', 'substance_emission-43.4', 'succeed-74', 'succeed-74-1', 'succeed-74-1-1', 'succeed-74-2', 'suffocate-40.7', 'suspect-81', 'swarm-47.5.1', 'swarm-47.5.1-1', 'swarm-47.5.1-2', 'swarm-47.5.1-2-1', 'swat-18.2', 'talk-37.5', 'tape-22.4', 'tape-22.4-1', 'tell-37.2', 'throw-17.1', 'throw-17.1-1', 'throw-17.1-1-1', 'tingle-40.8.2', 'touch-20', 'touch-20-1', 'transcribe-25.4', 'transfer_mesg-37.1.1', 'transfer_mesg-37.1.1-1', 'transfer_mesg-37.1.1-1-1', 'try-61', 'turn-26.6.1', 'turn-26.6.1-1', 'urge-58', 'vehicle-51.4.1', 'vehicle-51.4.1-1', 'waltz-51.5', 'want-32.1', 'want-32.1-1', 'want-32.1-1-1', 'weather-57', 'weekend-56', 'wink-40.3.1', 'wink-40.3.1-1', 'wipe_instr-10.4.2', 'wipe_instr-10.4.2-1', 'wipe_manner-10.4.1', 'wipe_manner-10.4.1-1', 'wish-62', 'withdraw-82', 'withdraw-82-1', 'withdraw-82-2', 'withdraw-82-3'] - -The primary object in the lexicon is a class record, which is stored -as an ElementTree xml object. The class record for a given class -identifier is returned by the `vnclass()` method: - - >>> verbnet.vnclass('remove-10.1') - - -The `vnclass()` method also accepts "short" identifiers, such as '10.1': - - >>> verbnet.vnclass('10.1') - - -See the Verbnet documentation, or the Verbnet files, for information -about the structure of this xml. As an example, we can retrieve a -list of thematic roles for a given Verbnet class: - - >>> vn_31_2 = verbnet.vnclass('admire-31.2') - >>> for themrole in vn_31_2.findall('THEMROLES/THEMROLE'): - ... print(themrole.attrib['type'], end=' ') - ... for selrestr in themrole.findall('SELRESTRS/SELRESTR'): - ... print('[%(Value)s%(type)s]' % selrestr.attrib, end=' ') - ... print() - Theme - Experiencer [+animate] - Predicate - -The Verbnet corpus also provides a variety of pretty printing -functions that can be used to display the xml contents in a more -concise form. The simplest such method is `pprint()`: - - >>> print(verbnet.pprint('57')) - weather-57 - Subclasses: (none) - Members: blow clear drizzle fog freeze gust hail howl lightning mist - mizzle pelt pour precipitate rain roar shower sleet snow spit spot - sprinkle storm swelter teem thaw thunder - Thematic roles: - * Theme[+concrete +force] - Frames: - Intransitive (Expletive Subject) - Example: It's raining. - Syntax: LEX[it] LEX[[+be]] VERB - Semantics: - * weather(during(E), Weather_type, ?Theme) - NP (Expletive Subject, Theme Object) - Example: It's raining cats and dogs. - Syntax: LEX[it] LEX[[+be]] VERB NP[Theme] - Semantics: - * weather(during(E), Weather_type, Theme) - PP (Expletive Subject, Theme-PP) - Example: It was pelting with rain. - Syntax: LEX[it[+be]] VERB PREP[with] NP[Theme] - Semantics: - * weather(during(E), Weather_type, Theme) - -Verbnet gives us frames that link the syntax and semantics using an example. -These frames are part of the corpus and we can use `frames()` to get a frame -for a given verbnet class. 
- - >>> frame = verbnet.frames('57') - >>> frame == [{'example': "It's raining.", 'description': {'primary': 'Intransitive', 'secondary': 'Expletive Subject'}, 'syntax': [{'pos_tag': 'LEX', 'modifiers': {'value': 'it', 'selrestrs': [], 'synrestrs': []}}, {'pos_tag': 'LEX', 'modifiers': {'value': '[+be]', 'selrestrs': [], 'synrestrs': []}}, {'pos_tag': 'VERB', 'modifiers': {'value': '', 'selrestrs': [], 'synrestrs': []}}], 'semantics': [{'predicate_value': 'weather', 'arguments': [{'type': 'Event', 'value': 'during(E)'}, {'type': 'VerbSpecific', 'value': 'Weather_type'}, {'type': 'ThemRole', 'value': '?Theme'}], 'negated': False}]}, {'example': "It's raining cats and dogs.", 'description': {'primary': 'NP', 'secondary': 'Expletive Subject, Theme Object'}, 'syntax': [{'pos_tag': 'LEX', 'modifiers': {'value': 'it', 'selrestrs': [], 'synrestrs': []}}, {'pos_tag': 'LEX', 'modifiers': {'value': '[+be]', 'selrestrs': [], 'synrestrs': []}}, {'pos_tag': 'VERB', 'modifiers': {'value': '', 'selrestrs': [], 'synrestrs': []}}, {'pos_tag': 'NP', 'modifiers': {'value': 'Theme', 'selrestrs': [], 'synrestrs': []}}], 'semantics': [{'predicate_value': 'weather', 'arguments': [{'type': 'Event', 'value': 'during(E)'}, {'type': 'VerbSpecific', 'value': 'Weather_type'}, {'type': 'ThemRole', 'value': 'Theme'}], 'negated': False}]}, {'example': 'It was pelting with rain.', 'description': {'primary': 'PP', 'secondary': 'Expletive Subject, Theme-PP'}, 'syntax': [{'pos_tag': 'LEX', 'modifiers': {'value': 'it[+be]', 'selrestrs': [], 'synrestrs': []}}, {'pos_tag': 'VERB', 'modifiers': {'value': '', 'selrestrs': [], 'synrestrs': []}}, {'pos_tag': 'PREP', 'modifiers': {'value': 'with', 'selrestrs': [], 'synrestrs': []}}, {'pos_tag': 'NP', 'modifiers': {'value': 'Theme', 'selrestrs': [], 'synrestrs': []}}], 'semantics': [{'predicate_value': 'weather', 'arguments': [{'type': 'Event', 'value': 'during(E)'}, {'type': 'VerbSpecific', 'value': 'Weather_type'}, {'type': 'ThemRole', 'value': 'Theme'}], 'negated': False}]}] - True - -Verbnet corpus lets us access thematic roles individually using `themroles()`. - - >>> themroles = verbnet.themroles('57') - >>> themroles == [{'modifiers': [{'type': 'concrete', 'value': '+'}, {'type': 'force', 'value': '+'}], 'type': 'Theme'}] - True - -Verbnet classes may also have subclasses sharing similar syntactic and semantic properties -while having differences with the superclass. The Verbnet corpus allows us to access these -subclasses using `subclasses()`. - - >>> print(verbnet.subclasses('9.1')) #Testing for 9.1 since '57' does not have subclasses - ['put-9.1-1', 'put-9.1-2'] - - -nps_chat --------- - -The NPS Chat Corpus, Release 1.0 consists of over 10,000 posts in age-specific -chat rooms, which have been anonymized, POS-tagged and dialogue-act tagged. - - >>> print(nltk.corpus.nps_chat.words()) - ['now', 'im', 'left', 'with', 'this', 'gay', ...] - >>> print(nltk.corpus.nps_chat.tagged_words()) - [('now', 'RB'), ('im', 'PRP'), ('left', 'VBD'), ...] - >>> print(nltk.corpus.nps_chat.tagged_posts()) - [[('now', 'RB'), ('im', 'PRP'), ('left', 'VBD'), ('with', 'IN'), - ('this', 'DT'), ('gay', 'JJ'), ('name', 'NN')], [(':P', 'UH')], ...] - -We can access the XML elements corresponding to individual posts. These elements -have ``class`` and ``user`` attributes that we can access using ``p.attrib['class']`` -and ``p.attrib['user']``. They also have text content, accessed using ``p.text``. - - >>> print(nltk.corpus.nps_chat.xml_posts()) - [, , ...] 
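Each post element also carries the ``user`` attribute mentioned above, so,
for example, the number of distinct (anonymised) user names in the corpus can
be counted directly from the XML (a minimal sketch; the result is elided
because it depends on the corpus release):

    >>> len({p.attrib['user'] for p in nltk.corpus.nps_chat.xml_posts()})  # doctest: +SKIP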
- >>> posts = nltk.corpus.nps_chat.xml_posts() - >>> sorted(nltk.FreqDist(p.attrib['class'] for p in posts).keys()) - ['Accept', 'Bye', 'Clarify', 'Continuer', 'Emotion', 'Emphasis', - 'Greet', 'Other', 'Reject', 'Statement', 'System', 'nAnswer', - 'whQuestion', 'yAnswer', 'ynQuestion'] - >>> posts[0].text - 'now im left with this gay name' - -In addition to the above methods for accessing tagged text, we can navigate -the XML structure directly, as follows: - - >>> tokens = posts[0].findall('terminals/t') - >>> [t.attrib['pos'] + "/" + t.attrib['word'] for t in tokens] - ['RB/now', 'PRP/im', 'VBD/left', 'IN/with', 'DT/this', 'JJ/gay', 'NN/name'] - -multext_east ------------- - -The Multext-East Corpus consists of POS-tagged versions of George Orwell's book -1984 in 12 languages: English, Czech, Hungarian, Macedonian, Slovenian, Serbian, -Slovak, Romanian, Estonian, Farsi, Bulgarian and Polish. -The corpus can be accessed using the usual methods for tagged corpora. The tagset -can be transformed from the Multext-East specific MSD tags to the Universal tagset -using the "tagset" parameter of all functions returning tagged parts of the corpus. - - >>> print(nltk.corpus.multext_east.words("oana-en.xml")) - ['It', 'was', 'a', 'bright', ...] - >>> print(nltk.corpus.multext_east.tagged_words("oana-en.xml")) - [('It', '#Pp3ns'), ('was', '#Vmis3s'), ('a', '#Di'), ...] - >>> print(nltk.corpus.multext_east.tagged_sents("oana-en.xml", "universal")) - [[('It', 'PRON'), ('was', 'VERB'), ('a', 'DET'), ...] - - - ---------------------- -Corpus Reader Classes ---------------------- - -NLTK's *corpus reader* classes are used to access the contents of a -diverse set of corpora. Each corpus reader class is specialized to -handle a specific corpus format. Examples include the -`PlaintextCorpusReader`, which handles corpora that consist of a set -of unannotated text files, and the `BracketParseCorpusReader`, which -handles corpora that consist of files containing -parenthesis-delineated parse trees. - -Automatically Created Corpus Reader Instances -============================================= - -When the `nltk.corpus` module is imported, it automatically creates a -set of corpus reader instances that can be used to access the corpora -in the NLTK data distribution. Here is a small sample of those -corpus reader instances: - - >>> import nltk - >>> nltk.corpus.brown - - >>> nltk.corpus.treebank - - >>> nltk.corpus.names - - >>> nltk.corpus.genesis - - >>> nltk.corpus.inaugural - - -This sample illustrates that different corpus reader classes are used -to read different corpora; but that the same corpus reader class may -be used for more than one corpus (e.g., ``genesis`` and ``inaugural``). - -Creating New Corpus Reader Instances -==================================== - -Although the `nltk.corpus` module automatically creates corpus reader -instances for the corpora in the NLTK data distribution, you may -sometimes need to create your own corpus reader. In particular, you -would need to create your own corpus reader if you want... - -- To access a corpus that is not included in the NLTK data - distribution. - -- To access a full copy of a corpus for which the NLTK data - distribution only provides a sample. - -- To access a corpus using a customized corpus reader (e.g., with - a customized tokenizer). - -To create a new corpus reader, you will first need to look up the -signature for that corpus reader's constructor. 
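One quick way to do this from an interactive session is Python's ``inspect``
module (a minimal sketch; the exact parameter list differs between reader
classes and NLTK versions, so the output is omitted here):

    >>> import inspect
    >>> from nltk.corpus.reader import PlaintextCorpusReader
    >>> print(inspect.signature(PlaintextCorpusReader.__init__))  # doctest: +SKIP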
Different corpus -readers have different constructor signatures, but most of the -constructor signatures have the basic form:: - - SomeCorpusReader(root, files, ...options...) - -Where ``root`` is an absolute path to the directory containing the -corpus data files; ``files`` is either a list of file names (relative -to ``root``) or a regexp specifying which files should be included; -and ``options`` are additional reader-specific options. For example, -we can create a customized corpus reader for the genesis corpus that -uses a different sentence tokenizer as follows: - - >>> # Find the directory where the corpus lives. - >>> genesis_dir = nltk.data.find('corpora/genesis') - >>> # Create our custom sentence tokenizer. - >>> my_sent_tokenizer = nltk.RegexpTokenizer('[^.!?]+') - >>> # Create the new corpus reader object. - >>> my_genesis = nltk.corpus.PlaintextCorpusReader( - ... genesis_dir, r'.*\.txt', sent_tokenizer=my_sent_tokenizer) - >>> # Use the new corpus reader object. - >>> print(my_genesis.sents('english-kjv.txt')[0]) - ['In', 'the', 'beginning', 'God', 'created', 'the', 'heaven', - 'and', 'the', 'earth'] - -If you wish to read your own plaintext corpus, which is stored in the -directory '/usr/share/some-corpus', then you can create a corpus -reader for it with:: - - >>> my_corpus = nltk.corpus.PlaintextCorpusReader( - ... '/usr/share/some-corpus', r'.*\.txt') # doctest: +SKIP - -For a complete list of corpus reader subclasses, see the API -documentation for `nltk.corpus.reader`. - -Corpus Types -============ - -Corpora vary widely in the types of content they include. This is -reflected in the fact that the base class `CorpusReader` only defines -a few general-purpose methods for listing and accessing the files that -make up a corpus. It is up to the subclasses to define *data access -methods* that provide access to the information in the corpus. -However, corpus reader subclasses should be consistent in their -definitions of these data access methods wherever possible. - -At a high level, corpora can be divided into three basic types: - -- A *token corpus* contains information about specific occurrences of - language use (or linguistic tokens), such as dialogues or written - texts. Examples of token corpora are collections of written text - and collections of speech. - -- A *type corpus*, or *lexicon*, contains information about a coherent - set of lexical items (or linguistic types). Examples of lexicons - are dictionaries and word lists. - -- A *language description corpus* contains information about a set of - non-lexical linguistic constructs, such as grammar rules. - -However, many individual corpora blur the distinctions between these -types. For example, corpora that are primarily lexicons may include -token data in the form of example sentences; and corpora that are -primarily token corpora may be accompanied by one or more word lists -or other lexical data sets. - -Because corpora vary so widely in their information content, we have -decided that it would not be wise to use separate corpus reader base -classes for different corpus types. Instead, we simply try to make -the corpus readers consistent wherever possible, but let them differ -where the underlying data itself differs. - -Common Corpus Reader Methods -============================ - -As mentioned above, there are only a handful of methods that all -corpus readers are guaranteed to implement. These methods provide -access to the files that contain the corpus data. 
Every corpus is -assumed to consist of one or more files, all located in a common root -directory (or in subdirectories of that root directory). The absolute -path to the root directory is stored in the ``root`` property: - - >>> import os - >>> str(nltk.corpus.genesis.root).replace(os.path.sep,'/') - '.../nltk_data/corpora/genesis' - -Each file within the corpus is identified by a platform-independent -identifier, which is basically a path string that uses ``/`` as the -path separator. I.e., this identifier can be converted to a relative -path as follows: - - >>> some_corpus_file_id = nltk.corpus.reuters.fileids()[0] - >>> import os.path - >>> os.path.normpath(some_corpus_file_id).replace(os.path.sep,'/') - 'test/14826' - -To get a list of all data files that make up a corpus, use the -``fileids()`` method. In some corpora, these files will not all contain -the same type of data; for example, for the ``nltk.corpus.timit`` -corpus, ``fileids()`` will return a list including text files, word -segmentation files, phonetic transcription files, sound files, and -metadata files. For corpora with diverse file types, the ``fileids()`` -method will often take one or more optional arguments, which can be -used to get a list of the files with a specific file type: - - >>> nltk.corpus.timit.fileids() - ['dr1-fvmh0/sa1.phn', 'dr1-fvmh0/sa1.txt', 'dr1-fvmh0/sa1.wav', ...] - >>> nltk.corpus.timit.fileids('phn') - ['dr1-fvmh0/sa1.phn', 'dr1-fvmh0/sa2.phn', 'dr1-fvmh0/si1466.phn', ...] - -In some corpora, the files are divided into distinct categories. For -these corpora, the ``fileids()`` method takes an optional argument, -which can be used to get a list of the files within a specific category: - - >>> nltk.corpus.brown.fileids('hobbies') - ['ce01', 'ce02', 'ce03', 'ce04', 'ce05', 'ce06', 'ce07', ...] - -The ``abspath()`` method can be used to find the absolute path to a -corpus file, given its file identifier: - - >>> str(nltk.corpus.brown.abspath('ce06')).replace(os.path.sep,'/') - '.../corpora/brown/ce06' - -The ``abspaths()`` method can be used to find the absolute paths for -one corpus file, a list of corpus files, or (if no fileids are specified), -all corpus files. - -This method is mainly useful as a helper method when defining corpus -data access methods, since data access methods can usually be called -with a string argument (to get a view for a specific file), with a -list argument (to get a view for a specific list of files), or with no -argument (to get a view for the whole corpus). - -Data Access Methods -=================== - -Individual corpus reader subclasses typically extend this basic set of -file-access methods with one or more *data access methods*, which provide -easy access to the data contained in the corpus. The signatures for -data access methods often have the basic form:: - - corpus_reader.some_data access(fileids=None, ...options...) - -Where ``fileids`` can be a single file identifier string (to get a view -for a specific file); a list of file identifier strings (to get a view -for a specific list of files); or None (to get a view for the entire -corpus). 
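For example, the three calling conventions look like this for the Brown
corpus (a minimal sketch, assuming the corpus is installed):

    >>> nltk.corpus.brown.words('ca01')            # a single file
    ['The', 'Fulton', 'County', 'Grand', 'Jury', ...]
    >>> nltk.corpus.brown.words(['ca01', 'ca02'])  # a list of files
    ['The', 'Fulton', 'County', 'Grand', 'Jury', ...]
    >>> nltk.corpus.brown.words()                  # the entire corpus
    ['The', 'Fulton', 'County', 'Grand', 'Jury', ...]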
Some of the common data access methods, and their return -types, are: - - - I{corpus}.words(): list of str - - I{corpus}.sents(): list of (list of str) - - I{corpus}.paras(): list of (list of (list of str)) - - I{corpus}.tagged_words(): list of (str,str) tuple - - I{corpus}.tagged_sents(): list of (list of (str,str)) - - I{corpus}.tagged_paras(): list of (list of (list of (str,str))) - - I{corpus}.chunked_sents(): list of (Tree w/ (str,str) leaves) - - I{corpus}.parsed_sents(): list of (Tree with str leaves) - - I{corpus}.parsed_paras(): list of (list of (Tree with str leaves)) - - I{corpus}.xml(): A single xml ElementTree - - I{corpus}.raw(): str (unprocessed corpus contents) - -For example, the `words()` method is supported by many different -corpora, and returns a flat list of word strings: - - >>> nltk.corpus.brown.words() - ['The', 'Fulton', 'County', 'Grand', 'Jury', ...] - >>> nltk.corpus.treebank.words() - ['Pierre', 'Vinken', ',', '61', 'years', 'old', ...] - >>> nltk.corpus.conll2002.words() - ['Sao', 'Paulo', '(', 'Brasil', ')', ',', '23', ...] - >>> nltk.corpus.genesis.words() - ['In', 'the', 'beginning', 'God', 'created', ...] - -On the other hand, the `tagged_words()` method is only supported by -corpora that include part-of-speech annotations: - - >>> nltk.corpus.brown.tagged_words() - [('The', 'AT'), ('Fulton', 'NP-TL'), ...] - >>> nltk.corpus.treebank.tagged_words() - [('Pierre', 'NNP'), ('Vinken', 'NNP'), ...] - >>> nltk.corpus.conll2002.tagged_words() - [('Sao', 'NC'), ('Paulo', 'VMI'), ('(', 'Fpa'), ...] - >>> nltk.corpus.genesis.tagged_words() - Traceback (most recent call last): - ... - AttributeError: 'PlaintextCorpusReader' object has no attribute 'tagged_words' - -Although most corpus readers use file identifiers to index their -content, some corpora use different identifiers instead. For example, -the data access methods for the ``timit`` corpus uses *utterance -identifiers* to select which corpus items should be returned: - - >>> nltk.corpus.timit.utteranceids() - ['dr1-fvmh0/sa1', 'dr1-fvmh0/sa2', 'dr1-fvmh0/si1466', ...] - >>> nltk.corpus.timit.words('dr1-fvmh0/sa2') - ["don't", 'ask', 'me', 'to', 'carry', 'an', 'oily', 'rag', 'like', 'that'] - -Attempting to call ``timit``\ 's data access methods with a file -identifier will result in an exception: - - >>> nltk.corpus.timit.fileids() - ['dr1-fvmh0/sa1.phn', 'dr1-fvmh0/sa1.txt', 'dr1-fvmh0/sa1.wav', ...] - >>> nltk.corpus.timit.words('dr1-fvmh0/sa1.txt') # doctest: +SKIP - Traceback (most recent call last): - ... - IOError: No such file or directory: '.../dr1-fvmh0/sa1.txt.wrd' - -As another example, the ``propbank`` corpus defines the ``roleset()`` -method, which expects a roleset identifier, not a file identifier: - - >>> roleset = nltk.corpus.propbank.roleset('eat.01') - >>> from xml.etree import ElementTree as ET - >>> print(ET.tostring(roleset).decode('utf8')) - - - ...... - ... - ... - -Stream Backed Corpus Views -========================== -An important feature of NLTK's corpus readers is that many of them -access the underlying data files using "corpus views." A *corpus -view* is an object that acts like a simple data structure (such as a -list), but does not store the data elements in memory; instead, data -elements are read from the underlying data files on an as-needed -basis. - -By only loading items from the file on an as-needed basis, corpus -views maintain both memory efficiency and responsiveness. 
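This laziness is easy to observe (a minimal sketch, assuming the Brown corpus
is installed): constructing the view is cheap, indexing reads only as many
blocks as are needed to reach the requested token, and ``len()`` typically
has to scan the rest of the data, which makes it the most expensive of the
three operations.

    >>> lazy_words = nltk.corpus.brown.words()
    >>> lazy_words[100000]  # doctest: +SKIP
    >>> len(lazy_words)     # doctest: +SKIP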
The memory -efficiency of corpus readers is important because some corpora contain -very large amounts of data, and storing the entire data set in memory -could overwhelm many machines. The responsiveness is important when -experimenting with corpora in interactive sessions and in in-class -demonstrations. - -The most common corpus view is the `StreamBackedCorpusView`, which -acts as a read-only list of tokens. Two additional corpus view -classes, `ConcatenatedCorpusView` and `LazySubsequence`, make it -possible to create concatenations and take slices of -`StreamBackedCorpusView` objects without actually storing the -resulting list-like object's elements in memory. - -In the future, we may add additional corpus views that act like other -basic data structures, such as dictionaries. - -Writing New Corpus Readers -========================== - -In order to add support for new corpus formats, it is necessary to -define new corpus reader classes. For many corpus formats, writing -new corpus readers is relatively straight-forward. In this section, -we'll describe what's involved in creating a new corpus reader. If -you do create a new corpus reader, we encourage you to contribute it -back to the NLTK project. - -Don't Reinvent the Wheel ------------------------- -Before you start writing a new corpus reader, you should check to be -sure that the desired format can't be read using an existing corpus -reader with appropriate constructor arguments. For example, although -the `TaggedCorpusReader` assumes that words and tags are separated by -``/`` characters by default, an alternative tag-separation character -can be specified via the ``sep`` constructor argument. You should -also check whether the new corpus format can be handled by subclassing -an existing corpus reader, and tweaking a few methods or variables. - -Design ------- -If you decide to write a new corpus reader from scratch, then you -should first decide which data access methods you want the reader to -provide, and what their signatures should be. You should look at -existing corpus readers that process corpora with similar data -contents, and try to be consistent with those corpus readers whenever -possible. - -You should also consider what sets of identifiers are appropriate for -the corpus format. Where it's practical, file identifiers should be -used. However, for some corpora, it may make sense to use additional -sets of identifiers. Each set of identifiers should have a distinct -name (e.g., fileids, utteranceids, rolesets); and you should be consistent -in using that name to refer to that identifier. Do not use parameter -names like ``id``, which leave it unclear what type of identifier is -required. - -Once you've decided what data access methods and identifiers are -appropriate for your corpus, you should decide if there are any -customizable parameters that you'd like the corpus reader to handle. -These parameters make it possible to use a single corpus reader to -handle a wider variety of corpora. The ``sep`` argument for -`TaggedCorpusReader`, mentioned above, is an example of a customizable -corpus reader parameter. - -Implementation --------------- - -Constructor -~~~~~~~~~~~ -If your corpus reader implements any customizable parameters, then -you'll need to override the constructor. Typically, the new -constructor will first call its base class's constructor, and then -store the customizable parameters. 
For example, the -`ConllChunkCorpusReader`\ 's constructor is defined as follows: - - >>> def __init__(self, root, fileids, chunk_types, encoding='utf8', - ... tagset=None, separator=None): - ... ConllCorpusReader.__init__( - ... self, root, fileids, ('words', 'pos', 'chunk'), - ... chunk_types=chunk_types, encoding=encoding, - ... tagset=tagset, separator=separator) - -If your corpus reader does not implement any customization parameters, -then you can often just inherit the base class's constructor. - -Data Access Methods -~~~~~~~~~~~~~~~~~~~ - -The most common type of data access method takes an argument -identifying which files to access, and returns a view covering those -files. This argument may be a single file identifier string (to get a -view for a specific file); a list of file identifier strings (to get a -view for a specific list of files); or None (to get a view for the -entire corpus). The method's implementation converts this argument to -a list of path names using the `abspaths()` method, which handles all -three value types (string, list, and None): - - >>> print(str(nltk.corpus.brown.abspaths()).replace('\\\\','/')) - [FileSystemPathPointer('.../corpora/brown/ca01'), - FileSystemPathPointer('.../corpora/brown/ca02'), ...] - >>> print(str(nltk.corpus.brown.abspaths('ce06')).replace('\\\\','/')) - [FileSystemPathPointer('.../corpora/brown/ce06')] - >>> print(str(nltk.corpus.brown.abspaths(['ce06', 'ce07'])).replace('\\\\','/')) - [FileSystemPathPointer('.../corpora/brown/ce06'), - FileSystemPathPointer('.../corpora/brown/ce07')] - -An example of this type of method is the `words()` method, defined by -the `PlaintextCorpusReader` as follows: - - >>> def words(self, fileids=None): - ... return concat([self.CorpusView(fileid, self._read_word_block) - ... for fileid in self.abspaths(fileids)]) - -This method first uses `abspaths()` to convert ``fileids`` to a list of -absolute paths. It then creates a corpus view for each file, using -the `PlaintextCorpusReader._read_word_block()` method to read elements -from the data file (see the discussion of corpus views below). -Finally, it combines these corpus views using the -`nltk.corpus.reader.util.concat()` function. - -When writing a corpus reader for a corpus that is never expected to be -very large, it can sometimes be appropriate to read the files -directly, rather than using a corpus view. For example, the -`WordListCorpusView` class defines its `words()` method as follows: - - >>> def words(self, fileids=None): - ... return concat([[w for w in open(fileid).read().split('\n') if w] - ... for fileid in self.abspaths(fileids)]) - -(This is usually more appropriate for lexicons than for token corpora.) - -If the type of data returned by a data access method is one for which -NLTK has a conventional representation (e.g., words, tagged words, and -parse trees), then you should use that representation. Otherwise, you -may find it necessary to define your own representation. For data -structures that are relatively corpus-specific, it's usually best to -define new classes for these elements. For example, the ``propbank`` -corpus defines the `PropbankInstance` class to store the semantic role -labeling instances described by the corpus; and the ``ppattach`` -corpus defines the `PPAttachment` class to store the prepositional -attachment instances described by the corpus. - -Corpus Views -~~~~~~~~~~~~ -.. (Much of the content for this section is taken from the - StreamBackedCorpusView docstring.) 
- -The heart of a `StreamBackedCorpusView` is its *block reader* -function, which reads zero or more tokens from a stream, and returns -them as a list. A very simple example of a block reader is: - - >>> def simple_block_reader(stream): - ... return stream.readline().split() - -This simple block reader reads a single line at a time, and returns a -single token (consisting of a string) for each whitespace-separated -substring on the line. A `StreamBackedCorpusView` built from this -block reader will act like a read-only list of all the -whitespace-separated tokens in an underlying file. - -When deciding how to define the block reader for a given corpus, -careful consideration should be given to the size of blocks handled by -the block reader. Smaller block sizes will increase the memory -requirements of the corpus view's internal data structures (by 2 -integers per block). On the other hand, larger block sizes may -decrease performance for random access to the corpus. (But note that -larger block sizes will *not* decrease performance for iteration.) - -Internally, the `StreamBackedCorpusView` class maintains a partial -mapping from token index to file position, with one entry per block. -When a token with a given index *i* is requested, the corpus view -constructs it as follows: - -1. First, it searches the toknum/filepos mapping for the token index - closest to (but less than or equal to) *i*. - -2. Then, starting at the file position corresponding to that index, it - reads one block at a time using the block reader until it reaches - the requested token. - -The toknum/filepos mapping is created lazily: it is initially empty, -but every time a new block is read, the block's initial token is added -to the mapping. (Thus, the toknum/filepos map has one entry per -block.) - -You can create your own corpus view in one of two ways: - -1. Call the `StreamBackedCorpusView` constructor, and provide your - block reader function via the ``block_reader`` argument. - -2. Subclass `StreamBackedCorpusView`, and override the - `read_block()` method. - -The first option is usually easier, but the second option can allow -you to write a single `read_block` method whose behavior can be -customized by different parameters to the subclass's constructor. For -an example of this design pattern, see the `TaggedCorpusView` class, -which is used by `TaggedCorpusView`. - ----------------- -Regression Tests ----------------- - -The following helper functions are used to create and then delete -testing corpora that are stored in temporary directories. These -testing corpora are used to make sure the readers work correctly. - - >>> import tempfile, os.path, textwrap - >>> def make_testcorpus(ext='', **fileids): - ... root = tempfile.mkdtemp() - ... for fileid, contents in fileids.items(): - ... fileid += ext - ... f = open(os.path.join(root, fileid), 'w') - ... f.write(textwrap.dedent(contents)) - ... f.close() - ... return root - >>> def del_testcorpus(root): - ... for fileid in os.listdir(root): - ... os.remove(os.path.join(root, fileid)) - ... os.rmdir(root) - -Plaintext Corpus Reader -======================= -The plaintext corpus reader is used to access corpora that consist of -unprocessed plaintext data. It assumes that paragraph breaks are -indicated by blank lines. Sentences and words can be tokenized using -the default tokenizers, or by custom tokenizers specified as -parameters to the constructor. - - >>> root = make_testcorpus(ext='.txt', - ... a="""\ - ... This is the first sentence. Here is another - ... 
sentence! And here's a third sentence. - ... - ... This is the second paragraph. Tokenization is currently - ... fairly simple, so the period in Mr. gets tokenized. - ... """, - ... b="""This is the second file.""") - - >>> from nltk.corpus.reader.plaintext import PlaintextCorpusReader - -The list of documents can be specified explicitly, or implicitly (using a -regexp). The ``ext`` argument specifies a file extension. - - >>> corpus = PlaintextCorpusReader(root, ['a.txt', 'b.txt']) - >>> corpus.fileids() - ['a.txt', 'b.txt'] - >>> corpus = PlaintextCorpusReader(root, r'.*\.txt') - >>> corpus.fileids() - ['a.txt', 'b.txt'] - -The directory containing the corpus is corpus.root: - - >>> str(corpus.root) == str(root) - True - -We can get a list of words, or the raw string: - - >>> corpus.words() - ['This', 'is', 'the', 'first', 'sentence', '.', ...] - >>> corpus.raw()[:40] - 'This is the first sentence. Here is ano' - -Check that reading individual documents works, and reading all documents at -once works: - - >>> len(corpus.words()), [len(corpus.words(d)) for d in corpus.fileids()] - (46, [40, 6]) - >>> corpus.words('a.txt') - ['This', 'is', 'the', 'first', 'sentence', '.', ...] - >>> corpus.words('b.txt') - ['This', 'is', 'the', 'second', 'file', '.'] - >>> corpus.words()[:4], corpus.words()[-4:] - (['This', 'is', 'the', 'first'], ['the', 'second', 'file', '.']) - -We're done with the test corpus: - - >>> del_testcorpus(root) - -Test the plaintext corpora that come with nltk: - - >>> from nltk.corpus import abc, genesis, inaugural - >>> from nltk.corpus import state_union, webtext - >>> for corpus in (abc, genesis, inaugural, state_union, - ... webtext): - ... print(str(corpus).replace('\\\\','/')) - ... print(' ', repr(corpus.fileids())[:60]) - ... print(' ', repr(corpus.words()[:10])[:60]) - - ['rural.txt', 'science.txt'] - ['PM', 'denies', 'knowledge', 'of', 'AWB', ... - - ['english-kjv.txt', 'english-web.txt', 'finnish.txt', ... - ['In', 'the', 'beginning', 'God', 'created', 'the', ... - - ['1789-Washington.txt', '1793-Washington.txt', ... - ['Fellow', '-', 'Citizens', 'of', 'the', 'Senate', ... - - ['1945-Truman.txt', '1946-Truman.txt', ... - ['PRESIDENT', 'HARRY', 'S', '.', 'TRUMAN', "'", ... - - ['firefox.txt', 'grail.txt', 'overheard.txt', ... - ['Cookie', 'Manager', ':', '"', 'Don', "'", 't', ... - - -Tagged Corpus Reader -==================== -The Tagged Corpus reader can give us words, sentences, and paragraphs, -each tagged or untagged. All of the read methods can take one item -(in which case they return the contents of that file) or a list of -documents (in which case they concatenate the contents of those files). -By default, they apply to all documents in the corpus. - - >>> root = make_testcorpus( - ... a="""\ - ... This/det is/verb the/det first/adj sentence/noun ./punc - ... Here/det is/verb another/adj sentence/noun ./punc - ... Note/verb that/comp you/pron can/verb use/verb \ - ... any/noun tag/noun set/noun - ... - ... This/det is/verb the/det second/adj paragraph/noun ./punc - ... word/n without/adj a/det tag/noun :/: hello ./punc - ... """, - ... b="""\ - ... This/det is/verb the/det second/adj file/noun ./punc - ... """) - - >>> from nltk.corpus.reader.tagged import TaggedCorpusReader - >>> corpus = TaggedCorpusReader(root, list('ab')) - >>> corpus.fileids() - ['a', 'b'] - >>> str(corpus.root) == str(root) - True - >>> corpus.words() - ['This', 'is', 'the', 'first', 'sentence', '.', ...] 
- >>> corpus.sents() - [['This', 'is', 'the', 'first', ...], ['Here', 'is', 'another'...], ...] - >>> corpus.paras() - [[['This', ...], ['Here', ...], ...], [['This', ...], ...], ...] - >>> corpus.tagged_words() - [('This', 'DET'), ('is', 'VERB'), ('the', 'DET'), ...] - >>> corpus.tagged_sents() - [[('This', 'DET'), ('is', 'VERB'), ...], [('Here', 'DET'), ...], ...] - >>> corpus.tagged_paras() - [[[('This', 'DET'), ...], ...], [[('This', 'DET'), ...], ...], ...] - >>> corpus.raw()[:40] - 'This/det is/verb the/det first/adj sente' - >>> len(corpus.words()), [len(corpus.words(d)) for d in corpus.fileids()] - (38, [32, 6]) - >>> len(corpus.sents()), [len(corpus.sents(d)) for d in corpus.fileids()] - (6, [5, 1]) - >>> len(corpus.paras()), [len(corpus.paras(d)) for d in corpus.fileids()] - (3, [2, 1]) - >>> print(corpus.words('a')) - ['This', 'is', 'the', 'first', 'sentence', '.', ...] - >>> print(corpus.words('b')) - ['This', 'is', 'the', 'second', 'file', '.'] - >>> del_testcorpus(root) - -The Brown Corpus uses the tagged corpus reader: - - >>> from nltk.corpus import brown - >>> brown.fileids() - ['ca01', 'ca02', 'ca03', 'ca04', 'ca05', 'ca06', 'ca07', ...] - >>> brown.categories() - ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', - 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction'] - >>> print(repr(brown.root).replace('\\\\','/')) - FileSystemPathPointer('.../corpora/brown') - >>> brown.words() - ['The', 'Fulton', 'County', 'Grand', 'Jury', ...] - >>> brown.sents() - [['The', 'Fulton', 'County', 'Grand', ...], ...] - >>> brown.paras() - [[['The', 'Fulton', 'County', ...]], [['The', 'jury', ...]], ...] - >>> brown.tagged_words() - [('The', 'AT'), ('Fulton', 'NP-TL'), ...] - >>> brown.tagged_sents() - [[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ...], ...] - >>> brown.tagged_paras() - [[[('The', 'AT'), ...]], [[('The', 'AT'), ...]], ...] - -Verbnet Corpus Reader -===================== - -Make sure we're picking up the right number of elements: - - >>> from nltk.corpus import verbnet - >>> len(verbnet.lemmas()) - 3621 - >>> len(verbnet.wordnetids()) - 4953 - >>> len(verbnet.classids()) - 429 - -Selecting classids based on various selectors: - - >>> verbnet.classids(lemma='take') - ['bring-11.3', 'characterize-29.2', 'convert-26.6.2', 'cost-54.2', - 'fit-54.3', 'performance-26.7-2', 'steal-10.5'] - >>> verbnet.classids(wordnetid='lead%2:38:01') - ['accompany-51.7'] - >>> verbnet.classids(fileid='approve-77.xml') - ['approve-77'] - >>> verbnet.classids(classid='admire-31.2') # subclasses - ['admire-31.2-1'] - -vnclass() accepts filenames, long ids, and short ids: - - >>> a = ElementTree.tostring(verbnet.vnclass('admire-31.2.xml')) - >>> b = ElementTree.tostring(verbnet.vnclass('admire-31.2')) - >>> c = ElementTree.tostring(verbnet.vnclass('31.2')) - >>> a == b == c - True - -fileids() can be used to get files based on verbnet class ids: - - >>> verbnet.fileids('admire-31.2') - ['admire-31.2.xml'] - >>> verbnet.fileids(['admire-31.2', 'obtain-13.5.2']) - ['admire-31.2.xml', 'obtain-13.5.2.xml'] - >>> verbnet.fileids('badidentifier') - Traceback (most recent call last): - . . . 
- ValueError: vnclass identifier 'badidentifier' not found - -longid() and shortid() can be used to convert identifiers: - - >>> verbnet.longid('31.2') - 'admire-31.2' - >>> verbnet.longid('admire-31.2') - 'admire-31.2' - >>> verbnet.shortid('31.2') - '31.2' - >>> verbnet.shortid('admire-31.2') - '31.2' - >>> verbnet.longid('badidentifier') - Traceback (most recent call last): - . . . - ValueError: vnclass identifier 'badidentifier' not found - >>> verbnet.shortid('badidentifier') - Traceback (most recent call last): - . . . - ValueError: vnclass identifier 'badidentifier' not found - -Corpus View Regression Tests -============================ - -Select some corpus files to play with: - - >>> import nltk.data - >>> # A very short file (160 chars): - >>> f1 = nltk.data.find('corpora/inaugural/README') - >>> # A relatively short file (791 chars): - >>> f2 = nltk.data.find('corpora/inaugural/1793-Washington.txt') - >>> # A longer file (32k chars): - >>> f3 = nltk.data.find('corpora/inaugural/1909-Taft.txt') - >>> fileids = [f1, f2, f3] - - -Concatenation -------------- -Check that concatenation works as intended. - - >>> from nltk.corpus.reader.util import * - - >>> c1 = StreamBackedCorpusView(f1, read_whitespace_block, encoding='utf-8') - >>> c2 = StreamBackedCorpusView(f2, read_whitespace_block, encoding='utf-8') - >>> c3 = StreamBackedCorpusView(f3, read_whitespace_block, encoding='utf-8') - >>> c123 = c1+c2+c3 - >>> print(c123) - ['C-Span', 'Inaugural', 'Address', 'Corpus', 'US', ...] - - >>> l1 = f1.open(encoding='utf-8').read().split() - >>> l2 = f2.open(encoding='utf-8').read().split() - >>> l3 = f3.open(encoding='utf-8').read().split() - >>> l123 = l1+l2+l3 - - >>> list(c123) == l123 - True - - >>> (c1+c2+c3)[100] == l123[100] - True - -Slicing -------- -First, do some tests with fairly small slices. These will all -generate tuple values. - - >>> from nltk.util import LazySubsequence - >>> c1 = StreamBackedCorpusView(f1, read_whitespace_block, encoding='utf-8') - >>> l1 = f1.open(encoding='utf-8').read().split() - >>> print(len(c1)) - 21 - >>> len(c1) < LazySubsequence.MIN_SIZE - True - -Choose a list of indices, based on the length, that covers the -important corner cases: - - >>> indices = [-60, -30, -22, -21, -20, -1, - ... 0, 1, 10, 20, 21, 22, 30, 60] - -Test slicing with explicit start & stop value: - - >>> for s in indices: - ... for e in indices: - ... assert list(c1[s:e]) == l1[s:e] - -Test slicing with stop=None: - - >>> for s in indices: - ... assert list(c1[s:]) == l1[s:] - -Test slicing with start=None: - - >>> for e in indices: - ... assert list(c1[:e]) == l1[:e] - -Test slicing with start=stop=None: - - >>> list(c1[:]) == list(l1[:]) - True - -Next, we'll do some tests with much longer slices. These will -generate LazySubsequence objects. - - >>> c3 = StreamBackedCorpusView(f3, read_whitespace_block, encoding='utf-8') - >>> l3 = f3.open(encoding='utf-8').read().split() - >>> print(len(c3)) - 5430 - >>> len(c3) > LazySubsequence.MIN_SIZE*2 - True - -Choose a list of indices, based on the length, that covers the -important corner cases: - - >>> indices = [-12000, -6000, -5431, -5430, -5429, -3000, -200, -1, - ... 0, 1, 200, 3000, 5000, 5429, 5430, 5431, 6000, 12000] - -Test slicing with explicit start & stop value: - - >>> for s in indices: - ... for e in indices: - ... assert list(c3[s:e]) == l3[s:e] - -Test slicing with stop=None: - - >>> for s in indices: - ... assert list(c3[s:]) == l3[s:] - -Test slicing with start=None: - - >>> for e in indices: - ... 
assert list(c3[:e]) == l3[:e] - -Test slicing with start=stop=None: - - >>> list(c3[:]) == list(l3[:]) - True - -Multiple Iterators ------------------- -If multiple iterators are created for the same corpus view, their -iteration can be interleaved: - - >>> c3 = StreamBackedCorpusView(f3, read_whitespace_block) - >>> iterators = [c3.iterate_from(n) for n in [0,15,30,45]] - >>> for i in range(15): - ... for iterator in iterators: - ... print('%-15s' % next(iterator), end=' ') - ... print() - My a duties in - fellow heavy of a - citizens: weight the proper - Anyone of office sense - who responsibility. upon of - has If which the - taken not, he obligation - the he is which - oath has about the - I no to oath - have conception enter, imposes. - just of or The - taken the he office - must powers is of - feel and lacking an - -SeekableUnicodeStreamReader -=========================== - -The file-like objects provided by the ``codecs`` module unfortunately -suffer from a bug that prevents them from working correctly with -corpus view objects. In particular, although the expose ``seek()`` -and ``tell()`` methods, those methods do not exhibit the expected -behavior, because they are not synchronized with the internal buffers -that are kept by the file-like objects. For example, the ``tell()`` -method will return the file position at the end of the buffers (whose -contents have not yet been returned by the stream); and therefore this -file position can not be used to return to the 'current' location in -the stream (since ``seek()`` has no way to reconstruct the buffers). - -To get around these problems, we define a new class, -`SeekableUnicodeStreamReader`, to act as a file-like interface to -files containing encoded unicode data. This class is loosely based on -the ``codecs.StreamReader`` class. To construct a new reader, we call -the constructor with an underlying stream and an encoding name: - - >>> from io import StringIO, BytesIO - >>> from nltk.data import SeekableUnicodeStreamReader - >>> stream = BytesIO(b"""\ - ... This is a test file. - ... It is encoded in ascii. - ... """.decode('ascii').encode('ascii')) - >>> reader = SeekableUnicodeStreamReader(stream, 'ascii') - -`SeekableUnicodeStreamReader`\ s support all of the normal operations -supplied by a read-only stream. Note that all of the read operations -return ``unicode`` objects (not ``str`` objects). - - >>> reader.read() # read the entire file. - 'This is a test file.\nIt is encoded in ascii.\n' - >>> reader.seek(0) # rewind to the start. - >>> reader.read(5) # read at most 5 bytes. - 'This ' - >>> reader.readline() # read to the end of the line. - 'is a test file.\n' - >>> reader.seek(0) # rewind to the start. - >>> for line in reader: - ... print(repr(line)) # iterate over lines - 'This is a test file.\n' - 'It is encoded in ascii.\n' - >>> reader.seek(0) # rewind to the start. - >>> reader.readlines() # read a list of line strings - ['This is a test file.\n', 'It is encoded in ascii.\n'] - >>> reader.close() - -Size argument to ``read()`` ---------------------------- -The ``size`` argument to ``read()`` specifies the maximum number of -*bytes* to read, not the maximum number of *characters*. Thus, for -encodings that use multiple bytes per character, it may return fewer -characters than the ``size`` argument: - - >>> stream = BytesIO(b"""\ - ... This is a test file. - ... It is encoded in utf-16. - ... 
""".decode('ascii').encode('utf-16')) - >>> reader = SeekableUnicodeStreamReader(stream, 'utf-16') - >>> reader.read(10) - 'This ' - -If a read block ends in the middle of the byte string encoding a -single character, then that byte string is stored in an internal -buffer, and re-used on the next call to ``read()``. However, if the -size argument is too small to read even a single character, even -though at least one character is available, then the ``read()`` method -will read additional bytes until it can return a single character. -This ensures that the ``read()`` method does not return an empty -string, which could be mistaken for indicating the end of the file. - - >>> reader.seek(0) # rewind to the start. - >>> reader.read(1) # we actually need to read 4 bytes - 'T' - >>> int(reader.tell()) - 4 - -The ``readline()`` method may read more than a single line of text, in -which case it stores the text that it does not return in a buffer. If -this buffer is not empty, then its contents will be included in the -value returned by the next call to ``read()``, regardless of the -``size`` argument, since they are available without reading any new -bytes from the stream: - - >>> reader.seek(0) # rewind to the start. - >>> reader.readline() # stores extra text in a buffer - 'This is a test file.\n' - >>> print(reader.linebuffer) # examine the buffer contents - ['It is encoded i'] - >>> reader.read(0) # returns the contents of the buffer - 'It is encoded i' - >>> print(reader.linebuffer) # examine the buffer contents - None - -Seek and Tell -------------- -In addition to these basic read operations, -`SeekableUnicodeStreamReader` also supports the ``seek()`` and -``tell()`` operations. However, some care must still be taken when -using these operations. In particular, the only file offsets that -should be passed to ``seek()`` are ``0`` and any offset that has been -returned by ``tell``. - - >>> stream = BytesIO(b"""\ - ... This is a test file. - ... It is encoded in utf-16. - ... """.decode('ascii').encode('utf-16')) - >>> reader = SeekableUnicodeStreamReader(stream, 'utf-16') - >>> reader.read(20) - 'This is a ' - >>> pos = reader.tell(); print(pos) - 22 - >>> reader.read(20) - 'test file.' - >>> reader.seek(pos) # rewind to the position from tell. - >>> reader.read(20) - 'test file.' - -The ``seek()`` and ``tell()`` methods work property even when -``readline()`` is used. - - >>> stream = BytesIO(b"""\ - ... This is a test file. - ... It is encoded in utf-16. - ... """.decode('ascii').encode('utf-16')) - >>> reader = SeekableUnicodeStreamReader(stream, 'utf-16') - >>> reader.readline() - 'This is a test file.\n' - >>> pos = reader.tell(); print(pos) - 44 - >>> reader.readline() - 'It is encoded in utf-16.\n' - >>> reader.seek(pos) # rewind to the position from tell. - >>> reader.readline() - 'It is encoded in utf-16.\n' - - -Squashed Bugs -============= - -svn 5276 fixed a bug in the comment-stripping behavior of -parse_sexpr_block. - - >>> from io import StringIO - >>> from nltk.corpus.reader.util import read_sexpr_block - >>> f = StringIO(b""" - ... (a b c) - ... # This line is a comment. - ... (d e f\ng h)""".decode('ascii')) - >>> print(read_sexpr_block(f, block_size=38, comment_char='#')) - ['(a b c)'] - >>> print(read_sexpr_block(f, block_size=38, comment_char='#')) - ['(d e f\ng h)'] - -svn 5277 fixed a bug in parse_sexpr_block, which would cause it to -enter an infinite loop if a file ended mid-sexpr, or ended with a -token that was not followed by whitespace. 
A related bug caused -an infinite loop if the corpus ended in an unmatched close paren -- -this was fixed in svn 5279 - - >>> f = StringIO(b""" - ... This file ends mid-sexpr - ... (hello (world""".decode('ascii')) - >>> for i in range(3): print(read_sexpr_block(f)) - ['This', 'file', 'ends', 'mid-sexpr'] - ['(hello (world'] - [] - - >>> f = StringIO(b"This file has no trailing whitespace.".decode('ascii')) - >>> for i in range(3): print(read_sexpr_block(f)) - ['This', 'file', 'has', 'no', 'trailing'] - ['whitespace.'] - [] - - >>> # Bug fixed in 5279: - >>> f = StringIO(b"a b c)".decode('ascii')) - >>> for i in range(3): print(read_sexpr_block(f)) - ['a', 'b'] - ['c)'] - [] - - -svn 5624 & 5265 fixed a bug in ConcatenatedCorpusView, which caused it -to return the wrong items when indexed starting at any index beyond -the first file. - - >>> import nltk - >>> sents = nltk.corpus.brown.sents() - >>> print(sents[6000]) - ['Cholesterol', 'and', 'thyroid'] - >>> print(sents[6000]) - ['Cholesterol', 'and', 'thyroid'] - -svn 5728 fixed a bug in Categorized*CorpusReader, which caused them -to return words from *all* files when just one file was specified. - - >>> from nltk.corpus import reuters - >>> reuters.words('training/13085') - ['SNYDER', '&', 'lt', ';', 'SOI', '>', 'MAKES', ...] - >>> reuters.words('training/5082') - ['SHEPPARD', 'RESOURCES', 'TO', 'MERGE', 'WITH', ...] - -svn 7227 fixed a bug in the qc corpus reader, which prevented -access to its tuples() method - - >>> from nltk.corpus import qc - >>> qc.tuples('test.txt') - [('NUM:dist', 'How far is it from Denver to Aspen ?'), ('LOC:city', 'What county is Modesto , California in ?'), ...] - -Ensure that KEYWORD from `comparative_sents.py` no longer contains a ReDoS vulnerability. - - >>> import re - >>> import time - >>> from nltk.corpus.reader.comparative_sents import KEYWORD - >>> sizes = { - ... "short": 4000, - ... "long": 40000 - ... } - >>> exec_times = { - ... "short": [], - ... "long": [], - ... } - >>> for size_name, size in sizes.items(): - ... for j in range(9): - ... start_t = time.perf_counter() - ... payload = "( " + "(" * size - ... output = KEYWORD.findall(payload) - ... exec_times[size_name].append(time.perf_counter() - start_t) - ... exec_times[size_name] = sorted(exec_times[size_name])[4] # Get the median - -Ideally, the execution time of such a regular expression is linear -in the length of the input. As such, we would expect exec_times["long"] -to be roughly 10 times as big as exec_times["short"]. -With the ReDoS in place, it took roughly 80 times as long. -For now, we accept values below 30 (times as long), due to the potential -for variance. This ensures that the ReDoS has certainly been reduced, -if not removed. - - >>> exec_times["long"] / exec_times["short"] < 30 # doctest: +SKIP - True diff --git a/pipeline/nltk/test/crubadan.doctest b/pipeline/nltk/test/crubadan.doctest deleted file mode 100644 index 8c10781333a47ecc4e4e8ed279440dd7fe589639..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/crubadan.doctest +++ /dev/null @@ -1,65 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -Crubadan Corpus Reader -====================== - -Crubadan is an NLTK corpus reader for ngram files provided -by the Crubadan project. It supports several languages. 
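As a quick preview of typical usage (a minimal, non-doctested sketch, assuming the Crubadan ngram data has been installed into ``nltk_data``), the reader exposes the list of supported languages and a per-language ``FreqDist`` of ngrams:

    from nltk.corpus import crubadan

    # List the supported ISO 639-3 codes and inspect one language's ngram counts.
    # FreqDist.most_common() is standard NLTK; 'eng' is just an example code.
    print(len(crubadan.langs()))              # number of supported languages
    english_fd = crubadan.lang_freq('eng')    # FreqDist of ngrams for English
    print(english_fd.most_common(3))          # its most frequent ngrams

The sections below walk through the same interface step by step.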
- - >>> from nltk.corpus import crubadan - >>> crubadan.langs() - ['abk', 'abn',..., 'zpa', 'zul'] - ----------------------------------------- -Language code mapping and helper methods ----------------------------------------- - -The web crawler that generates the 3-gram frequencies works at the -level of "writing systems" rather than languages. Writing systems -are assigned internal 2-3 letter codes that require mapping to the -standard ISO 639-3 codes. For more information, please refer to -the README in nltk_data/crubadan folder after installing it. - -To translate ISO 639-3 codes to "Crubadan Code": - - >>> crubadan.iso_to_crubadan('eng') - 'en' - >>> crubadan.iso_to_crubadan('fra') - 'fr' - >>> crubadan.iso_to_crubadan('aaa') - -In reverse, print ISO 639-3 code if we have the Crubadan Code: - - >>> crubadan.crubadan_to_iso('en') - 'eng' - >>> crubadan.crubadan_to_iso('fr') - 'fra' - >>> crubadan.crubadan_to_iso('aa') - ---------------------------- -Accessing ngram frequencies ---------------------------- - -On initialization the reader will create a dictionary of every -language supported by the Crubadan project, mapping the ISO 639-3 -language code to its corresponding ngram frequency. - -You can access individual language FreqDist and the ngrams within them as follows: - - >>> english_fd = crubadan.lang_freq('eng') - >>> english_fd['the'] - 728135 - -Above accesses the FreqDist of English and returns the frequency of the ngram 'the'. -A ngram that isn't found within the language will return 0: - - >>> english_fd['sometest'] - 0 - -A language that isn't supported will raise an exception: - - >>> crubadan.lang_freq('elvish') - Traceback (most recent call last): - ... - RuntimeError: Unsupported language. diff --git a/pipeline/nltk/test/data.doctest b/pipeline/nltk/test/data.doctest deleted file mode 100644 index 0f54657d00c1e719518ca4f8034c1a91d483835c..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/data.doctest +++ /dev/null @@ -1,387 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -========================================= - Loading Resources From the Data Package -========================================= - - >>> import nltk.data - -Overview -~~~~~~~~ -The `nltk.data` module contains functions that can be used to load -NLTK resource files, such as corpora, grammars, and saved processing -objects. - -Loading Data Files -~~~~~~~~~~~~~~~~~~ -Resources are loaded using the function `nltk.data.load()`, which -takes as its first argument a URL specifying what file should be -loaded. The ``nltk:`` protocol loads files from the NLTK data -distribution: - - >>> tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle') - >>> tokenizer.tokenize('Hello. This is a test. It works!') - ['Hello.', 'This is a test.', 'It works!'] - -It is important to note that there should be no space following the -colon (':') in the URL; 'nltk: tokenizers/punkt/english.pickle' will -not work! - -The ``nltk:`` protocol is used by default if no protocol is specified: - - >>> nltk.data.load('tokenizers/punkt/english.pickle') - - -But it is also possible to load resources from ``http:``, ``ftp:``, -and ``file:`` URLs: - - >>> # Load a grammar from the NLTK webpage. - >>> cfg = nltk.data.load('https://raw.githubusercontent.com/nltk/nltk/develop/nltk/test/toy.cfg') - >>> print(cfg) # doctest: +ELLIPSIS - Grammar with 14 productions (start state = S) - S -> NP VP - PP -> P NP - ... 
- P -> 'on' - P -> 'in' - - >>> # Load a grammar using an absolute path. - >>> url = 'file:%s' % nltk.data.find('grammars/sample_grammars/toy.cfg') - >>> url.replace('\\', '/') - 'file:...toy.cfg' - >>> print(nltk.data.load(url)) - Grammar with 14 productions (start state = S) - S -> NP VP - PP -> P NP - ... - P -> 'on' - P -> 'in' - -The second argument to the `nltk.data.load()` function specifies the -file format, which determines how the file's contents are processed -before they are returned by ``load()``. The formats that are -currently supported by the data module are described by the dictionary -`nltk.data.FORMATS`: - - >>> for format, descr in sorted(nltk.data.FORMATS.items()): - ... print('{0:<7} {1:}'.format(format, descr)) - cfg A context free grammar. - fcfg A feature CFG. - fol A list of first order logic expressions, parsed with - nltk.sem.logic.Expression.fromstring. - json A serialized python object, stored using the json module. - logic A list of first order logic expressions, parsed with - nltk.sem.logic.LogicParser. Requires an additional logic_parser - parameter - pcfg A probabilistic CFG. - pickle A serialized python object, stored using the pickle - module. - raw The raw (byte string) contents of a file. - text The raw (unicode string) contents of a file. - val A semantic valuation, parsed by - nltk.sem.Valuation.fromstring. - yaml A serialized python object, stored using the yaml module. - -`nltk.data.load()` will raise a ValueError if a bad format name is -specified: - - >>> nltk.data.load('grammars/sample_grammars/toy.cfg', 'bar') - Traceback (most recent call last): - . . . - ValueError: Unknown format type! - -By default, the ``"auto"`` format is used, which chooses a format -based on the filename's extension. The mapping from file extensions -to format names is specified by `nltk.data.AUTO_FORMATS`: - - >>> for ext, format in sorted(nltk.data.AUTO_FORMATS.items()): - ... print('.%-7s -> %s' % (ext, format)) - .cfg -> cfg - .fcfg -> fcfg - .fol -> fol - .json -> json - .logic -> logic - .pcfg -> pcfg - .pickle -> pickle - .text -> text - .txt -> text - .val -> val - .yaml -> yaml - -If `nltk.data.load()` is unable to determine the format based on the -filename's extension, it will raise a ValueError: - - >>> nltk.data.load('foo.bar') - Traceback (most recent call last): - . . . - ValueError: Could not determine format for foo.bar based on its file - extension; use the "format" argument to specify the format explicitly. - -Note that by explicitly specifying the ``format`` argument, you can -override the load method's default processing behavior. For example, -to get the raw contents of any file, simply use ``format="raw"``: - - >>> s = nltk.data.load('grammars/sample_grammars/toy.cfg', 'text') - >>> print(s) - S -> NP VP - PP -> P NP - NP -> Det N | NP PP - VP -> V NP | VP PP - ... - -Making Local Copies -~~~~~~~~~~~~~~~~~~~ -.. This will not be visible in the html output: create a tempdir to - play in. - >>> import tempfile, os - >>> tempdir = tempfile.mkdtemp() - >>> old_dir = os.path.abspath('.') - >>> os.chdir(tempdir) - -The function `nltk.data.retrieve()` copies a given resource to a local -file. This can be useful, for example, if you want to edit one of the -sample grammars. - - >>> nltk.data.retrieve('grammars/sample_grammars/toy.cfg') - Retrieving 'nltk:grammars/sample_grammars/toy.cfg', saving to 'toy.cfg' - - >>> # Simulate editing the grammar. - >>> with open('toy.cfg') as inp: - ... 
s = inp.read().replace('NP', 'DP') - >>> with open('toy.cfg', 'w') as out: - ... _bytes_written = out.write(s) - - >>> # Load the edited grammar, & display it. - >>> cfg = nltk.data.load('file:///' + os.path.abspath('toy.cfg')) - >>> print(cfg) - Grammar with 14 productions (start state = S) - S -> DP VP - PP -> P DP - ... - P -> 'on' - P -> 'in' - -The second argument to `nltk.data.retrieve()` specifies the filename -for the new copy of the file. By default, the source file's filename -is used. - - >>> nltk.data.retrieve('grammars/sample_grammars/toy.cfg', 'mytoy.cfg') - Retrieving 'nltk:grammars/sample_grammars/toy.cfg', saving to 'mytoy.cfg' - >>> os.path.isfile('./mytoy.cfg') - True - >>> nltk.data.retrieve('grammars/sample_grammars/np.fcfg') - Retrieving 'nltk:grammars/sample_grammars/np.fcfg', saving to 'np.fcfg' - >>> os.path.isfile('./np.fcfg') - True - -If a file with the specified (or default) filename already exists in -the current directory, then `nltk.data.retrieve()` will raise a -ValueError exception. It will *not* overwrite the file: - - >>> os.path.isfile('./toy.cfg') - True - >>> nltk.data.retrieve('grammars/sample_grammars/toy.cfg') - Traceback (most recent call last): - . . . - ValueError: File '...toy.cfg' already exists! - -.. This will not be visible in the html output: clean up the tempdir. - >>> os.chdir(old_dir) - >>> for f in os.listdir(tempdir): - ... os.remove(os.path.join(tempdir, f)) - >>> os.rmdir(tempdir) - -Finding Files in the NLTK Data Package -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The `nltk.data.find()` function searches the NLTK data package for a -given file, and returns a pointer to that file. This pointer can -either be a `FileSystemPathPointer` (whose `path` attribute gives the -absolute path of the file); or a `ZipFilePathPointer`, specifying a -zipfile and the name of an entry within that zipfile. Both pointer -types define the `open()` method, which can be used to read the string -contents of the file. - - >>> path = nltk.data.find('corpora/abc/rural.txt') - >>> str(path) - '...rural.txt' - >>> print(path.open().read(60).decode()) - PM denies knowledge of AWB kickbacks - The Prime Minister has - -Alternatively, the `nltk.data.load()` function can be used with the -keyword argument ``format="raw"``: - - >>> s = nltk.data.load('corpora/abc/rural.txt', format='raw')[:60] - >>> print(s.decode()) - PM denies knowledge of AWB kickbacks - The Prime Minister has - -Alternatively, you can use the keyword argument ``format="text"``: - - >>> s = nltk.data.load('corpora/abc/rural.txt', format='text')[:60] - >>> print(s) - PM denies knowledge of AWB kickbacks - The Prime Minister has - -Resource Caching -~~~~~~~~~~~~~~~~ - -NLTK uses a weakref dictionary to maintain a cache of resources that -have been loaded. If you load a resource that is already stored in -the cache, then the cached copy will be returned. This behavior can -be seen by the trace output generated when verbose=True: - - >>> feat0 = nltk.data.load('grammars/book_grammars/feat0.fcfg', verbose=True) - <> - >>> feat0 = nltk.data.load('grammars/book_grammars/feat0.fcfg', verbose=True) - <> - -If you wish to load a resource from its source, bypassing the cache, -use the ``cache=False`` argument to `nltk.data.load()`. This can be -useful, for example, if the resource is loaded from a local file, and -you are actively editing that file: - - >>> feat0 = nltk.data.load('grammars/book_grammars/feat0.fcfg',cache=False,verbose=True) - <> - -The cache *no longer* uses weak references. 
A resource will not be -automatically expunged from the cache when no more objects are using -it. In the following example, when we clear the variable ``feat0``, -the reference count for the feature grammar object drops to zero. -However, the object remains cached: - - >>> del feat0 - >>> feat0 = nltk.data.load('grammars/book_grammars/feat0.fcfg', - ... verbose=True) - <> - -You can clear the entire contents of the cache, using -`nltk.data.clear_cache()`: - - >>> nltk.data.clear_cache() - -Retrieving other Data Sources -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - >>> formulas = nltk.data.load('grammars/book_grammars/background.fol') - >>> for f in formulas: print(str(f)) - all x.(boxerdog(x) -> dog(x)) - all x.(boxer(x) -> person(x)) - all x.-(dog(x) & person(x)) - all x.(married(x) <-> exists y.marry(x,y)) - all x.(bark(x) -> dog(x)) - all x y.(marry(x,y) -> (person(x) & person(y))) - -(Vincent = Mia) - -(Vincent = Fido) - -(Mia = Fido) - -Regression Tests -~~~~~~~~~~~~~~~~ -Create a temp dir for tests that write files: - - >>> import tempfile, os - >>> tempdir = tempfile.mkdtemp() - >>> old_dir = os.path.abspath('.') - >>> os.chdir(tempdir) - -The `retrieve()` function accepts all url types: - - >>> urls = ['https://raw.githubusercontent.com/nltk/nltk/develop/nltk/test/toy.cfg', - ... 'file:%s' % nltk.data.find('grammars/sample_grammars/toy.cfg'), - ... 'nltk:grammars/sample_grammars/toy.cfg', - ... 'grammars/sample_grammars/toy.cfg'] - >>> for i, url in enumerate(urls): - ... nltk.data.retrieve(url, 'toy-%d.cfg' % i) - Retrieving 'https://raw.githubusercontent.com/nltk/nltk/develop/nltk/test/toy.cfg', saving to 'toy-0.cfg' - Retrieving 'file:...toy.cfg', saving to 'toy-1.cfg' - Retrieving 'nltk:grammars/sample_grammars/toy.cfg', saving to 'toy-2.cfg' - Retrieving 'nltk:grammars/sample_grammars/toy.cfg', saving to 'toy-3.cfg' - -Clean up the temp dir: - - >>> os.chdir(old_dir) - >>> for f in os.listdir(tempdir): - ... os.remove(os.path.join(tempdir, f)) - >>> os.rmdir(tempdir) - -Lazy Loader ------------ -A lazy loader is a wrapper object that defers loading a resource until -it is accessed or used in any way. This is mainly intended for -internal use by NLTK's corpus readers. - - >>> # Create a lazy loader for toy.cfg. - >>> ll = nltk.data.LazyLoader('grammars/sample_grammars/toy.cfg') - - >>> # Show that it's not loaded yet: - >>> object.__repr__(ll) - '' - - >>> # printing it is enough to cause it to be loaded: - >>> print(ll) - - - >>> # Show that it's now been loaded: - >>> object.__repr__(ll) - '' - - - >>> # Test that accessing an attribute also loads it: - >>> ll = nltk.data.LazyLoader('grammars/sample_grammars/toy.cfg') - >>> ll.start() - S - >>> object.__repr__(ll) - '' - -Buffered Gzip Reading and Writing ---------------------------------- -Write performance to gzip-compressed is extremely poor when the files become large. -File creation can become a bottleneck in those cases. - -Read performance from large gzipped pickle files was improved in data.py by -buffering the reads. A similar fix can be applied to writes by buffering -the writes to a StringIO object first. - -This is mainly intended for internal use. The test simply tests that reading -and writing work as intended and does not test how much improvement buffering -provides. - - >>> from io import StringIO - >>> test = nltk.data.BufferedGzipFile('testbuf.gz', 'wb', size=2**10) - >>> ans = [] - >>> for i in range(10000): - ... ans.append(str(i).encode('ascii')) - ... 
test.write(str(i).encode('ascii')) - >>> test.close() - >>> test = nltk.data.BufferedGzipFile('testbuf.gz', 'rb') - >>> test.read() == b''.join(ans) - True - >>> test.close() - >>> import os - >>> os.unlink('testbuf.gz') - -JSON Encoding and Decoding --------------------------- -JSON serialization is used instead of pickle for some classes. - - >>> from nltk import jsontags - >>> from nltk.jsontags import JSONTaggedEncoder, JSONTaggedDecoder, register_tag - >>> @jsontags.register_tag - ... class JSONSerializable: - ... json_tag = 'JSONSerializable' - ... - ... def __init__(self, n): - ... self.n = n - ... - ... def encode_json_obj(self): - ... return self.n - ... - ... @classmethod - ... def decode_json_obj(cls, obj): - ... n = obj - ... return cls(n) - ... - >>> JSONTaggedEncoder().encode(JSONSerializable(1)) - '{"!JSONSerializable": 1}' - >>> JSONTaggedDecoder().decode('{"!JSONSerializable": 1}').n - 1 diff --git a/pipeline/nltk/test/dependency.doctest b/pipeline/nltk/test/dependency.doctest deleted file mode 100644 index f621fac48e3682e7d65ade4819dd53e56e6b9780..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/dependency.doctest +++ /dev/null @@ -1,241 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -=================== -Dependency Grammars -=================== - - >>> from nltk.grammar import DependencyGrammar - >>> from nltk.parse import ( - ... DependencyGraph, - ... ProjectiveDependencyParser, - ... NonprojectiveDependencyParser, - ... ) - -CoNLL Data ----------- - - >>> treebank_data = """Pierre NNP 2 NMOD - ... Vinken NNP 8 SUB - ... , , 2 P - ... 61 CD 5 NMOD - ... years NNS 6 AMOD - ... old JJ 2 NMOD - ... , , 2 P - ... will MD 0 ROOT - ... join VB 8 VC - ... the DT 11 NMOD - ... board NN 9 OBJ - ... as IN 9 VMOD - ... a DT 15 NMOD - ... nonexecutive JJ 15 NMOD - ... director NN 12 PMOD - ... Nov. NNP 9 VMOD - ... 29 CD 16 NMOD - ... . . 9 VMOD - ... """ - - >>> dg = DependencyGraph(treebank_data) - >>> dg.tree().pprint() - (will - (Vinken Pierre , (old (years 61)) ,) - (join (board the) (as (director a nonexecutive)) (Nov. 29) .)) - >>> for head, rel, dep in dg.triples(): - ... print( - ... '({h[0]}, {h[1]}), {r}, ({d[0]}, {d[1]})' - ... .format(h=head, r=rel, d=dep) - ... ) - (will, MD), SUB, (Vinken, NNP) - (Vinken, NNP), NMOD, (Pierre, NNP) - (Vinken, NNP), P, (,, ,) - (Vinken, NNP), NMOD, (old, JJ) - (old, JJ), AMOD, (years, NNS) - (years, NNS), NMOD, (61, CD) - (Vinken, NNP), P, (,, ,) - (will, MD), VC, (join, VB) - (join, VB), OBJ, (board, NN) - (board, NN), NMOD, (the, DT) - (join, VB), VMOD, (as, IN) - (as, IN), PMOD, (director, NN) - (director, NN), NMOD, (a, DT) - (director, NN), NMOD, (nonexecutive, JJ) - (join, VB), VMOD, (Nov., NNP) - (Nov., NNP), NMOD, (29, CD) - (join, VB), VMOD, (., .) - -Using a custom cell extractor. - - >>> def custom_extractor(cells): - ... _, tag, head, rel = cells - ... return 'spam', 'spam', tag, tag, '', head, rel - >>> dg = DependencyGraph(treebank_data, cell_extractor=custom_extractor) - >>> dg.tree().pprint() - (spam - (spam spam spam (spam (spam spam)) spam) - (spam (spam spam) (spam (spam spam spam)) (spam spam) spam)) - -Custom cell extractors can take in and return an index. - - >>> def custom_extractor(cells, index): - ... word, tag, head, rel = cells - ... return (index, '{}-{}'.format(word, index), word, - ... 
tag, tag, '', head, rel) - >>> dg = DependencyGraph(treebank_data, cell_extractor=custom_extractor) - >>> dg.tree().pprint() - (will-8 - (Vinken-2 Pierre-1 ,-3 (old-6 (years-5 61-4)) ,-7) - (join-9 - (board-11 the-10) - (as-12 (director-15 a-13 nonexecutive-14)) - (Nov.-16 29-17) - .-18)) - -Using the dependency-parsed version of the Penn Treebank corpus sample. - - >>> from nltk.corpus import dependency_treebank - >>> t = dependency_treebank.parsed_sents()[0] - >>> print(t.to_conll(3)) - Pierre NNP 2 - Vinken NNP 8 - , , 2 - 61 CD 5 - years NNS 6 - old JJ 2 - , , 2 - will MD 0 - join VB 8 - the DT 11 - board NN 9 - as IN 9 - a DT 15 - nonexecutive JJ 15 - director NN 12 - Nov. NNP 9 - 29 CD 16 - . . 8 - -Using the output of zpar (like Malt-TAB but with zero-based indexing) - - >>> zpar_data = """ - ... Pierre NNP 1 NMOD - ... Vinken NNP 7 SUB - ... , , 1 P - ... 61 CD 4 NMOD - ... years NNS 5 AMOD - ... old JJ 1 NMOD - ... , , 1 P - ... will MD -1 ROOT - ... join VB 7 VC - ... the DT 10 NMOD - ... board NN 8 OBJ - ... as IN 8 VMOD - ... a DT 14 NMOD - ... nonexecutive JJ 14 NMOD - ... director NN 11 PMOD - ... Nov. NNP 8 VMOD - ... 29 CD 15 NMOD - ... . . 7 P - ... """ - - >>> zdg = DependencyGraph(zpar_data, zero_based=True) - >>> print(zdg.tree()) - (will - (Vinken Pierre , (old (years 61)) ,) - (join (board the) (as (director a nonexecutive)) (Nov. 29)) - .) - - -Projective Dependency Parsing ------------------------------ - - >>> grammar = DependencyGrammar.fromstring(""" - ... 'fell' -> 'price' | 'stock' - ... 'price' -> 'of' 'the' - ... 'of' -> 'stock' - ... 'stock' -> 'the' - ... """) - >>> print(grammar) - Dependency grammar with 5 productions - 'fell' -> 'price' - 'fell' -> 'stock' - 'price' -> 'of' 'the' - 'of' -> 'stock' - 'stock' -> 'the' - - >>> dp = ProjectiveDependencyParser(grammar) - >>> for t in sorted(dp.parse(['the', 'price', 'of', 'the', 'stock', 'fell'])): - ... print(t) - (fell (price the (of (stock the)))) - (fell (price the of) (stock the)) - (fell (price the of the) stock) - -Non-Projective Dependency Parsing ---------------------------------- - - >>> grammar = DependencyGrammar.fromstring(""" - ... 'taught' -> 'play' | 'man' - ... 'man' -> 'the' - ... 'play' -> 'golf' | 'dog' | 'to' - ... 'dog' -> 'his' - ... """) - >>> print(grammar) - Dependency grammar with 7 productions - 'taught' -> 'play' - 'taught' -> 'man' - 'man' -> 'the' - 'play' -> 'golf' - 'play' -> 'dog' - 'play' -> 'to' - 'dog' -> 'his' - - >>> dp = NonprojectiveDependencyParser(grammar) - >>> g, = dp.parse(['the', 'man', 'taught', 'his', 'dog', 'to', 'play', 'golf']) - - >>> print(g.root['word']) - taught - - >>> for _, node in sorted(g.nodes.items()): - ... if node['word'] is not None: - ... print('{address} {word}: {d}'.format(d=node['deps'][''], **node)) - 1 the: [] - 2 man: [1] - 3 taught: [2, 7] - 4 his: [] - 5 dog: [4] - 6 to: [] - 7 play: [5, 6, 8] - 8 golf: [] - - >>> print(g.tree()) - (taught (man the) (play (dog his) to golf)) - -Integration with MALT parser -============================ - -In case the top relation is different from the default, we can set it. In case -of MALT parser, it's set to `'null'`. - ->>> dg_str = """1 I _ NN NN _ 2 nn _ _ -... 2 shot _ NN NN _ 0 null _ _ -... 3 an _ AT AT _ 2 dep _ _ -... 4 elephant _ NN NN _ 7 nn _ _ -... 5 in _ NN NN _ 7 nn _ _ -... 6 my _ NN NN _ 7 nn _ _ -... 7 pajamas _ NNS NNS _ 3 dobj _ _ -... 
""" ->>> dg = DependencyGraph(dg_str, top_relation_label='null') - ->>> len(dg.nodes) -8 - ->>> dg.root['word'], dg.root['address'] -('shot', 2) - ->>> print(dg.to_conll(10)) -1 I _ NN NN _ 2 nn _ _ -2 shot _ NN NN _ 0 null _ _ -3 an _ AT AT _ 2 dep _ _ -4 elephant _ NN NN _ 7 nn _ _ -5 in _ NN NN _ 7 nn _ _ -6 my _ NN NN _ 7 nn _ _ -7 pajamas _ NNS NNS _ 3 dobj _ _ diff --git a/pipeline/nltk/test/discourse.doctest b/pipeline/nltk/test/discourse.doctest deleted file mode 100644 index 1e37ca56440809055871b656d59fb0f7fd634f2c..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/discourse.doctest +++ /dev/null @@ -1,552 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -================== -Discourse Checking -================== - - >>> from nltk import * - >>> from nltk.sem import logic - >>> logic._counter._value = 0 - -Setup -===== - - >>> from nltk.test.childes_fixt import setup_module - >>> setup_module() - -Introduction -============ - -The NLTK discourse module makes it possible to test consistency and -redundancy of simple discourses, using theorem-proving and -model-building from `nltk.inference`. - -The ``DiscourseTester`` constructor takes a list of sentences as a -parameter. - - >>> dt = DiscourseTester(['a boxer walks', 'every boxer chases a girl']) - -The ``DiscourseTester`` parses each sentence into a list of logical -forms. Once we have created ``DiscourseTester`` object, we can -inspect various properties of the discourse. First off, we might want -to double-check what sentences are currently stored as the discourse. - - >>> dt.sentences() - s0: a boxer walks - s1: every boxer chases a girl - -As you will see, each sentence receives an identifier `s`\ :subscript:`i`. -We might also want to check what grammar the ``DiscourseTester`` is -using (by default, ``book_grammars/discourse.fcfg``): - - >>> dt.grammar() - % start S - # Grammar Rules - S[SEM = ] -> NP[NUM=?n,SEM=?subj] VP[NUM=?n,SEM=?vp] - NP[NUM=?n,SEM= ] -> Det[NUM=?n,SEM=?det] Nom[NUM=?n,SEM=?nom] - NP[LOC=?l,NUM=?n,SEM=?np] -> PropN[LOC=?l,NUM=?n,SEM=?np] - ... - -A different grammar can be invoked by using the optional ``gramfile`` -parameter when a ``DiscourseTester`` object is created. - -Readings and Threads -==================== - -Depending on -the grammar used, we may find some sentences have more than one -logical form. To check this, use the ``readings()`` method. Given a -sentence identifier of the form `s`\ :subscript:`i`, each reading of -that sentence is given an identifier `s`\ :sub:`i`-`r`\ :sub:`j`. - - - >>> dt.readings() - - s0 readings: - - s0-r0: exists z1.(boxer(z1) & walk(z1)) - s0-r1: exists z1.(boxerdog(z1) & walk(z1)) - - s1 readings: - - s1-r0: all z2.(boxer(z2) -> exists z3.(girl(z3) & chase(z2,z3))) - s1-r1: all z1.(boxerdog(z1) -> exists z2.(girl(z2) & chase(z1,z2))) - - -In this case, the only source of ambiguity lies in the word *boxer*, -which receives two translations: ``boxer`` and ``boxerdog``. The -intention is that one of these corresponds to the ``person`` sense and -one to the ``dog`` sense. In principle, we would also expect to see a -quantifier scope ambiguity in ``s1``. However, the simple grammar we -are using, namely `sem4.fcfg `_, doesn't support quantifier -scope ambiguity. 
- -We can also investigate the readings of a specific sentence: - - >>> dt.readings('a boxer walks') - The sentence 'a boxer walks' has these readings: - exists x.(boxer(x) & walk(x)) - exists x.(boxerdog(x) & walk(x)) - -Given that each sentence is two-ways ambiguous, we potentially have -four different discourse 'threads', taking all combinations of -readings. To see these, specify the ``threaded=True`` parameter on -the ``readings()`` method. Again, each thread is assigned an -identifier of the form `d`\ :sub:`i`. Following the identifier is a -list of the readings that constitute that thread. - - >>> dt.readings(threaded=True) - d0: ['s0-r0', 's1-r0'] - d1: ['s0-r0', 's1-r1'] - d2: ['s0-r1', 's1-r0'] - d3: ['s0-r1', 's1-r1'] - -Of course, this simple-minded approach doesn't scale: a discourse with, say, three -sentences, each of which has 3 readings, will generate 27 different -threads. It is an interesting exercise to consider how to manage -discourse ambiguity more efficiently. - -Checking Consistency -==================== - -Now, we can check whether some or all of the discourse threads are -consistent, using the ``models()`` method. With no parameter, this -method will try to find a model for every discourse thread in the -current discourse. However, we can also specify just one thread, say ``d1``. - - >>> dt.models('d1') - -------------------------------------------------------------------------------- - Model for Discourse Thread d1 - -------------------------------------------------------------------------------- - % number = 1 - % seconds = 0 - - % Interpretation of size 2 - - c1 = 0. - - f1(0) = 0. - f1(1) = 0. - - boxer(0). - - boxer(1). - - - boxerdog(0). - - boxerdog(1). - - - girl(0). - - girl(1). - - walk(0). - - walk(1). - - - chase(0,0). - - chase(0,1). - - chase(1,0). - - chase(1,1). - - Consistent discourse: d1 ['s0-r0', 's1-r1']: - s0-r0: exists z1.(boxer(z1) & walk(z1)) - s1-r1: all z1.(boxerdog(z1) -> exists z2.(girl(z2) & chase(z1,z2))) - - -There are various formats for rendering **Mace4** models --- here, -we have used the 'cooked' format (which is intended to be -human-readable). There are a number of points to note. - -#. The entities in the domain are all treated as non-negative - integers. In this case, there are only two entities, ``0`` and - ``1``. - -#. The ``-`` symbol indicates negation. So ``0`` is the only - ``boxerdog`` and the only thing that ``walk``\ s. Nothing is a - ``boxer``, or a ``girl`` or in the ``chase`` relation. Thus the - universal sentence is vacuously true. - -#. ``c1`` is an introduced constant that denotes ``0``. - -#. ``f1`` is a Skolem function, but it plays no significant role in - this model. - - -We might want to now add another sentence to the discourse, and there -is method ``add_sentence()`` for doing just this. - - >>> dt.add_sentence('John is a boxer') - >>> dt.sentences() - s0: a boxer walks - s1: every boxer chases a girl - s2: John is a boxer - -We can now test all the properties as before; here, we just show a -couple of them. 
- - >>> dt.readings() - - s0 readings: - - s0-r0: exists z1.(boxer(z1) & walk(z1)) - s0-r1: exists z1.(boxerdog(z1) & walk(z1)) - - s1 readings: - - s1-r0: all z1.(boxer(z1) -> exists z2.(girl(z2) & chase(z1,z2))) - s1-r1: all z1.(boxerdog(z1) -> exists z2.(girl(z2) & chase(z1,z2))) - - s2 readings: - - s2-r0: boxer(John) - s2-r1: boxerdog(John) - >>> dt.readings(threaded=True) - d0: ['s0-r0', 's1-r0', 's2-r0'] - d1: ['s0-r0', 's1-r0', 's2-r1'] - d2: ['s0-r0', 's1-r1', 's2-r0'] - d3: ['s0-r0', 's1-r1', 's2-r1'] - d4: ['s0-r1', 's1-r0', 's2-r0'] - d5: ['s0-r1', 's1-r0', 's2-r1'] - d6: ['s0-r1', 's1-r1', 's2-r0'] - d7: ['s0-r1', 's1-r1', 's2-r1'] - -If you are interested in a particular thread, the ``expand_threads()`` -method will remind you of what readings it consists of: - - >>> thread = dt.expand_threads('d1') - >>> for rid, reading in thread: - ... print(rid, str(reading.normalize())) - s0-r0 exists z1.(boxer(z1) & walk(z1)) - s1-r0 all z1.(boxer(z1) -> exists z2.(girl(z2) & chase(z1,z2))) - s2-r1 boxerdog(John) - -Suppose we have already defined a discourse, as follows: - - >>> dt = DiscourseTester(['A student dances', 'Every student is a person']) - -Now, when we add a new sentence, is it consistent with what we already -have? The `` consistchk=True`` parameter of ``add_sentence()`` allows -us to check: - - >>> dt.add_sentence('No person dances', consistchk=True) - Inconsistent discourse: d0 ['s0-r0', 's1-r0', 's2-r0']: - s0-r0: exists z1.(student(z1) & dance(z1)) - s1-r0: all z1.(student(z1) -> person(z1)) - s2-r0: -exists z1.(person(z1) & dance(z1)) - - >>> dt.readings() - - s0 readings: - - s0-r0: exists z1.(student(z1) & dance(z1)) - - s1 readings: - - s1-r0: all z1.(student(z1) -> person(z1)) - - s2 readings: - - s2-r0: -exists z1.(person(z1) & dance(z1)) - -So let's retract the inconsistent sentence: - - >>> dt.retract_sentence('No person dances', verbose=True) - Current sentences are - s0: A student dances - s1: Every student is a person - -We can now verify that result is consistent. - - >>> dt.models() - -------------------------------------------------------------------------------- - Model for Discourse Thread d0 - -------------------------------------------------------------------------------- - % number = 1 - % seconds = 0 - - % Interpretation of size 2 - - c1 = 0. - - dance(0). - - dance(1). - - person(0). - - person(1). - - student(0). - - student(1). - - Consistent discourse: d0 ['s0-r0', 's1-r0']: - s0-r0: exists z1.(student(z1) & dance(z1)) - s1-r0: all z1.(student(z1) -> person(z1)) - - -Checking Informativity -====================== - -Let's assume that we are still trying to extend the discourse *A -student dances.* *Every student is a person.* We add a new sentence, -but this time, we check whether it is informative with respect to what -has gone before. - - >>> dt.add_sentence('A person dances', informchk=True) - Sentence 'A person dances' under reading 'exists x.(person(x) & dance(x))': - Not informative relative to thread 'd0' - -In fact, we are just checking whether the new sentence is entailed by -the preceding discourse. - - >>> dt.models() - -------------------------------------------------------------------------------- - Model for Discourse Thread d0 - -------------------------------------------------------------------------------- - % number = 1 - % seconds = 0 - - % Interpretation of size 2 - - c1 = 0. - - c2 = 0. - - dance(0). - - dance(1). - - person(0). - - person(1). - - student(0). - - student(1). 
- - Consistent discourse: d0 ['s0-r0', 's1-r0', 's2-r0']: - s0-r0: exists z1.(student(z1) & dance(z1)) - s1-r0: all z1.(student(z1) -> person(z1)) - s2-r0: exists z1.(person(z1) & dance(z1)) - - - - -Adding Background Knowledge -=========================== - -Let's build a new discourse, and look at the readings of the component sentences: - - >>> dt = DiscourseTester(['Vincent is a boxer', 'Fido is a boxer', 'Vincent is married', 'Fido barks']) - >>> dt.readings() - - s0 readings: - - s0-r0: boxer(Vincent) - s0-r1: boxerdog(Vincent) - - s1 readings: - - s1-r0: boxer(Fido) - s1-r1: boxerdog(Fido) - - s2 readings: - - s2-r0: married(Vincent) - - s3 readings: - - s3-r0: bark(Fido) - -This gives us a lot of threads: - - >>> dt.readings(threaded=True) - d0: ['s0-r0', 's1-r0', 's2-r0', 's3-r0'] - d1: ['s0-r0', 's1-r1', 's2-r0', 's3-r0'] - d2: ['s0-r1', 's1-r0', 's2-r0', 's3-r0'] - d3: ['s0-r1', 's1-r1', 's2-r0', 's3-r0'] - - -We can eliminate some of the readings, and hence some of the threads, -by adding background information. - - >>> import nltk.data - >>> bg = nltk.data.load('grammars/book_grammars/background.fol') - >>> dt.add_background(bg) - >>> dt.background() - all x.(boxerdog(x) -> dog(x)) - all x.(boxer(x) -> person(x)) - all x.-(dog(x) & person(x)) - all x.(married(x) <-> exists y.marry(x,y)) - all x.(bark(x) -> dog(x)) - all x y.(marry(x,y) -> (person(x) & person(y))) - -(Vincent = Mia) - -(Vincent = Fido) - -(Mia = Fido) - -The background information allows us to reject three of the threads as -inconsistent. To see what remains, use the ``filter=True`` parameter -on ``readings()``. - - >>> dt.readings(filter=True) - d1: ['s0-r0', 's1-r1', 's2-r0', 's3-r0'] - -The ``models()`` method gives us more information about the surviving thread. - - >>> dt.models() - -------------------------------------------------------------------------------- - Model for Discourse Thread d0 - -------------------------------------------------------------------------------- - No model found! - - -------------------------------------------------------------------------------- - Model for Discourse Thread d1 - -------------------------------------------------------------------------------- - % number = 1 - % seconds = 0 - - % Interpretation of size 3 - - Fido = 0. - - Mia = 1. - - Vincent = 2. - - f1(0) = 0. - f1(1) = 0. - f1(2) = 2. - - bark(0). - - bark(1). - - bark(2). - - - boxer(0). - - boxer(1). - boxer(2). - - boxerdog(0). - - boxerdog(1). - - boxerdog(2). - - dog(0). - - dog(1). - - dog(2). - - - married(0). - - married(1). - married(2). - - - person(0). - - person(1). - person(2). - - - marry(0,0). - - marry(0,1). - - marry(0,2). - - marry(1,0). - - marry(1,1). - - marry(1,2). - - marry(2,0). - - marry(2,1). - marry(2,2). - - -------------------------------------------------------------------------------- - Model for Discourse Thread d2 - -------------------------------------------------------------------------------- - No model found! - - -------------------------------------------------------------------------------- - Model for Discourse Thread d3 - -------------------------------------------------------------------------------- - No model found! 
- - Inconsistent discourse: d0 ['s0-r0', 's1-r0', 's2-r0', 's3-r0']: - s0-r0: boxer(Vincent) - s1-r0: boxer(Fido) - s2-r0: married(Vincent) - s3-r0: bark(Fido) - - Consistent discourse: d1 ['s0-r0', 's1-r1', 's2-r0', 's3-r0']: - s0-r0: boxer(Vincent) - s1-r1: boxerdog(Fido) - s2-r0: married(Vincent) - s3-r0: bark(Fido) - - Inconsistent discourse: d2 ['s0-r1', 's1-r0', 's2-r0', 's3-r0']: - s0-r1: boxerdog(Vincent) - s1-r0: boxer(Fido) - s2-r0: married(Vincent) - s3-r0: bark(Fido) - - Inconsistent discourse: d3 ['s0-r1', 's1-r1', 's2-r0', 's3-r0']: - s0-r1: boxerdog(Vincent) - s1-r1: boxerdog(Fido) - s2-r0: married(Vincent) - s3-r0: bark(Fido) - - - -.. This will not be visible in the html output: create a tempdir to - play in. - >>> import tempfile, os - >>> tempdir = tempfile.mkdtemp() - >>> old_dir = os.path.abspath('.') - >>> os.chdir(tempdir) - -In order to play around with your own version of background knowledge, -you might want to start off with a local copy of ``background.fol``: - - >>> nltk.data.retrieve('grammars/book_grammars/background.fol') - Retrieving 'nltk:grammars/book_grammars/background.fol', saving to 'background.fol' - -After you have modified the file, the ``load_fol()`` function will parse -the strings in the file into expressions of ``nltk.sem.logic``. - - >>> from nltk.inference.discourse import load_fol - >>> mybg = load_fol(open('background.fol').read()) - -The result can be loaded as an argument of ``add_background()`` in the -manner shown earlier. - -.. This will not be visible in the html output: clean up the tempdir. - >>> os.chdir(old_dir) - >>> for f in os.listdir(tempdir): - ... os.remove(os.path.join(tempdir, f)) - >>> os.rmdir(tempdir) - >>> nltk.data.clear_cache() - - -Regression Testing from book -============================ - - >>> logic._counter._value = 0 - - >>> from nltk.tag import RegexpTagger - >>> tagger = RegexpTagger( - ... [('^(chases|runs)$', 'VB'), - ... ('^(a)$', 'ex_quant'), - ... ('^(every)$', 'univ_quant'), - ... ('^(dog|boy)$', 'NN'), - ... ('^(He)$', 'PRP') - ... ]) - >>> rc = DrtGlueReadingCommand(depparser=MaltParser(tagger=tagger)) - >>> dt = DiscourseTester(map(str.split, ['Every dog chases a boy', 'He runs']), rc) - >>> dt.readings() - - s0 readings: - - s0-r0: ([z2],[boy(z2), (([z5],[dog(z5)]) -> ([],[chases(z5,z2)]))]) - s0-r1: ([],[(([z1],[dog(z1)]) -> ([z2],[boy(z2), chases(z1,z2)]))]) - - s1 readings: - - s1-r0: ([z1],[PRO(z1), runs(z1)]) - >>> dt.readings(show_thread_readings=True) - d0: ['s0-r0', 's1-r0'] : ([z1,z2],[boy(z1), (([z3],[dog(z3)]) -> ([],[chases(z3,z1)])), (z2 = z1), runs(z2)]) - d1: ['s0-r1', 's1-r0'] : INVALID: AnaphoraResolutionException - >>> dt.readings(filter=True, show_thread_readings=True) - d0: ['s0-r0', 's1-r0'] : ([z1,z3],[boy(z1), (([z2],[dog(z2)]) -> ([],[chases(z2,z1)])), (z3 = z1), runs(z3)]) - - >>> logic._counter._value = 0 - - >>> from nltk.parse import FeatureEarleyChartParser - >>> from nltk.sem.drt import DrtParser - >>> grammar = nltk.data.load('grammars/book_grammars/drt.fcfg', logic_parser=DrtParser()) - >>> parser = FeatureEarleyChartParser(grammar, trace=0) - >>> trees = parser.parse('Angus owns a dog'.split()) - >>> print(list(trees)[0].label()['SEM'].simplify().normalize()) - ([z1,z2],[Angus(z1), dog(z2), own(z1,z2)]) diff --git a/pipeline/nltk/test/drt.doctest b/pipeline/nltk/test/drt.doctest deleted file mode 100644 index 03ba487bcf446483aad8548fc63ad6d06a0e7115..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/drt.doctest +++ /dev/null @@ -1,515 +0,0 @@ -.. 
Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -================================ - Discourse Representation Theory -================================ - - >>> from nltk.sem import logic - >>> from nltk.inference import TableauProver - -Overview -======== - -A DRS can be created with the ``DRS()`` constructor. This takes two arguments: a list of -discourse referents and list of conditions. . - - >>> from nltk.sem.drt import * - >>> dexpr = DrtExpression.fromstring - >>> man_x = dexpr('man(x)') - >>> walk_x = dexpr('walk(x)') - >>> x = dexpr('x') - >>> print(DRS([x], [man_x, walk_x])) - ([x],[man(x), walk(x)]) - -The ``parse()`` method can also be applied directly to DRS -expressions, which allows them to be specified more -easily. - - >>> drs1 = dexpr('([x],[man(x),walk(x)])') - >>> print(drs1) - ([x],[man(x), walk(x)]) - -DRSs can be *merged* using the ``+`` operator. - - >>> drs2 = dexpr('([y],[woman(y),stop(y)])') - >>> drs3 = drs1 + drs2 - >>> print(drs3) - (([x],[man(x), walk(x)]) + ([y],[woman(y), stop(y)])) - >>> print(drs3.simplify()) - ([x,y],[man(x), walk(x), woman(y), stop(y)]) - -We can embed DRSs as components of an ``implies`` condition. - - >>> s = '([], [(%s -> %s)])' % (drs1, drs2) - >>> print(dexpr(s)) - ([],[(([x],[man(x), walk(x)]) -> ([y],[woman(y), stop(y)]))]) - -The ``fol()`` method converts DRSs into FOL formulae. - - >>> print(dexpr(r'([x],[man(x), walks(x)])').fol()) - exists x.(man(x) & walks(x)) - >>> print(dexpr(r'([],[(([x],[man(x)]) -> ([],[walks(x)]))])').fol()) - all x.(man(x) -> walks(x)) - -In order to visualize a DRS, the ``pretty_format()`` method can be used. - - >>> print(drs3.pretty_format()) - _________ __________ - | x | | y | - (|---------| + |----------|) - | man(x) | | woman(y) | - | walk(x) | | stop(y) | - |_________| |__________| - - -Parse to semantics ------------------- - -.. - >>> logic._counter._value = 0 - -DRSs can be used for building compositional semantics in a feature -based grammar. To specify that we want to use DRSs, the appropriate -logic parser needs be passed as a parameter to ``load_earley()`` - - >>> from nltk.parse import load_parser - >>> from nltk.sem.drt import DrtParser - >>> parser = load_parser('grammars/book_grammars/drt.fcfg', trace=0, logic_parser=DrtParser()) - >>> for tree in parser.parse('a dog barks'.split()): - ... print(tree.label()['SEM'].simplify()) - ... - ([x],[dog(x), bark(x)]) - -Alternatively, a ``FeatStructReader`` can be passed with the ``logic_parser`` set on it - - >>> from nltk.featstruct import FeatStructReader - >>> from nltk.grammar import FeatStructNonterminal - >>> parser = load_parser('grammars/book_grammars/drt.fcfg', trace=0, fstruct_reader=FeatStructReader(fdict_class=FeatStructNonterminal, logic_parser=DrtParser())) - >>> for tree in parser.parse('every girl chases a dog'.split()): - ... print(tree.label()['SEM'].simplify().normalize()) - ... 
- ([],[(([z1],[girl(z1)]) -> ([z2],[dog(z2), chase(z1,z2)]))]) - - - -Unit Tests -========== - -Parser ------- - - >>> print(dexpr(r'([x,y],[sees(x,y)])')) - ([x,y],[sees(x,y)]) - >>> print(dexpr(r'([x],[man(x), walks(x)])')) - ([x],[man(x), walks(x)]) - >>> print(dexpr(r'\x.([],[man(x), walks(x)])')) - \x.([],[man(x), walks(x)]) - >>> print(dexpr(r'\x.\y.([],[sees(x,y)])')) - \x y.([],[sees(x,y)]) - - >>> print(dexpr(r'([x,y],[(x = y)])')) - ([x,y],[(x = y)]) - >>> print(dexpr(r'([x,y],[(x != y)])')) - ([x,y],[-(x = y)]) - - >>> print(dexpr(r'\x.([],[walks(x)])(john)')) - (\x.([],[walks(x)]))(john) - >>> print(dexpr(r'\R.\x.([],[big(x,R)])(\y.([],[mouse(y)]))')) - (\R x.([],[big(x,R)]))(\y.([],[mouse(y)])) - - >>> print(dexpr(r'(([x],[walks(x)]) + ([y],[runs(y)]))')) - (([x],[walks(x)]) + ([y],[runs(y)])) - >>> print(dexpr(r'(([x,y],[walks(x), jumps(y)]) + (([z],[twos(z)]) + ([w],[runs(w)])))')) - (([x,y],[walks(x), jumps(y)]) + ([z],[twos(z)]) + ([w],[runs(w)])) - >>> print(dexpr(r'((([],[walks(x)]) + ([],[twos(x)])) + ([],[runs(x)]))')) - (([],[walks(x)]) + ([],[twos(x)]) + ([],[runs(x)])) - >>> print(dexpr(r'((([],[walks(x)]) + ([],[runs(x)])) + (([],[threes(x)]) + ([],[fours(x)])))')) - (([],[walks(x)]) + ([],[runs(x)]) + ([],[threes(x)]) + ([],[fours(x)])) - - >>> print(dexpr(r'(([],[walks(x)]) -> ([],[runs(x)]))')) - (([],[walks(x)]) -> ([],[runs(x)])) - - >>> print(dexpr(r'([x],[PRO(x), sees(John,x)])')) - ([x],[PRO(x), sees(John,x)]) - >>> print(dexpr(r'([x],[man(x), -([],[walks(x)])])')) - ([x],[man(x), -([],[walks(x)])]) - >>> print(dexpr(r'([],[(([x],[man(x)]) -> ([],[walks(x)]))])')) - ([],[(([x],[man(x)]) -> ([],[walks(x)]))]) - - >>> print(dexpr(r'DRS([x],[walk(x)])')) - ([x],[walk(x)]) - >>> print(dexpr(r'DRS([x][walk(x)])')) - ([x],[walk(x)]) - >>> print(dexpr(r'([x][walk(x)])')) - ([x],[walk(x)]) - -``simplify()`` --------------- - - >>> print(dexpr(r'\x.([],[man(x), walks(x)])(john)').simplify()) - ([],[man(john), walks(john)]) - >>> print(dexpr(r'\x.\y.([z],[dog(z),sees(x,y)])(john)(mary)').simplify()) - ([z],[dog(z), sees(john,mary)]) - >>> print(dexpr(r'\R x.([],[big(x,R)])(\y.([],[mouse(y)]))').simplify()) - \x.([],[big(x,\y.([],[mouse(y)]))]) - - >>> print(dexpr(r'(([x],[walks(x)]) + ([y],[runs(y)]))').simplify()) - ([x,y],[walks(x), runs(y)]) - >>> print(dexpr(r'(([x,y],[walks(x), jumps(y)]) + (([z],[twos(z)]) + ([w],[runs(w)])))').simplify()) - ([w,x,y,z],[walks(x), jumps(y), twos(z), runs(w)]) - >>> print(dexpr(r'((([],[walks(x)]) + ([],[runs(x)]) + ([],[threes(x)]) + ([],[fours(x)])))').simplify()) - ([],[walks(x), runs(x), threes(x), fours(x)]) - >>> dexpr(r'([x],[man(x)])+([x],[walks(x)])').simplify() == \ - ... dexpr(r'([x,z1],[man(x), walks(z1)])') - True - >>> dexpr(r'([y],[boy(y), (([x],[dog(x)]) -> ([],[chase(x,y)]))])+([x],[run(x)])').simplify() == \ - ... dexpr(r'([y,z1],[boy(y), (([x],[dog(x)]) -> ([],[chase(x,y)])), run(z1)])') - True - - >>> dexpr(r'\Q.(([x],[john(x),walks(x)]) + Q)(([x],[PRO(x),leaves(x)]))').simplify() == \ - ... 
dexpr(r'([x,z1],[john(x), walks(x), PRO(z1), leaves(z1)])') - True - - >>> logic._counter._value = 0 - >>> print(dexpr('([],[(([x],[dog(x)]) -> ([e,y],[boy(y), chase(e), subj(e,x), obj(e,y)]))])+([e,x],[PRO(x), run(e), subj(e,x)])').simplify().normalize().normalize()) - ([e02,z5],[(([z3],[dog(z3)]) -> ([e01,z4],[boy(z4), chase(e01), subj(e01,z3), obj(e01,z4)])), PRO(z5), run(e02), subj(e02,z5)]) - -``fol()`` ------------ - - >>> print(dexpr(r'([x,y],[sees(x,y)])').fol()) - exists x y.sees(x,y) - >>> print(dexpr(r'([x],[man(x), walks(x)])').fol()) - exists x.(man(x) & walks(x)) - >>> print(dexpr(r'\x.([],[man(x), walks(x)])').fol()) - \x.(man(x) & walks(x)) - >>> print(dexpr(r'\x y.([],[sees(x,y)])').fol()) - \x y.sees(x,y) - - >>> print(dexpr(r'\x.([],[walks(x)])(john)').fol()) - \x.walks(x)(john) - >>> print(dexpr(r'\R x.([],[big(x,R)])(\y.([],[mouse(y)]))').fol()) - (\R x.big(x,R))(\y.mouse(y)) - - >>> print(dexpr(r'(([x],[walks(x)]) + ([y],[runs(y)]))').fol()) - (exists x.walks(x) & exists y.runs(y)) - - >>> print(dexpr(r'(([],[walks(x)]) -> ([],[runs(x)]))').fol()) - (walks(x) -> runs(x)) - - >>> print(dexpr(r'([x],[PRO(x), sees(John,x)])').fol()) - exists x.(PRO(x) & sees(John,x)) - >>> print(dexpr(r'([x],[man(x), -([],[walks(x)])])').fol()) - exists x.(man(x) & -walks(x)) - >>> print(dexpr(r'([],[(([x],[man(x)]) -> ([],[walks(x)]))])').fol()) - all x.(man(x) -> walks(x)) - - >>> print(dexpr(r'([x],[man(x) | walks(x)])').fol()) - exists x.(man(x) | walks(x)) - >>> print(dexpr(r'P(x) + ([x],[walks(x)])').fol()) - (P(x) & exists x.walks(x)) - -``resolve_anaphora()`` ----------------------- - - >>> from nltk.sem.drt import AnaphoraResolutionException - - >>> print(resolve_anaphora(dexpr(r'([x,y,z],[dog(x), cat(y), walks(z), PRO(z)])'))) - ([x,y,z],[dog(x), cat(y), walks(z), (z = [x,y])]) - >>> print(resolve_anaphora(dexpr(r'([],[(([x],[dog(x)]) -> ([y],[walks(y), PRO(y)]))])'))) - ([],[(([x],[dog(x)]) -> ([y],[walks(y), (y = x)]))]) - >>> print(resolve_anaphora(dexpr(r'(([x,y],[]) + ([],[PRO(x)]))')).simplify()) - ([x,y],[(x = y)]) - >>> try: print(resolve_anaphora(dexpr(r'([x],[walks(x), PRO(x)])'))) - ... except AnaphoraResolutionException as e: print(e) - Variable 'x' does not resolve to anything. - >>> print(resolve_anaphora(dexpr('([e01,z6,z7],[boy(z6), PRO(z7), run(e01), subj(e01,z7)])'))) - ([e01,z6,z7],[boy(z6), (z7 = z6), run(e01), subj(e01,z7)]) - -``equiv()``: ----------------- - - >>> a = dexpr(r'([x],[man(x), walks(x)])') - >>> b = dexpr(r'([x],[walks(x), man(x)])') - >>> print(a.equiv(b, TableauProver())) - True - - -``replace()``: --------------- - - >>> a = dexpr(r'a') - >>> w = dexpr(r'w') - >>> x = dexpr(r'x') - >>> y = dexpr(r'y') - >>> z = dexpr(r'z') - - -replace bound -------------- - - >>> print(dexpr(r'([x],[give(x,y,z)])').replace(x.variable, a, False)) - ([x],[give(x,y,z)]) - >>> print(dexpr(r'([x],[give(x,y,z)])').replace(x.variable, a, True)) - ([a],[give(a,y,z)]) - -replace unbound ---------------- - - >>> print(dexpr(r'([x],[give(x,y,z)])').replace(y.variable, a, False)) - ([x],[give(x,a,z)]) - >>> print(dexpr(r'([x],[give(x,y,z)])').replace(y.variable, a, True)) - ([x],[give(x,a,z)]) - -replace unbound with bound --------------------------- - - >>> dexpr(r'([x],[give(x,y,z)])').replace(y.variable, x, False) == \ - ... dexpr('([z1],[give(z1,x,z)])') - True - >>> dexpr(r'([x],[give(x,y,z)])').replace(y.variable, x, True) == \ - ... 
dexpr('([z1],[give(z1,x,z)])') - True - -replace unbound with unbound ----------------------------- - - >>> print(dexpr(r'([x],[give(x,y,z)])').replace(y.variable, z, False)) - ([x],[give(x,z,z)]) - >>> print(dexpr(r'([x],[give(x,y,z)])').replace(y.variable, z, True)) - ([x],[give(x,z,z)]) - - -replace unbound ---------------- - - >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(z.variable, a, False)) - (([x],[P(x,y,a)]) + ([y],[Q(x,y,a)])) - >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(z.variable, a, True)) - (([x],[P(x,y,a)]) + ([y],[Q(x,y,a)])) - -replace bound -------------- - - >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(x.variable, a, False)) - (([x],[P(x,y,z)]) + ([y],[Q(x,y,z)])) - >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(x.variable, a, True)) - (([a],[P(a,y,z)]) + ([y],[Q(a,y,z)])) - -replace unbound with unbound ----------------------------- - - >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(z.variable, a, False)) - (([x],[P(x,y,a)]) + ([y],[Q(x,y,a)])) - >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(z.variable, a, True)) - (([x],[P(x,y,a)]) + ([y],[Q(x,y,a)])) - -replace unbound with bound on same side ---------------------------------------- - - >>> dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,w)])').replace(z.variable, x, False) == \ - ... dexpr(r'(([z1],[P(z1,y,x)]) + ([y],[Q(z1,y,w)]))') - True - >>> dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,w)])').replace(z.variable, x, True) == \ - ... dexpr(r'(([z1],[P(z1,y,x)]) + ([y],[Q(z1,y,w)]))') - True - -replace unbound with bound on other side ----------------------------------------- - - >>> dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,w)])').replace(w.variable, x, False) == \ - ... dexpr(r'(([z1],[P(z1,y,z)]) + ([y],[Q(z1,y,x)]))') - True - >>> dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,w)])').replace(w.variable, x, True) == \ - ... dexpr(r'(([z1],[P(z1,y,z)]) + ([y],[Q(z1,y,x)]))') - True - -replace unbound with double bound ---------------------------------- - - >>> dexpr(r'([x],[P(x,y,z)])+([x],[Q(x,y,w)])').replace(z.variable, x, False) == \ - ... dexpr(r'(([z1],[P(z1,y,x)]) + ([z1],[Q(z1,y,w)]))') - True - >>> dexpr(r'([x],[P(x,y,z)])+([x],[Q(x,y,w)])').replace(z.variable, x, True) == \ - ... 
dexpr(r'(([z1],[P(z1,y,x)]) + ([z1],[Q(z1,y,w)]))') - True - - -regression tests ----------------- - - >>> d = dexpr('([x],[A(c), ([y],[B(x,y,z,a)])->([z],[C(x,y,z,a)])])') - >>> print(d) - ([x],[A(c), (([y],[B(x,y,z,a)]) -> ([z],[C(x,y,z,a)]))]) - >>> print(d.pretty_format()) - ____________________________________ - | x | - |------------------------------------| - | A(c) | - | ____________ ____________ | - | | y | | z | | - | (|------------| -> |------------|) | - | | B(x,y,z,a) | | C(x,y,z,a) | | - | |____________| |____________| | - |____________________________________| - >>> print(str(d)) - ([x],[A(c), (([y],[B(x,y,z,a)]) -> ([z],[C(x,y,z,a)]))]) - >>> print(d.fol()) - exists x.(A(c) & all y.(B(x,y,z,a) -> exists z.C(x,y,z,a))) - >>> print(d.replace(Variable('a'), DrtVariableExpression(Variable('r')))) - ([x],[A(c), (([y],[B(x,y,z,r)]) -> ([z],[C(x,y,z,r)]))]) - >>> print(d.replace(Variable('x'), DrtVariableExpression(Variable('r')))) - ([x],[A(c), (([y],[B(x,y,z,a)]) -> ([z],[C(x,y,z,a)]))]) - >>> print(d.replace(Variable('y'), DrtVariableExpression(Variable('r')))) - ([x],[A(c), (([y],[B(x,y,z,a)]) -> ([z],[C(x,y,z,a)]))]) - >>> print(d.replace(Variable('z'), DrtVariableExpression(Variable('r')))) - ([x],[A(c), (([y],[B(x,y,r,a)]) -> ([z],[C(x,y,z,a)]))]) - >>> print(d.replace(Variable('x'), DrtVariableExpression(Variable('r')), True)) - ([r],[A(c), (([y],[B(r,y,z,a)]) -> ([z],[C(r,y,z,a)]))]) - >>> print(d.replace(Variable('y'), DrtVariableExpression(Variable('r')), True)) - ([x],[A(c), (([r],[B(x,r,z,a)]) -> ([z],[C(x,r,z,a)]))]) - >>> print(d.replace(Variable('z'), DrtVariableExpression(Variable('r')), True)) - ([x],[A(c), (([y],[B(x,y,r,a)]) -> ([r],[C(x,y,r,a)]))]) - >>> print(d == dexpr('([l],[A(c), ([m],[B(l,m,z,a)])->([n],[C(l,m,n,a)])])')) - True - >>> d = dexpr('([],[([x,y],[B(x,y,h), ([a,b],[dee(x,a,g)])])->([z,w],[cee(x,y,f), ([c,d],[E(x,c,d,e)])])])') - >>> sorted(d.free()) - [Variable('B'), Variable('E'), Variable('e'), Variable('f'), Variable('g'), Variable('h')] - >>> sorted(d.variables()) - [Variable('B'), Variable('E'), Variable('e'), Variable('f'), Variable('g'), Variable('h')] - >>> sorted(d.get_refs(True)) - [Variable('a'), Variable('b'), Variable('c'), Variable('d'), Variable('w'), Variable('x'), Variable('y'), Variable('z')] - >>> sorted(d.conds[0].get_refs(False)) - [Variable('x'), Variable('y')] - >>> print(dexpr('([x,y],[A(x,y), (x=y), ([],[B(x,y)])->([],[C(x,y)]), ([x,y],[D(x,y)])->([],[E(x,y)]), ([],[F(x,y)])->([x,y],[G(x,y)])])').eliminate_equality()) - ([x],[A(x,x), (([],[B(x,x)]) -> ([],[C(x,x)])), (([x,y],[D(x,y)]) -> ([],[E(x,y)])), (([],[F(x,x)]) -> ([x,y],[G(x,y)]))]) - >>> print(dexpr('([x,y],[A(x,y), (x=y)]) -> ([],[B(x,y)])').eliminate_equality()) - (([x],[A(x,x)]) -> ([],[B(x,x)])) - >>> print(dexpr('([x,y],[A(x,y)]) -> ([],[B(x,y), (x=y)])').eliminate_equality()) - (([x,y],[A(x,y)]) -> ([],[B(x,x)])) - >>> print(dexpr('([x,y],[A(x,y), (x=y), ([],[B(x,y)])])').eliminate_equality()) - ([x],[A(x,x), ([],[B(x,x)])]) - >>> print(dexpr('([x,y],[A(x,y), ([],[B(x,y), (x=y)])])').eliminate_equality()) - ([x,y],[A(x,y), ([],[B(x,x)])]) - >>> print(dexpr('([z8 z9 z10],[A(z8), z8=z10, z9=z10, B(z9), C(z10), D(z10)])').eliminate_equality()) - ([z9],[A(z9), B(z9), C(z9), D(z9)]) - - >>> print(dexpr('([x,y],[A(x,y), (x=y), ([],[B(x,y)]), ([x,y],[C(x,y)])])').eliminate_equality()) - ([x],[A(x,x), ([],[B(x,x)]), ([x,y],[C(x,y)])]) - >>> print(dexpr('([x,y],[A(x,y)]) + ([],[B(x,y), (x=y)]) + ([],[C(x,y)])').eliminate_equality()) - ([x],[A(x,x), B(x,x), C(x,x)]) 
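The regression tests above exercise each DRT operation in isolation. As a minimal end-to-end sketch that only reuses calls already shown in this file (``DrtExpression.fromstring``, merging with ``+`` and ``simplify()``, ``resolve_anaphora()``, and ``fol()``), a two-sentence discourse such as "A man walks. He smiles." can be processed as follows; the predicate names are illustrative only:

    >>> from nltk.sem.drt import DrtExpression, resolve_anaphora
    >>> dexpr = DrtExpression.fromstring
    >>> first = dexpr(r'([x],[man(x), walk(x)])')    # "A man walks"
    >>> second = dexpr(r'([y],[smile(y), PRO(y)])')  # "He smiles"
    >>> discourse = (first + second).simplify()
    >>> print(discourse)
    ([x,y],[man(x), walk(x), smile(y), PRO(y)])
    >>> print(resolve_anaphora(discourse))
    ([x,y],[man(x), walk(x), smile(y), (y = x)])
    >>> print(first.fol())
    exists x.(man(x) & walk(x))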
- >>> print(dexpr('([x,y],[B(x,y)])+([x,y],[C(x,y)])').replace(Variable('y'), DrtVariableExpression(Variable('x')))) - (([x,y],[B(x,y)]) + ([x,y],[C(x,y)])) - >>> print(dexpr('(([x,y],[B(x,y)])+([],[C(x,y)]))+([],[D(x,y)])').replace(Variable('y'), DrtVariableExpression(Variable('x')))) - (([x,y],[B(x,y)]) + ([],[C(x,y)]) + ([],[D(x,y)])) - >>> print(dexpr('(([],[B(x,y)])+([],[C(x,y)]))+([],[D(x,y)])').replace(Variable('y'), DrtVariableExpression(Variable('x')))) - (([],[B(x,x)]) + ([],[C(x,x)]) + ([],[D(x,x)])) - >>> print(dexpr('(([],[B(x,y), ([x,y],[A(x,y)])])+([],[C(x,y)]))+([],[D(x,y)])').replace(Variable('y'), DrtVariableExpression(Variable('x'))).normalize()) - (([],[B(z3,z1), ([z2,z3],[A(z3,z2)])]) + ([],[C(z3,z1)]) + ([],[D(z3,z1)])) - - -Parse errors -============ - - >>> def parse_error(drtstring): - ... try: dexpr(drtstring) - ... except logic.LogicalExpressionException as e: print(e) - - >>> parse_error(r'') - End of input found. Expression expected. - - ^ - >>> parse_error(r'(') - End of input found. Expression expected. - ( - ^ - >>> parse_error(r'()') - Unexpected token: ')'. Expression expected. - () - ^ - >>> parse_error(r'([') - End of input found. Expected token ']'. - ([ - ^ - >>> parse_error(r'([,') - ',' is an illegal variable name. Constants may not be quantified. - ([, - ^ - >>> parse_error(r'([x,') - End of input found. Variable expected. - ([x, - ^ - >>> parse_error(r'([]') - End of input found. Expected token '['. - ([] - ^ - >>> parse_error(r'([][') - End of input found. Expected token ']'. - ([][ - ^ - >>> parse_error(r'([][,') - Unexpected token: ','. Expression expected. - ([][, - ^ - >>> parse_error(r'([][]') - End of input found. Expected token ')'. - ([][] - ^ - >>> parse_error(r'([x][man(x)]) |') - End of input found. Expression expected. - ([x][man(x)]) | - ^ - -Pretty Printing -=============== - - >>> dexpr(r"([],[])").pretty_print() - __ - | | - |--| - |__| - - >>> dexpr(r"([],[([x],[big(x), dog(x)]) -> ([],[bark(x)]) -([x],[walk(x)])])").pretty_print() - _____________________________ - | | - |-----------------------------| - | ________ _________ | - | | x | | | | - | (|--------| -> |---------|) | - | | big(x) | | bark(x) | | - | | dog(x) | |_________| | - | |________| | - | _________ | - | | x | | - | __ |---------| | - | | | walk(x) | | - | |_________| | - |_____________________________| - - >>> dexpr(r"([x,y],[x=y]) + ([z],[dog(z), walk(z)])").pretty_print() - _________ _________ - | x y | | z | - (|---------| + |---------|) - | (x = y) | | dog(z) | - |_________| | walk(z) | - |_________| - - >>> dexpr(r"([],[([x],[]) | ([y],[]) | ([z],[dog(z), walk(z)])])").pretty_print() - _______________________________ - | | - |-------------------------------| - | ___ ___ _________ | - | | x | | y | | z | | - | (|---| | |---| | |---------|) | - | |___| |___| | dog(z) | | - | | walk(z) | | - | |_________| | - |_______________________________| - - >>> dexpr(r"\P.\Q.(([x],[]) + P(x) + Q(x))(\x.([],[dog(x)]))").pretty_print() - ___ ________ - \ | x | \ | | - /\ P Q.(|---| + P(x) + Q(x))( /\ x.|--------|) - |___| | dog(x) | - |________| diff --git a/pipeline/nltk/test/featgram.doctest b/pipeline/nltk/test/featgram.doctest deleted file mode 100644 index 99e2735e8682ec270dc3039be39c2b3f2e3dc193..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/featgram.doctest +++ /dev/null @@ -1,610 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. 
For license information, see LICENSE.TXT - -========================= - Feature Grammar Parsing -========================= - -.. definitions from nltk_book/definitions.rst - -.. role:: feat - :class: feature -.. role:: fval - :class: fval -.. |rarr| unicode:: U+2192 .. right arrow -.. |dot| unicode:: U+2022 .. bullet -.. |pi| unicode:: U+03C0 - -Grammars can be parsed from strings. - - >>> import nltk - >>> from nltk import grammar, parse - >>> g = """ - ... % start DP - ... DP[AGR=?a] -> D[AGR=?a] N[AGR=?a] - ... D[AGR=[NUM='sg', PERS=3]] -> 'this' | 'that' - ... D[AGR=[NUM='pl', PERS=3]] -> 'these' | 'those' - ... D[AGR=[NUM='pl', PERS=1]] -> 'we' - ... D[AGR=[PERS=2]] -> 'you' - ... N[AGR=[NUM='sg', GND='m']] -> 'boy' - ... N[AGR=[NUM='pl', GND='m']] -> 'boys' - ... N[AGR=[NUM='sg', GND='f']] -> 'girl' - ... N[AGR=[NUM='pl', GND='f']] -> 'girls' - ... N[AGR=[NUM='sg']] -> 'student' - ... N[AGR=[NUM='pl']] -> 'students' - ... """ - >>> grammar = grammar.FeatureGrammar.fromstring(g) - >>> tokens = 'these girls'.split() - >>> parser = parse.FeatureEarleyChartParser(grammar) - >>> trees = parser.parse(tokens) - >>> for tree in trees: print(tree) - (DP[AGR=[GND='f', NUM='pl', PERS=3]] - (D[AGR=[NUM='pl', PERS=3]] these) - (N[AGR=[GND='f', NUM='pl']] girls)) - -In general, when we are trying to develop even a very small grammar, -it is convenient to put the rules in a file where they can be edited, -tested and revised. Let's assume that we have saved feat0cfg as a file named -``'feat0.fcfg'`` and placed it in the NLTK ``data`` directory. We can -inspect it as follows: - - >>> nltk.data.show_cfg('grammars/book_grammars/feat0.fcfg') - % start S - # ################### - # Grammar Productions - # ################### - # S expansion productions - S -> NP[NUM=?n] VP[NUM=?n] - # NP expansion productions - NP[NUM=?n] -> N[NUM=?n] - NP[NUM=?n] -> PropN[NUM=?n] - NP[NUM=?n] -> Det[NUM=?n] N[NUM=?n] - NP[NUM=pl] -> N[NUM=pl] - # VP expansion productions - VP[TENSE=?t, NUM=?n] -> IV[TENSE=?t, NUM=?n] - VP[TENSE=?t, NUM=?n] -> TV[TENSE=?t, NUM=?n] NP - # ################### - # Lexical Productions - # ################### - Det[NUM=sg] -> 'this' | 'every' - Det[NUM=pl] -> 'these' | 'all' - Det -> 'the' | 'some' | 'several' - PropN[NUM=sg]-> 'Kim' | 'Jody' - N[NUM=sg] -> 'dog' | 'girl' | 'car' | 'child' - N[NUM=pl] -> 'dogs' | 'girls' | 'cars' | 'children' - IV[TENSE=pres, NUM=sg] -> 'disappears' | 'walks' - TV[TENSE=pres, NUM=sg] -> 'sees' | 'likes' - IV[TENSE=pres, NUM=pl] -> 'disappear' | 'walk' - TV[TENSE=pres, NUM=pl] -> 'see' | 'like' - IV[TENSE=past] -> 'disappeared' | 'walked' - TV[TENSE=past] -> 'saw' | 'liked' - -Assuming we have saved feat0cfg as a file named -``'feat0.fcfg'``, the function ``parse.load_parser`` allows us to -read the grammar into NLTK, ready for use in parsing. - - - >>> cp = parse.load_parser('grammars/book_grammars/feat0.fcfg', trace=1) - >>> sent = 'Kim likes children' - >>> tokens = sent.split() - >>> tokens - ['Kim', 'likes', 'children'] - >>> trees = cp.parse(tokens) - |.Kim .like.chil.| - |[----] . .| [0:1] 'Kim' - |. [----] .| [1:2] 'likes' - |. . [----]| [2:3] 'children' - |[----] . .| [0:1] PropN[NUM='sg'] -> 'Kim' * - |[----] . .| [0:1] NP[NUM='sg'] -> PropN[NUM='sg'] * - |[----> . .| [0:1] S[] -> NP[NUM=?n] * VP[NUM=?n] {?n: 'sg'} - |. [----] .| [1:2] TV[NUM='sg', TENSE='pres'] -> 'likes' * - |. [----> .| [1:2] VP[NUM=?n, TENSE=?t] -> TV[NUM=?n, TENSE=?t] * NP[] {?n: 'sg', ?t: 'pres'} - |. . [----]| [2:3] N[NUM='pl'] -> 'children' * - |. . 
[----]| [2:3] NP[NUM='pl'] -> N[NUM='pl'] * - |. . [---->| [2:3] S[] -> NP[NUM=?n] * VP[NUM=?n] {?n: 'pl'} - |. [---------]| [1:3] VP[NUM='sg', TENSE='pres'] -> TV[NUM='sg', TENSE='pres'] NP[] * - |[==============]| [0:3] S[] -> NP[NUM='sg'] VP[NUM='sg'] * - >>> for tree in trees: print(tree) - (S[] - (NP[NUM='sg'] (PropN[NUM='sg'] Kim)) - (VP[NUM='sg', TENSE='pres'] - (TV[NUM='sg', TENSE='pres'] likes) - (NP[NUM='pl'] (N[NUM='pl'] children)))) - -The parser works directly with -the underspecified productions given by the grammar. That is, the -Predictor rule does not attempt to compile out all admissible feature -combinations before trying to expand the non-terminals on the left hand -side of a production. However, when the Scanner matches an input word -against a lexical production that has been predicted, the new edge will -typically contain fully specified features; e.g., the edge -[PropN[`num`:feat: = `sg`:fval:] |rarr| 'Kim', (0, 1)]. Recall from -Chapter 8 that the Fundamental (or Completer) Rule in -standard CFGs is used to combine an incomplete edge that's expecting a -nonterminal *B* with a following, complete edge whose left hand side -matches *B*. In our current setting, rather than checking for a -complete match, we test whether the expected category *B* will -unify with the left hand side *B'* of a following complete -edge. We will explain in more detail in Section 9.2 how -unification works; for the moment, it is enough to know that as a -result of unification, any variable values of features in *B* will be -instantiated by constant values in the corresponding feature structure -in *B'*, and these instantiated values will be used in the new edge -added by the Completer. This instantiation can be seen, for example, -in the edge -[NP [`num`:feat:\ =\ `sg`:fval:] |rarr| PropN[`num`:feat:\ =\ `sg`:fval:] |dot|, (0, 1)] -in Example 9.2, where the feature `num`:feat: has been assigned the value `sg`:fval:. - -Feature structures in NLTK are ... Atomic feature values can be strings or -integers. - - >>> fs1 = nltk.FeatStruct(TENSE='past', NUM='sg') - >>> print(fs1) - [ NUM = 'sg' ] - [ TENSE = 'past' ] - -We can think of a feature structure as being like a Python dictionary, -and access its values by indexing in the usual way. - - >>> fs1 = nltk.FeatStruct(PER=3, NUM='pl', GND='fem') - >>> print(fs1['GND']) - fem - -We can also define feature structures which have complex values, as -discussed earlier. - - >>> fs2 = nltk.FeatStruct(POS='N', AGR=fs1) - >>> print(fs2) - [ [ GND = 'fem' ] ] - [ AGR = [ NUM = 'pl' ] ] - [ [ PER = 3 ] ] - [ ] - [ POS = 'N' ] - >>> print(fs2['AGR']) - [ GND = 'fem' ] - [ NUM = 'pl' ] - [ PER = 3 ] - >>> print(fs2['AGR']['PER']) - 3 - -Feature structures can also be constructed using the ``parse()`` -method of the ``nltk.FeatStruct`` class. Note that in this case, atomic -feature values do not need to be enclosed in quotes. - - >>> f1 = nltk.FeatStruct("[NUMBER = sg]") - >>> f2 = nltk.FeatStruct("[PERSON = 3]") - >>> print(nltk.unify(f1, f2)) - [ NUMBER = 'sg' ] - [ PERSON = 3 ] - - >>> f1 = nltk.FeatStruct("[A = [B = b, D = d]]") - >>> f2 = nltk.FeatStruct("[A = [C = c, D = d]]") - >>> print(nltk.unify(f1, f2)) - [ [ B = 'b' ] ] - [ A = [ C = 'c' ] ] - [ [ D = 'd' ] ] - - -Feature Structures as Graphs ----------------------------- - -Feature structures are not inherently tied to linguistic objects; they are -general purpose structures for representing knowledge. 
For example, we -could encode information about a person in a feature structure: - - >>> person01 = nltk.FeatStruct("[NAME=Lee, TELNO='01 27 86 42 96',AGE=33]") - >>> print(person01) - [ AGE = 33 ] - [ NAME = 'Lee' ] - [ TELNO = '01 27 86 42 96' ] - -There are a number of notations for representing reentrancy in -matrix-style representations of feature structures. In NLTK, we adopt -the following convention: the first occurrence of a shared feature structure -is prefixed with an integer in parentheses, such as ``(1)``, and any -subsequent reference to that structure uses the notation -``->(1)``, as shown below. - - - >>> fs = nltk.FeatStruct("""[NAME=Lee, ADDRESS=(1)[NUMBER=74, STREET='rue Pascal'], - ... SPOUSE=[NAME=Kim, ADDRESS->(1)]]""") - >>> print(fs) - [ ADDRESS = (1) [ NUMBER = 74 ] ] - [ [ STREET = 'rue Pascal' ] ] - [ ] - [ NAME = 'Lee' ] - [ ] - [ SPOUSE = [ ADDRESS -> (1) ] ] - [ [ NAME = 'Kim' ] ] - -There can be any number of tags within a single feature structure. - - >>> fs3 = nltk.FeatStruct("[A=(1)[B=b], C=(2)[], D->(1), E->(2)]") - >>> print(fs3) - [ A = (1) [ B = 'b' ] ] - [ ] - [ C = (2) [] ] - [ ] - [ D -> (1) ] - [ E -> (2) ] - >>> fs1 = nltk.FeatStruct(NUMBER=74, STREET='rue Pascal') - >>> fs2 = nltk.FeatStruct(CITY='Paris') - >>> print(nltk.unify(fs1, fs2)) - [ CITY = 'Paris' ] - [ NUMBER = 74 ] - [ STREET = 'rue Pascal' ] - -Unification is symmetric: - - >>> nltk.unify(fs1, fs2) == nltk.unify(fs2, fs1) - True - -Unification is commutative: - - >>> fs3 = nltk.FeatStruct(TELNO='01 27 86 42 96') - >>> nltk.unify(nltk.unify(fs1, fs2), fs3) == nltk.unify(fs1, nltk.unify(fs2, fs3)) - True - -Unification between *FS*:math:`_0` and *FS*:math:`_1` will fail if the -two feature structures share a path |pi|, -but the value of |pi| in *FS*:math:`_0` is a distinct -atom from the value of |pi| in *FS*:math:`_1`. In NLTK, -this is implemented by setting the result of unification to be -``None``. - - >>> fs0 = nltk.FeatStruct(A='a') - >>> fs1 = nltk.FeatStruct(A='b') - >>> print(nltk.unify(fs0, fs1)) - None - -Now, if we look at how unification interacts with structure-sharing, -things become really interesting. - - - - >>> fs0 = nltk.FeatStruct("""[NAME=Lee, - ... ADDRESS=[NUMBER=74, - ... STREET='rue Pascal'], - ... SPOUSE= [NAME=Kim, - ... ADDRESS=[NUMBER=74, - ... STREET='rue Pascal']]]""") - >>> print(fs0) - [ ADDRESS = [ NUMBER = 74 ] ] - [ [ STREET = 'rue Pascal' ] ] - [ ] - [ NAME = 'Lee' ] - [ ] - [ [ ADDRESS = [ NUMBER = 74 ] ] ] - [ SPOUSE = [ [ STREET = 'rue Pascal' ] ] ] - [ [ ] ] - [ [ NAME = 'Kim' ] ] - - - >>> fs1 = nltk.FeatStruct("[SPOUSE=[ADDRESS=[CITY=Paris]]]") - >>> print(nltk.unify(fs0, fs1)) - [ ADDRESS = [ NUMBER = 74 ] ] - [ [ STREET = 'rue Pascal' ] ] - [ ] - [ NAME = 'Lee' ] - [ ] - [ [ [ CITY = 'Paris' ] ] ] - [ [ ADDRESS = [ NUMBER = 74 ] ] ] - [ SPOUSE = [ [ STREET = 'rue Pascal' ] ] ] - [ [ ] ] - [ [ NAME = 'Kim' ] ] - - >>> fs2 = nltk.FeatStruct("""[NAME=Lee, ADDRESS=(1)[NUMBER=74, STREET='rue Pascal'], - ... 
SPOUSE=[NAME=Kim, ADDRESS->(1)]]""") - - - >>> print(fs2) - [ ADDRESS = (1) [ NUMBER = 74 ] ] - [ [ STREET = 'rue Pascal' ] ] - [ ] - [ NAME = 'Lee' ] - [ ] - [ SPOUSE = [ ADDRESS -> (1) ] ] - [ [ NAME = 'Kim' ] ] - - - >>> print(nltk.unify(fs2, fs1)) - [ [ CITY = 'Paris' ] ] - [ ADDRESS = (1) [ NUMBER = 74 ] ] - [ [ STREET = 'rue Pascal' ] ] - [ ] - [ NAME = 'Lee' ] - [ ] - [ SPOUSE = [ ADDRESS -> (1) ] ] - [ [ NAME = 'Kim' ] ] - - - >>> fs1 = nltk.FeatStruct("[ADDRESS1=[NUMBER=74, STREET='rue Pascal']]") - >>> fs2 = nltk.FeatStruct("[ADDRESS1=?x, ADDRESS2=?x]") - >>> print(fs2) - [ ADDRESS1 = ?x ] - [ ADDRESS2 = ?x ] - >>> print(nltk.unify(fs1, fs2)) - [ ADDRESS1 = (1) [ NUMBER = 74 ] ] - [ [ STREET = 'rue Pascal' ] ] - [ ] - [ ADDRESS2 -> (1) ] - - - - - >>> sent = 'who do you claim that you like' - >>> tokens = sent.split() - >>> cp = parse.load_parser('grammars/book_grammars/feat1.fcfg', trace=1) - >>> trees = cp.parse(tokens) - |.w.d.y.c.t.y.l.| - |[-] . . . . . .| [0:1] 'who' - |. [-] . . . . .| [1:2] 'do' - |. . [-] . . . .| [2:3] 'you' - |. . . [-] . . .| [3:4] 'claim' - |. . . . [-] . .| [4:5] 'that' - |. . . . . [-] .| [5:6] 'you' - |. . . . . . [-]| [6:7] 'like' - |# . . . . . . .| [0:0] NP[]/NP[] -> * - |. # . . . . . .| [1:1] NP[]/NP[] -> * - |. . # . . . . .| [2:2] NP[]/NP[] -> * - |. . . # . . . .| [3:3] NP[]/NP[] -> * - |. . . . # . . .| [4:4] NP[]/NP[] -> * - |. . . . . # . .| [5:5] NP[]/NP[] -> * - |. . . . . . # .| [6:6] NP[]/NP[] -> * - |. . . . . . . #| [7:7] NP[]/NP[] -> * - |[-] . . . . . .| [0:1] NP[+WH] -> 'who' * - |[-> . . . . . .| [0:1] S[-INV] -> NP[] * VP[] {} - |[-> . . . . . .| [0:1] S[-INV]/?x[] -> NP[] * VP[]/?x[] {} - |[-> . . . . . .| [0:1] S[-INV] -> NP[] * S[]/NP[] {} - |. [-] . . . . .| [1:2] V[+AUX] -> 'do' * - |. [-> . . . . .| [1:2] S[+INV] -> V[+AUX] * NP[] VP[] {} - |. [-> . . . . .| [1:2] S[+INV]/?x[] -> V[+AUX] * NP[] VP[]/?x[] {} - |. [-> . . . . .| [1:2] VP[] -> V[+AUX] * VP[] {} - |. [-> . . . . .| [1:2] VP[]/?x[] -> V[+AUX] * VP[]/?x[] {} - |. . [-] . . . .| [2:3] NP[-WH] -> 'you' * - |. . [-> . . . .| [2:3] S[-INV] -> NP[] * VP[] {} - |. . [-> . . . .| [2:3] S[-INV]/?x[] -> NP[] * VP[]/?x[] {} - |. . [-> . . . .| [2:3] S[-INV] -> NP[] * S[]/NP[] {} - |. [---> . . . .| [1:3] S[+INV] -> V[+AUX] NP[] * VP[] {} - |. [---> . . . .| [1:3] S[+INV]/?x[] -> V[+AUX] NP[] * VP[]/?x[] {} - |. . . [-] . . .| [3:4] V[-AUX, SUBCAT='clause'] -> 'claim' * - |. . . [-> . . .| [3:4] VP[] -> V[-AUX, SUBCAT='clause'] * SBar[] {} - |. . . [-> . . .| [3:4] VP[]/?x[] -> V[-AUX, SUBCAT='clause'] * SBar[]/?x[] {} - |. . . . [-] . .| [4:5] Comp[] -> 'that' * - |. . . . [-> . .| [4:5] SBar[] -> Comp[] * S[-INV] {} - |. . . . [-> . .| [4:5] SBar[]/?x[] -> Comp[] * S[-INV]/?x[] {} - |. . . . . [-] .| [5:6] NP[-WH] -> 'you' * - |. . . . . [-> .| [5:6] S[-INV] -> NP[] * VP[] {} - |. . . . . [-> .| [5:6] S[-INV]/?x[] -> NP[] * VP[]/?x[] {} - |. . . . . [-> .| [5:6] S[-INV] -> NP[] * S[]/NP[] {} - |. . . . . . [-]| [6:7] V[-AUX, SUBCAT='trans'] -> 'like' * - |. . . . . . [->| [6:7] VP[] -> V[-AUX, SUBCAT='trans'] * NP[] {} - |. . . . . . [->| [6:7] VP[]/?x[] -> V[-AUX, SUBCAT='trans'] * NP[]/?x[] {} - |. . . . . . [-]| [6:7] VP[]/NP[] -> V[-AUX, SUBCAT='trans'] NP[]/NP[] * - |. . . . . [---]| [5:7] S[-INV]/NP[] -> NP[] VP[]/NP[] * - |. . . . [-----]| [4:7] SBar[]/NP[] -> Comp[] S[-INV]/NP[] * - |. . . [-------]| [3:7] VP[]/NP[] -> V[-AUX, SUBCAT='clause'] SBar[]/NP[] * - |. . [---------]| [2:7] S[-INV]/NP[] -> NP[] VP[]/NP[] * - |. 
[-----------]| [1:7] S[+INV]/NP[] -> V[+AUX] NP[] VP[]/NP[] * - |[=============]| [0:7] S[-INV] -> NP[] S[]/NP[] * - - >>> trees = list(trees) - >>> for tree in trees: print(tree) - (S[-INV] - (NP[+WH] who) - (S[+INV]/NP[] - (V[+AUX] do) - (NP[-WH] you) - (VP[]/NP[] - (V[-AUX, SUBCAT='clause'] claim) - (SBar[]/NP[] - (Comp[] that) - (S[-INV]/NP[] - (NP[-WH] you) - (VP[]/NP[] (V[-AUX, SUBCAT='trans'] like) (NP[]/NP[] ))))))) - -A different parser should give the same parse trees, but perhaps in a different order: - - >>> cp2 = parse.load_parser('grammars/book_grammars/feat1.fcfg', trace=1, - ... parser=parse.FeatureEarleyChartParser) - >>> trees2 = cp2.parse(tokens) - |.w.d.y.c.t.y.l.| - |[-] . . . . . .| [0:1] 'who' - |. [-] . . . . .| [1:2] 'do' - |. . [-] . . . .| [2:3] 'you' - |. . . [-] . . .| [3:4] 'claim' - |. . . . [-] . .| [4:5] 'that' - |. . . . . [-] .| [5:6] 'you' - |. . . . . . [-]| [6:7] 'like' - |> . . . . . . .| [0:0] S[-INV] -> * NP[] VP[] {} - |> . . . . . . .| [0:0] S[-INV]/?x[] -> * NP[] VP[]/?x[] {} - |> . . . . . . .| [0:0] S[-INV] -> * NP[] S[]/NP[] {} - |> . . . . . . .| [0:0] S[-INV] -> * Adv[+NEG] S[+INV] {} - |> . . . . . . .| [0:0] S[+INV] -> * V[+AUX] NP[] VP[] {} - |> . . . . . . .| [0:0] S[+INV]/?x[] -> * V[+AUX] NP[] VP[]/?x[] {} - |> . . . . . . .| [0:0] NP[+WH] -> * 'who' {} - |[-] . . . . . .| [0:1] NP[+WH] -> 'who' * - |[-> . . . . . .| [0:1] S[-INV] -> NP[] * VP[] {} - |[-> . . . . . .| [0:1] S[-INV]/?x[] -> NP[] * VP[]/?x[] {} - |[-> . . . . . .| [0:1] S[-INV] -> NP[] * S[]/NP[] {} - |. > . . . . . .| [1:1] S[-INV]/?x[] -> * NP[] VP[]/?x[] {} - |. > . . . . . .| [1:1] S[+INV]/?x[] -> * V[+AUX] NP[] VP[]/?x[] {} - |. > . . . . . .| [1:1] V[+AUX] -> * 'do' {} - |. > . . . . . .| [1:1] VP[]/?x[] -> * V[-AUX, SUBCAT='trans'] NP[]/?x[] {} - |. > . . . . . .| [1:1] VP[]/?x[] -> * V[-AUX, SUBCAT='clause'] SBar[]/?x[] {} - |. > . . . . . .| [1:1] VP[]/?x[] -> * V[+AUX] VP[]/?x[] {} - |. > . . . . . .| [1:1] VP[] -> * V[-AUX, SUBCAT='intrans'] {} - |. > . . . . . .| [1:1] VP[] -> * V[-AUX, SUBCAT='trans'] NP[] {} - |. > . . . . . .| [1:1] VP[] -> * V[-AUX, SUBCAT='clause'] SBar[] {} - |. > . . . . . .| [1:1] VP[] -> * V[+AUX] VP[] {} - |. [-] . . . . .| [1:2] V[+AUX] -> 'do' * - |. [-> . . . . .| [1:2] S[+INV]/?x[] -> V[+AUX] * NP[] VP[]/?x[] {} - |. [-> . . . . .| [1:2] VP[]/?x[] -> V[+AUX] * VP[]/?x[] {} - |. [-> . . . . .| [1:2] VP[] -> V[+AUX] * VP[] {} - |. . > . . . . .| [2:2] VP[] -> * V[-AUX, SUBCAT='intrans'] {} - |. . > . . . . .| [2:2] VP[] -> * V[-AUX, SUBCAT='trans'] NP[] {} - |. . > . . . . .| [2:2] VP[] -> * V[-AUX, SUBCAT='clause'] SBar[] {} - |. . > . . . . .| [2:2] VP[] -> * V[+AUX] VP[] {} - |. . > . . . . .| [2:2] VP[]/?x[] -> * V[-AUX, SUBCAT='trans'] NP[]/?x[] {} - |. . > . . . . .| [2:2] VP[]/?x[] -> * V[-AUX, SUBCAT='clause'] SBar[]/?x[] {} - |. . > . . . . .| [2:2] VP[]/?x[] -> * V[+AUX] VP[]/?x[] {} - |. . > . . . . .| [2:2] NP[-WH] -> * 'you' {} - |. . [-] . . . .| [2:3] NP[-WH] -> 'you' * - |. [---> . . . .| [1:3] S[+INV]/?x[] -> V[+AUX] NP[] * VP[]/?x[] {} - |. . . > . . . .| [3:3] VP[]/?x[] -> * V[-AUX, SUBCAT='trans'] NP[]/?x[] {} - |. . . > . . . .| [3:3] VP[]/?x[] -> * V[-AUX, SUBCAT='clause'] SBar[]/?x[] {} - |. . . > . . . .| [3:3] VP[]/?x[] -> * V[+AUX] VP[]/?x[] {} - |. . . > . . . .| [3:3] V[-AUX, SUBCAT='clause'] -> * 'claim' {} - |. . . [-] . . .| [3:4] V[-AUX, SUBCAT='clause'] -> 'claim' * - |. . . [-> . . .| [3:4] VP[]/?x[] -> V[-AUX, SUBCAT='clause'] * SBar[]/?x[] {} - |. . . . > . . 
.| [4:4] SBar[]/?x[] -> * Comp[] S[-INV]/?x[] {} - |. . . . > . . .| [4:4] Comp[] -> * 'that' {} - |. . . . [-] . .| [4:5] Comp[] -> 'that' * - |. . . . [-> . .| [4:5] SBar[]/?x[] -> Comp[] * S[-INV]/?x[] {} - |. . . . . > . .| [5:5] S[-INV]/?x[] -> * NP[] VP[]/?x[] {} - |. . . . . > . .| [5:5] NP[-WH] -> * 'you' {} - |. . . . . [-] .| [5:6] NP[-WH] -> 'you' * - |. . . . . [-> .| [5:6] S[-INV]/?x[] -> NP[] * VP[]/?x[] {} - |. . . . . . > .| [6:6] VP[]/?x[] -> * V[-AUX, SUBCAT='trans'] NP[]/?x[] {} - |. . . . . . > .| [6:6] VP[]/?x[] -> * V[-AUX, SUBCAT='clause'] SBar[]/?x[] {} - |. . . . . . > .| [6:6] VP[]/?x[] -> * V[+AUX] VP[]/?x[] {} - |. . . . . . > .| [6:6] V[-AUX, SUBCAT='trans'] -> * 'like' {} - |. . . . . . [-]| [6:7] V[-AUX, SUBCAT='trans'] -> 'like' * - |. . . . . . [->| [6:7] VP[]/?x[] -> V[-AUX, SUBCAT='trans'] * NP[]/?x[] {} - |. . . . . . . #| [7:7] NP[]/NP[] -> * - |. . . . . . [-]| [6:7] VP[]/NP[] -> V[-AUX, SUBCAT='trans'] NP[]/NP[] * - |. . . . . [---]| [5:7] S[-INV]/NP[] -> NP[] VP[]/NP[] * - |. . . . [-----]| [4:7] SBar[]/NP[] -> Comp[] S[-INV]/NP[] * - |. . . [-------]| [3:7] VP[]/NP[] -> V[-AUX, SUBCAT='clause'] SBar[]/NP[] * - |. [-----------]| [1:7] S[+INV]/NP[] -> V[+AUX] NP[] VP[]/NP[] * - |[=============]| [0:7] S[-INV] -> NP[] S[]/NP[] * - - >>> sorted(trees) == sorted(trees2) - True - - -Let's load a German grammar: - - >>> cp = parse.load_parser('grammars/book_grammars/german.fcfg', trace=0) - >>> sent = 'die Katze sieht den Hund' - >>> tokens = sent.split() - >>> trees = cp.parse(tokens) - >>> for tree in trees: print(tree) - (S[] - (NP[AGR=[GND='fem', NUM='sg', PER=3], CASE='nom'] - (Det[AGR=[GND='fem', NUM='sg', PER=3], CASE='nom'] die) - (N[AGR=[GND='fem', NUM='sg', PER=3]] Katze)) - (VP[AGR=[NUM='sg', PER=3]] - (TV[AGR=[NUM='sg', PER=3], OBJCASE='acc'] sieht) - (NP[AGR=[GND='masc', NUM='sg', PER=3], CASE='acc'] - (Det[AGR=[GND='masc', NUM='sg', PER=3], CASE='acc'] den) - (N[AGR=[GND='masc', NUM='sg', PER=3]] Hund)))) - -Grammar with Binding Operators ------------------------------- -The bindop.fcfg grammar is a semantic grammar that uses lambda -calculus. Each element has a core semantics, which is a single lambda -calculus expression; and a set of binding operators, which bind -variables. - -In order to make the binding operators work right, they need to -instantiate their bound variable every time they are added to the -chart. To do this, we use a special subclass of `Chart`, called -`InstantiateVarsChart`. - - >>> from nltk.parse.featurechart import InstantiateVarsChart - >>> cp = parse.load_parser('grammars/sample_grammars/bindop.fcfg', trace=1, - ... 
chart_class=InstantiateVarsChart) - >>> print(cp.grammar()) - Grammar with 15 productions (start state = S[]) - S[SEM=[BO={?b1+?b2}, CORE=]] -> NP[SEM=[BO=?b1, CORE=?subj]] VP[SEM=[BO=?b2, CORE=?vp]] - VP[SEM=[BO={?b1+?b2}, CORE=]] -> TV[SEM=[BO=?b1, CORE=?v]] NP[SEM=[BO=?b2, CORE=?obj]] - VP[SEM=?s] -> IV[SEM=?s] - NP[SEM=[BO={?b1+?b2+{bo(?det(?n),@x)}}, CORE=<@x>]] -> Det[SEM=[BO=?b1, CORE=?det]] N[SEM=[BO=?b2, CORE=?n]] - Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] -> 'a' - N[SEM=[BO={/}, CORE=]] -> 'dog' - N[SEM=[BO={/}, CORE=]] -> 'cat' - N[SEM=[BO={/}, CORE=]] -> 'mouse' - IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> 'barks' - IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> 'eats' - IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> 'walks' - TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] -> 'feeds' - TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] -> 'walks' - NP[SEM=[BO={bo(\P.P(John),@x)}, CORE=<@x>]] -> 'john' - NP[SEM=[BO={bo(\P.P(John),@x)}, CORE=<@x>]] -> 'alex' - -A simple intransitive sentence: - - >>> from nltk.sem import logic - >>> logic._counter._value = 100 - - >>> trees = cp.parse('john barks'.split()) - |. john.barks.| - |[-----] .| [0:1] 'john' - |. [-----]| [1:2] 'barks' - |[-----] .| [0:1] NP[SEM=[BO={bo(\P.P(John),z101)}, CORE=]] -> 'john' * - |[-----> .| [0:1] S[SEM=[BO={?b1+?b2}, CORE=]] -> NP[SEM=[BO=?b1, CORE=?subj]] * VP[SEM=[BO=?b2, CORE=?vp]] {?b1: {bo(\P.P(John),z2)}, ?subj: } - |. [-----]| [1:2] IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> 'barks' * - |. [-----]| [1:2] VP[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] * - |[===========]| [0:2] S[SEM=[BO={bo(\P.P(John),z2)}, CORE=]] -> NP[SEM=[BO={bo(\P.P(John),z2)}, CORE=]] VP[SEM=[BO={/}, CORE=<\x.bark(x)>]] * - >>> for tree in trees: print(tree) - (S[SEM=[BO={bo(\P.P(John),z2)}, CORE=]] - (NP[SEM=[BO={bo(\P.P(John),z101)}, CORE=]] john) - (VP[SEM=[BO={/}, CORE=<\x.bark(x)>]] - (IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] barks))) - -A transitive sentence: - - >>> trees = cp.parse('john feeds a dog'.split()) - |.joh.fee. a .dog.| - |[---] . . .| [0:1] 'john' - |. [---] . .| [1:2] 'feeds' - |. . [---] .| [2:3] 'a' - |. . . [---]| [3:4] 'dog' - |[---] . . .| [0:1] NP[SEM=[BO={bo(\P.P(John),z102)}, CORE=]] -> 'john' * - |[---> . . .| [0:1] S[SEM=[BO={?b1+?b2}, CORE=]] -> NP[SEM=[BO=?b1, CORE=?subj]] * VP[SEM=[BO=?b2, CORE=?vp]] {?b1: {bo(\P.P(John),z2)}, ?subj: } - |. [---] . .| [1:2] TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] -> 'feeds' * - |. [---> . .| [1:2] VP[SEM=[BO={?b1+?b2}, CORE=]] -> TV[SEM=[BO=?b1, CORE=?v]] * NP[SEM=[BO=?b2, CORE=?obj]] {?b1: {/}, ?v: } - |. . [---] .| [2:3] Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] -> 'a' * - |. . [---> .| [2:3] NP[SEM=[BO={?b1+?b2+{bo(?det(?n),@x)}}, CORE=<@x>]] -> Det[SEM=[BO=?b1, CORE=?det]] * N[SEM=[BO=?b2, CORE=?n]] {?b1: {/}, ?det: } - |. . . [---]| [3:4] N[SEM=[BO={/}, CORE=]] -> 'dog' * - |. . [-------]| [2:4] NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z103)}, CORE=]] -> Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] N[SEM=[BO={/}, CORE=]] * - |. . [------->| [2:4] S[SEM=[BO={?b1+?b2}, CORE=]] -> NP[SEM=[BO=?b1, CORE=?subj]] * VP[SEM=[BO=?b2, CORE=?vp]] {?b1: {bo(\P.exists x.(dog(x) & P(x)),z2)}, ?subj: } - |. 
[-----------]| [1:4] VP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2)}, CORE=<\y.feed(y,z2)>]] -> TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2)}, CORE=]] * - |[===============]| [0:4] S[SEM=[BO={bo(\P.P(John),z2), bo(\P.exists x.(dog(x) & P(x)),z3)}, CORE=]] -> NP[SEM=[BO={bo(\P.P(John),z2)}, CORE=]] VP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z3)}, CORE=<\y.feed(y,z3)>]] * - - >>> for tree in trees: print(tree) - (S[SEM=[BO={bo(\P.P(John),z2), bo(\P.exists x.(dog(x) & P(x)),z3)}, CORE=]] - (NP[SEM=[BO={bo(\P.P(John),z102)}, CORE=]] john) - (VP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2)}, CORE=<\y.feed(y,z2)>]] - (TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] feeds) - (NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z103)}, CORE=]] - (Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] a) - (N[SEM=[BO={/}, CORE=]] dog)))) - -Turn down the verbosity: - - >>> cp = parse.load_parser('grammars/sample_grammars/bindop.fcfg', trace=0, - ... chart_class=InstantiateVarsChart) - -Reuse the same lexical item twice: - - >>> trees = cp.parse('john feeds john'.split()) - >>> for tree in trees: print(tree) - (S[SEM=[BO={bo(\P.P(John),z2), bo(\P.P(John),z3)}, CORE=]] - (NP[SEM=[BO={bo(\P.P(John),z104)}, CORE=]] john) - (VP[SEM=[BO={bo(\P.P(John),z2)}, CORE=<\y.feed(y,z2)>]] - (TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] feeds) - (NP[SEM=[BO={bo(\P.P(John),z105)}, CORE=]] john))) - - >>> trees = cp.parse('a dog feeds a dog'.split()) - >>> for tree in trees: print(tree) - (S[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2), bo(\P.exists x.(dog(x) & P(x)),z3)}, CORE=]] - (NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z106)}, CORE=]] - (Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] a) - (N[SEM=[BO={/}, CORE=]] dog)) - (VP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2)}, CORE=<\y.feed(y,z2)>]] - (TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] feeds) - (NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z107)}, CORE=]] - (Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] a) - (N[SEM=[BO={/}, CORE=]] dog)))) diff --git a/pipeline/nltk/test/featstruct.doctest b/pipeline/nltk/test/featstruct.doctest deleted file mode 100644 index e6062d4fb31a9894d7cae48d9d834e78e5e37a6a..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/featstruct.doctest +++ /dev/null @@ -1,1229 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -================================== - Feature Structures & Unification -================================== - >>> from nltk.featstruct import FeatStruct - >>> from nltk.sem.logic import Variable, VariableExpression, Expression - -.. note:: For now, featstruct uses the older lambdalogic semantics - module. Eventually, it should be updated to use the new first - order predicate logic module. - -Overview -~~~~~~~~ -A feature structure is a mapping from feature identifiers to feature -values, where feature values can be simple values (like strings or -ints), nested feature structures, or variables: - - >>> fs1 = FeatStruct(number='singular', person=3) - >>> print(fs1) - [ number = 'singular' ] - [ person = 3 ] - -Feature structure may be nested: - - >>> fs2 = FeatStruct(type='NP', agr=fs1) - >>> print(fs2) - [ agr = [ number = 'singular' ] ] - [ [ person = 3 ] ] - [ ] - [ type = 'NP' ] - -Variables are used to indicate that two features should be assigned -the same value. For example, the following feature structure requires -that the feature fs3['agr']['number'] be bound to the same value as the -feature fs3['subj']['number']. 
- - >>> fs3 = FeatStruct(agr=FeatStruct(number=Variable('?n')), - ... subj=FeatStruct(number=Variable('?n'))) - >>> print(fs3) - [ agr = [ number = ?n ] ] - [ ] - [ subj = [ number = ?n ] ] - -Feature structures are typically used to represent partial information -about objects. A feature name that is not mapped to a value stands -for a feature whose value is unknown (*not* a feature without a -value). Two feature structures that represent (potentially -overlapping) information about the same object can be combined by -*unification*. - - >>> print(fs2.unify(fs3)) - [ agr = [ number = 'singular' ] ] - [ [ person = 3 ] ] - [ ] - [ subj = [ number = 'singular' ] ] - [ ] - [ type = 'NP' ] - -When two inconsistent feature structures are unified, the unification -fails and returns ``None``. - - >>> fs4 = FeatStruct(agr=FeatStruct(person=1)) - >>> print(fs4.unify(fs2)) - None - >>> print(fs2.unify(fs4)) - None - -.. - >>> del fs1, fs2, fs3, fs4 # clean-up - -Feature Structure Types ------------------------ -There are actually two types of feature structure: - -- *feature dictionaries*, implemented by `FeatDict`, act like - Python dictionaries. Feature identifiers may be strings or - instances of the `Feature` class. -- *feature lists*, implemented by `FeatList`, act like Python - lists. Feature identifiers are integers. - -When you construct a feature structure using the `FeatStruct` -constructor, it will automatically decide which type is appropriate: - - >>> type(FeatStruct(number='singular')) - - >>> type(FeatStruct([1,2,3])) - - -Usually, we will just use feature dictionaries; but sometimes feature -lists can be useful too. Two feature lists will unify with each other -only if they have equal lengths, and all of their feature values -match. If you wish to write a feature list that contains 'unknown' -values, you must use variables: - - >>> fs1 = FeatStruct([1,2,Variable('?y')]) - >>> fs2 = FeatStruct([1,Variable('?x'),3]) - >>> fs1.unify(fs2) - [1, 2, 3] - -.. - >>> del fs1, fs2 # clean-up - -Parsing Feature Structure Strings ---------------------------------- -Feature structures can be constructed directly from strings. Often, -this is more convenient than constructing them directly. NLTK can -parse most feature strings to produce the corresponding feature -structures. (But you must restrict your base feature values to -strings, ints, logic expressions (`nltk.sem.logic.Expression`), and a -few other types discussed below). - -Feature dictionaries are written like Python dictionaries, except that -keys are not put in quotes; and square brackets (``[]``) are used -instead of braces (``{}``): - - >>> FeatStruct('[tense="past", agr=[number="sing", person=3]]') - [agr=[number='sing', person=3], tense='past'] - -If a feature value is a single alphanumeric word, then it does not -need to be quoted -- it will be automatically treated as a string: - - >>> FeatStruct('[tense=past, agr=[number=sing, person=3]]') - [agr=[number='sing', person=3], tense='past'] - -Feature lists are written like python lists: - - >>> FeatStruct('[1, 2, 3]') - [1, 2, 3] - -The expression ``[]`` is treated as an empty feature dictionary, not -an empty feature list: - - >>> type(FeatStruct('[]')) - - -Feature Paths -------------- -Features can be specified using *feature paths*, or tuples of feature -identifiers that specify path through the nested feature structures to -a value. - - >>> fs1 = FeatStruct('[x=1, y=[1,2,[z=3]]]') - >>> fs1['y'] - [1, 2, [z=3]] - >>> fs1['y', 2] - [z=3] - >>> fs1['y', 2, 'z'] - 3 - -.. 
- >>> del fs1 # clean-up - -Reentrance ----------- -Feature structures may contain reentrant feature values. A *reentrant -feature value* is a single feature structure that can be accessed via -multiple feature paths. - - >>> fs1 = FeatStruct(x='val') - >>> fs2 = FeatStruct(a=fs1, b=fs1) - >>> print(fs2) - [ a = (1) [ x = 'val' ] ] - [ ] - [ b -> (1) ] - >>> fs2 - [a=(1)[x='val'], b->(1)] - -As you can see, reentrane is displayed by marking a feature structure -with a unique identifier, in this case ``(1)``, the first time it is -encountered; and then using the special form ``var -> id`` whenever it -is encountered again. You can use the same notation to directly -create reentrant feature structures from strings. - - >>> FeatStruct('[a=(1)[], b->(1), c=[d->(1)]]') - [a=(1)[], b->(1), c=[d->(1)]] - -Reentrant feature structures may contain cycles: - - >>> fs3 = FeatStruct('(1)[a->(1)]') - >>> fs3['a', 'a', 'a', 'a'] - (1)[a->(1)] - >>> fs3['a', 'a', 'a', 'a'] is fs3 - True - -Unification preserves the reentrance relations imposed by both of the -unified feature structures. In the feature structure resulting from -unification, any modifications to a reentrant feature value will be -visible using any of its feature paths. - - >>> fs3.unify(FeatStruct('[a=[b=12], c=33]')) - (1)[a->(1), b=12, c=33] - -.. - >>> del fs1, fs2, fs3 # clean-up - -Feature Structure Equality --------------------------- -Two feature structures are considered equal if they assign the same -values to all features, *and* they contain the same reentrances. - - >>> fs1 = FeatStruct('[a=(1)[x=1], b->(1)]') - >>> fs2 = FeatStruct('[a=(1)[x=1], b->(1)]') - >>> fs3 = FeatStruct('[a=[x=1], b=[x=1]]') - >>> fs1 == fs1, fs1 is fs1 - (True, True) - >>> fs1 == fs2, fs1 is fs2 - (True, False) - >>> fs1 == fs3, fs1 is fs3 - (False, False) - -Note that this differs from how Python dictionaries and lists define -equality -- in particular, Python dictionaries and lists ignore -reentrance relations. To test two feature structures for equality -while ignoring reentrance relations, use the `equal_values()` method: - - >>> fs1.equal_values(fs1) - True - >>> fs1.equal_values(fs2) - True - >>> fs1.equal_values(fs3) - True - -.. - >>> del fs1, fs2, fs3 # clean-up - -Feature Value Sets & Feature Value Tuples ------------------------------------------ -`nltk.featstruct` defines two new data types that are intended to be -used as feature values: `FeatureValueTuple` and `FeatureValueSet`. -Both of these types are considered base values -- i.e., unification -does *not* apply to them. However, variable binding *does* apply to -any values that they contain. - -Feature value tuples are written with parentheses: - - >>> fs1 = FeatStruct('[x=(?x, ?y)]') - >>> fs1 - [x=(?x, ?y)] - >>> fs1.substitute_bindings({Variable('?x'): 1, Variable('?y'): 2}) - [x=(1, 2)] - -Feature sets are written with braces: - - >>> fs1 = FeatStruct('[x={?x, ?y}]') - >>> fs1 - [x={?x, ?y}] - >>> fs1.substitute_bindings({Variable('?x'): 1, Variable('?y'): 2}) - [x={1, 2}] - -In addition to the basic feature value tuple & set classes, nltk -defines feature value unions (for sets) and feature value -concatenations (for tuples). These are written using '+', and can be -used to combine sets & tuples: - - >>> fs1 = FeatStruct('[x=((1, 2)+?z), z=?z]') - >>> fs1 - [x=((1, 2)+?z), z=?z] - >>> fs1.unify(FeatStruct('[z=(3, 4, 5)]')) - [x=(1, 2, 3, 4, 5), z=(3, 4, 5)] - -Thus, feature value tuples and sets can be used to build up tuples -and sets of values over the course of unification. 
For example, when -parsing sentences using a semantic feature grammar, feature sets or -feature tuples can be used to build a list of semantic predicates as -the sentence is parsed. - -As was mentioned above, unification does not apply to feature value -tuples and sets. One reason for this that it's impossible to define a -single correct answer for unification when concatenation is used. -Consider the following example: - - >>> fs1 = FeatStruct('[x=(1, 2, 3, 4)]') - >>> fs2 = FeatStruct('[x=(?a+?b), a=?a, b=?b]') - -If unification applied to feature tuples, then the unification -algorithm would have to arbitrarily choose how to divide the tuple -(1,2,3,4) into two parts. Instead, the unification algorithm refuses -to make this decision, and simply unifies based on value. Because -(1,2,3,4) is not equal to (?a+?b), fs1 and fs2 will not unify: - - >>> print(fs1.unify(fs2)) - None - -If you need a list-like structure that unification does apply to, use -`FeatList`. - -.. - >>> del fs1, fs2 # clean-up - -Light-weight Feature Structures -------------------------------- -Many of the functions defined by `nltk.featstruct` can be applied -directly to simple Python dictionaries and lists, rather than to -full-fledged `FeatDict` and `FeatList` objects. In other words, -Python ``dicts`` and ``lists`` can be used as "light-weight" feature -structures. - - >>> # Note: pprint prints dicts sorted - >>> from pprint import pprint - >>> from nltk.featstruct import unify - >>> pprint(unify(dict(x=1, y=dict()), dict(a='a', y=dict(b='b')))) - {'a': 'a', 'x': 1, 'y': {'b': 'b'}} - -However, you should keep in mind the following caveats: - -- Python dictionaries & lists ignore reentrance when checking for - equality between values. But two FeatStructs with different - reentrances are considered nonequal, even if all their base - values are equal. - -- FeatStructs can be easily frozen, allowing them to be used as - keys in hash tables. Python dictionaries and lists can not. - -- FeatStructs display reentrance in their string representations; - Python dictionaries and lists do not. - -- FeatStructs may *not* be mixed with Python dictionaries and lists - (e.g., when performing unification). - -- FeatStructs provide a number of useful methods, such as `walk()` - and `cyclic()`, which are not available for Python dicts & lists. - -In general, if your feature structures will contain any reentrances, -or if you plan to use them as dictionary keys, it is strongly -recommended that you use full-fledged `FeatStruct` objects. - -Custom Feature Values ---------------------- -The abstract base class `CustomFeatureValue` can be used to define new -base value types that have custom unification methods. For example, -the following feature value type encodes a range, and defines -unification as taking the intersection on the ranges: - - >>> from functools import total_ordering - >>> from nltk.featstruct import CustomFeatureValue, UnificationFailure - >>> @total_ordering - ... class Range(CustomFeatureValue): - ... def __init__(self, low, high): - ... assert low <= high - ... self.low = low - ... self.high = high - ... def unify(self, other): - ... if not isinstance(other, Range): - ... return UnificationFailure - ... low = max(self.low, other.low) - ... high = min(self.high, other.high) - ... if low <= high: return Range(low, high) - ... else: return UnificationFailure - ... def __repr__(self): - ... 
return '(%s>> fs1 = FeatStruct(x=Range(5,8), y=FeatStruct(z=Range(7,22))) - >>> print(fs1.unify(FeatStruct(x=Range(6, 22)))) - [ x = (6>> print(fs1.unify(FeatStruct(x=Range(9, 12)))) - None - >>> print(fs1.unify(FeatStruct(x=12))) - None - >>> print(fs1.unify(FeatStruct('[x=?x, y=[z=?x]]'))) - [ x = (7>> fs1 = FeatStruct(a=1, b=2, c=3) - >>> fs2 = FeatStruct(x=fs1, y='x') - -Feature structures support all dictionary methods (excluding the class -method `dict.fromkeys()`). Non-mutating methods: - - >>> sorted(fs2.keys()) # keys() - ['x', 'y'] - >>> sorted(fs2.values()) # values() - [[a=1, b=2, c=3], 'x'] - >>> sorted(fs2.items()) # items() - [('x', [a=1, b=2, c=3]), ('y', 'x')] - >>> sorted(fs2) # __iter__() - ['x', 'y'] - >>> 'a' in fs2, 'x' in fs2 # __contains__() - (False, True) - >>> fs2.has_key('a'), fs2.has_key('x') # has_key() - (False, True) - >>> fs2['x'], fs2['y'] # __getitem__() - ([a=1, b=2, c=3], 'x') - >>> fs2['a'] # __getitem__() - Traceback (most recent call last): - . . . - KeyError: 'a' - >>> fs2.get('x'), fs2.get('y'), fs2.get('a') # get() - ([a=1, b=2, c=3], 'x', None) - >>> fs2.get('x', 'hello'), fs2.get('a', 'hello') # get() - ([a=1, b=2, c=3], 'hello') - >>> len(fs1), len(fs2) # __len__ - (3, 2) - >>> fs2.copy() # copy() - [x=[a=1, b=2, c=3], y='x'] - >>> fs2.copy() is fs2 # copy() - False - -Note: by default, `FeatStruct.copy()` does a deep copy. Use -`FeatStruct.copy(deep=False)` for a shallow copy. - -.. - >>> del fs1, fs2 # clean-up. - -Dictionary access methods (mutating) ------------------------------------- - >>> fs1 = FeatStruct(a=1, b=2, c=3) - >>> fs2 = FeatStruct(x=fs1, y='x') - -Setting features (`__setitem__()`) - - >>> fs1['c'] = 5 - >>> fs1 - [a=1, b=2, c=5] - >>> fs1['x'] = 12 - >>> fs1 - [a=1, b=2, c=5, x=12] - >>> fs2['x', 'a'] = 2 - >>> fs2 - [x=[a=2, b=2, c=5, x=12], y='x'] - >>> fs1 - [a=2, b=2, c=5, x=12] - -Deleting features (`__delitem__()`) - - >>> del fs1['x'] - >>> fs1 - [a=2, b=2, c=5] - >>> del fs2['x', 'a'] - >>> fs1 - [b=2, c=5] - -`setdefault()`: - - >>> fs1.setdefault('b', 99) - 2 - >>> fs1 - [b=2, c=5] - >>> fs1.setdefault('x', 99) - 99 - >>> fs1 - [b=2, c=5, x=99] - -`update()`: - - >>> fs2.update({'a':'A', 'b':'B'}, c='C') - >>> fs2 - [a='A', b='B', c='C', x=[b=2, c=5, x=99], y='x'] - -`pop()`: - - >>> fs2.pop('a') - 'A' - >>> fs2 - [b='B', c='C', x=[b=2, c=5, x=99], y='x'] - >>> fs2.pop('a') - Traceback (most recent call last): - . . . - KeyError: 'a' - >>> fs2.pop('a', 'foo') - 'foo' - >>> fs2 - [b='B', c='C', x=[b=2, c=5, x=99], y='x'] - -`clear()`: - - >>> fs1.clear() - >>> fs1 - [] - >>> fs2 - [b='B', c='C', x=[], y='x'] - -`popitem()`: - - >>> sorted([fs2.popitem() for i in range(len(fs2))]) - [('b', 'B'), ('c', 'C'), ('x', []), ('y', 'x')] - >>> fs2 - [] - -Once a feature structure has been frozen, it may not be mutated. - - >>> fs1 = FeatStruct('[x=1, y=2, z=[a=3]]') - >>> fs1.freeze() - >>> fs1.frozen() - True - >>> fs1['z'].frozen() - True - - >>> fs1['x'] = 5 - Traceback (most recent call last): - . . . - ValueError: Frozen FeatStructs may not be modified. - >>> del fs1['x'] - Traceback (most recent call last): - . . . - ValueError: Frozen FeatStructs may not be modified. - >>> fs1.clear() - Traceback (most recent call last): - . . . - ValueError: Frozen FeatStructs may not be modified. - >>> fs1.pop('x') - Traceback (most recent call last): - . . . - ValueError: Frozen FeatStructs may not be modified. - >>> fs1.popitem() - Traceback (most recent call last): - . . . - ValueError: Frozen FeatStructs may not be modified. 
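Besides guarding against mutation, freezing makes a feature structure hashable, which is what allows it to be used as a key in a hash table, as mentioned earlier. A minimal sketch; the ``lexicon`` mapping is purely illustrative:

    >>> entry = FeatStruct('[tense="past", agr=[number="sg", person=3]]')
    >>> entry.freeze()
    >>> entry.frozen()
    True
    >>> lexicon = {entry: 'walked'}    # hashable once frozen
    >>> lexicon[entry]
    'walked'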
- >>> fs1.setdefault('x') - Traceback (most recent call last): - . . . - ValueError: Frozen FeatStructs may not be modified. - >>> fs1.update(z=22) - Traceback (most recent call last): - . . . - ValueError: Frozen FeatStructs may not be modified. - -.. - >>> del fs1, fs2 # clean-up. - -Feature Paths -------------- -Make sure that __getitem__ with feature paths works as intended: - - >>> fs1 = FeatStruct(a=1, b=2, - ... c=FeatStruct( - ... d=FeatStruct(e=12), - ... f=FeatStruct(g=55, h='hello'))) - >>> fs1[()] - [a=1, b=2, c=[d=[e=12], f=[g=55, h='hello']]] - >>> fs1['a'], fs1[('a',)] - (1, 1) - >>> fs1['c','d','e'] - 12 - >>> fs1['c','f','g'] - 55 - -Feature paths that select unknown features raise KeyError: - - >>> fs1['c', 'f', 'e'] - Traceback (most recent call last): - . . . - KeyError: ('c', 'f', 'e') - >>> fs1['q', 'p'] - Traceback (most recent call last): - . . . - KeyError: ('q', 'p') - -Feature paths that try to go 'through' a feature that's not a feature -structure raise KeyError: - - >>> fs1['a', 'b'] - Traceback (most recent call last): - . . . - KeyError: ('a', 'b') - -Feature paths can go through reentrant structures: - - >>> fs2 = FeatStruct('(1)[a=[b=[c->(1), d=5], e=11]]') - >>> fs2['a', 'b', 'c', 'a', 'e'] - 11 - >>> fs2['a', 'b', 'c', 'a', 'b', 'd'] - 5 - >>> fs2[tuple('abcabcabcabcabcabcabcabcabcabca')] - (1)[b=[c=[a->(1)], d=5], e=11] - -Indexing requires strings, `Feature`\s, or tuples; other types raise a -TypeError: - - >>> fs2[12] - Traceback (most recent call last): - . . . - TypeError: Expected feature name or path. Got 12. - >>> fs2[list('abc')] - Traceback (most recent call last): - . . . - TypeError: Expected feature name or path. Got ['a', 'b', 'c']. - -Feature paths can also be used with `get()`, `has_key()`, and -`__contains__()`. - - >>> fpath1 = tuple('abcabc') - >>> fpath2 = tuple('abcabz') - >>> fs2.get(fpath1), fs2.get(fpath2) - ((1)[a=[b=[c->(1), d=5], e=11]], None) - >>> fpath1 in fs2, fpath2 in fs2 - (True, False) - >>> fs2.has_key(fpath1), fs2.has_key(fpath2) - (True, False) - -.. - >>> del fs1, fs2 # clean-up - -Reading Feature Structures --------------------------- - -Empty feature struct: - - >>> FeatStruct('[]') - [] - -Test features with integer values: - - >>> FeatStruct('[a=12, b=-33, c=0]') - [a=12, b=-33, c=0] - -Test features with string values. Either single or double quotes may -be used. Strings are evaluated just like python strings -- in -particular, you can use escape sequences and 'u' and 'r' prefixes, and -triple-quoted strings. - - >>> FeatStruct('[a="", b="hello", c="\'", d=\'\', e=\'"\']') - [a='', b='hello', c="'", d='', e='"'] - >>> FeatStruct(r'[a="\\", b="\"", c="\x6f\\y", d="12"]') - [a='\\', b='"', c='o\\y', d='12'] - >>> FeatStruct(r'[b=r"a\b\c"]') - [b='a\\b\\c'] - >>> FeatStruct('[x="""a"""]') - [x='a'] - -Test parsing of reentrant feature structures. - - >>> FeatStruct('[a=(1)[], b->(1)]') - [a=(1)[], b->(1)] - >>> FeatStruct('[a=(1)[x=1, y=2], b->(1)]') - [a=(1)[x=1, y=2], b->(1)] - -Test parsing of cyclic feature structures. - - >>> FeatStruct('[a=(1)[b->(1)]]') - [a=(1)[b->(1)]] - >>> FeatStruct('(1)[a=[b=[c->(1)]]]') - (1)[a=[b=[c->(1)]]] - -Strings of the form "+name" and "-name" may be used to specify boolean -values. 
- - >>> FeatStruct('[-bar, +baz, +foo]') - [-bar, +baz, +foo] - -None, True, and False are recognized as values: - - >>> FeatStruct('[bar=True, baz=False, foo=None]') - [+bar, -baz, foo=None] - -Special features: - - >>> FeatStruct('NP/VP') - NP[]/VP[] - >>> FeatStruct('?x/?x') - ?x[]/?x[] - >>> print(FeatStruct('VP[+fin, agr=?x, tense=past]/NP[+pl, agr=?x]')) - [ *type* = 'VP' ] - [ ] - [ [ *type* = 'NP' ] ] - [ *slash* = [ agr = ?x ] ] - [ [ pl = True ] ] - [ ] - [ agr = ?x ] - [ fin = True ] - [ tense = 'past' ] - -Here the slash feature gets coerced: - - >>> FeatStruct('[*slash*=a, x=b, *type*="NP"]') - NP[x='b']/a[] - - >>> FeatStruct('NP[sem=]/NP') - NP[sem=]/NP[] - >>> FeatStruct('S[sem=]') - S[sem=] - >>> print(FeatStruct('NP[sem=]/NP')) - [ *type* = 'NP' ] - [ ] - [ *slash* = [ *type* = 'NP' ] ] - [ ] - [ sem = ] - -Playing with ranges: - - >>> from nltk.featstruct import RangeFeature, FeatStructReader - >>> width = RangeFeature('width') - >>> reader = FeatStructReader([width]) - >>> fs1 = reader.fromstring('[*width*=-5:12]') - >>> fs2 = reader.fromstring('[*width*=2:123]') - >>> fs3 = reader.fromstring('[*width*=-7:-2]') - >>> fs1.unify(fs2) - [*width*=(2, 12)] - >>> fs1.unify(fs3) - [*width*=(-5, -2)] - >>> print(fs2.unify(fs3)) # no overlap in width. - None - -The slash feature has a default value of 'False': - - >>> print(FeatStruct('NP[]/VP').unify(FeatStruct('NP[]'), trace=1)) - - Unification trace: - / NP[]/VP[] - |\ NP[] - | - | Unify feature: *type* - | / 'NP' - | |\ 'NP' - | | - | +-->'NP' - | - | Unify feature: *slash* - | / VP[] - | |\ False - | | - X X <-- FAIL - None - -The demo structures from category.py. They all parse, but they don't -do quite the right thing, -- ?x vs x. - - >>> FeatStruct(pos='n', agr=FeatStruct(number='pl', gender='f')) - [agr=[gender='f', number='pl'], pos='n'] - >>> FeatStruct(r'NP[sem=]/NP') - NP[sem=]/NP[] - >>> FeatStruct(r'S[sem=]') - S[sem=] - >>> FeatStruct('?x/?x') - ?x[]/?x[] - >>> FeatStruct('VP[+fin, agr=?x, tense=past]/NP[+pl, agr=?x]') - VP[agr=?x, +fin, tense='past']/NP[agr=?x, +pl] - >>> FeatStruct('S[sem = ]') - S[sem=] - - >>> FeatStruct('S') - S[] - -The parser also includes support for reading sets and tuples. - - >>> FeatStruct('[x={1,2,2,2}, y={/}]') - [x={1, 2}, y={/}] - >>> FeatStruct('[x=(1,2,2,2), y=()]') - [x=(1, 2, 2, 2), y=()] - >>> print(FeatStruct('[x=(1,[z=(1,2,?x)],?z,{/})]')) - [ x = (1, [ z = (1, 2, ?x) ], ?z, {/}) ] - -Note that we can't put a featstruct inside a tuple, because doing so -would hash it, and it's not frozen yet: - - >>> print(FeatStruct('[x={[]}]')) - Traceback (most recent call last): - . . . - TypeError: FeatStructs must be frozen before they can be hashed. - -There's a special syntax for taking the union of sets: "{...+...}". -The elements should only be variables or sets. - - >>> FeatStruct('[x={?a+?b+{1,2,3}}]') - [x={?a+?b+{1, 2, 3}}] - -There's a special syntax for taking the concatenation of tuples: -"(...+...)". The elements should only be variables or tuples. - - >>> FeatStruct('[x=(?a+?b+(1,2,3))]') - [x=(?a+?b+(1, 2, 3))] - -Parsing gives helpful messages if your string contains an error. - - >>> FeatStruct('[a=, b=5]]') - Traceback (most recent call last): - . . . - ValueError: Error parsing feature structure - [a=, b=5]] - ^ Expected value - >>> FeatStruct('[a=12 22, b=33]') - Traceback (most recent call last): - . . . - ValueError: Error parsing feature structure - [a=12 22, b=33] - ^ Expected comma - >>> FeatStruct('[a=5] [b=6]') - Traceback (most recent call last): - . . . 
- ValueError: Error parsing feature structure - [a=5] [b=6] - ^ Expected end of string - >>> FeatStruct(' *++*') - Traceback (most recent call last): - . . . - ValueError: Error parsing feature structure - *++* - ^ Expected open bracket or identifier - >>> FeatStruct('[x->(1)]') - Traceback (most recent call last): - . . . - ValueError: Error parsing feature structure - [x->(1)] - ^ Expected bound identifier - >>> FeatStruct('[x->y]') - Traceback (most recent call last): - . . . - ValueError: Error parsing feature structure - [x->y] - ^ Expected identifier - >>> FeatStruct('') - Traceback (most recent call last): - . . . - ValueError: Error parsing feature structure - - ^ Expected open bracket or identifier - - -Unification ------------ -Very simple unifications give the expected results: - - >>> FeatStruct().unify(FeatStruct()) - [] - >>> FeatStruct(number='singular').unify(FeatStruct()) - [number='singular'] - >>> FeatStruct().unify(FeatStruct(number='singular')) - [number='singular'] - >>> FeatStruct(number='singular').unify(FeatStruct(person=3)) - [number='singular', person=3] - -Merging nested structures: - - >>> fs1 = FeatStruct('[A=[B=b]]') - >>> fs2 = FeatStruct('[A=[C=c]]') - >>> fs1.unify(fs2) - [A=[B='b', C='c']] - >>> fs2.unify(fs1) - [A=[B='b', C='c']] - -A basic case of reentrant unification - - >>> fs4 = FeatStruct('[A=(1)[B=b], E=[F->(1)]]') - >>> fs5 = FeatStruct("[A=[C='c'], E=[F=[D='d']]]") - >>> fs4.unify(fs5) - [A=(1)[B='b', C='c', D='d'], E=[F->(1)]] - >>> fs5.unify(fs4) - [A=(1)[B='b', C='c', D='d'], E=[F->(1)]] - -More than 2 paths to a value - - >>> fs1 = FeatStruct("[a=[],b=[],c=[],d=[]]") - >>> fs2 = FeatStruct('[a=(1)[], b->(1), c->(1), d->(1)]') - >>> fs1.unify(fs2) - [a=(1)[], b->(1), c->(1), d->(1)] - -fs1[a] gets unified with itself - - >>> fs1 = FeatStruct('[x=(1)[], y->(1)]') - >>> fs2 = FeatStruct('[x=(1)[], y->(1)]') - >>> fs1.unify(fs2) - [x=(1)[], y->(1)] - -Bound variables should get forwarded appropriately - - >>> fs1 = FeatStruct('[A=(1)[X=x], B->(1), C=?cvar, D=?dvar]') - >>> fs2 = FeatStruct('[A=(1)[Y=y], B=(2)[Z=z], C->(1), D->(2)]') - >>> fs1.unify(fs2) - [A=(1)[X='x', Y='y', Z='z'], B->(1), C->(1), D->(1)] - >>> fs2.unify(fs1) - [A=(1)[X='x', Y='y', Z='z'], B->(1), C->(1), D->(1)] - -Cyclic structure created by unification. - - >>> fs1 = FeatStruct('[F=(1)[], G->(1)]') - >>> fs2 = FeatStruct('[F=[H=(2)[]], G->(2)]') - >>> fs3 = fs1.unify(fs2) - >>> fs3 - [F=(1)[H->(1)], G->(1)] - >>> fs3['F'] is fs3['G'] - True - >>> fs3['F'] is fs3['G']['H'] - True - >>> fs3['F'] is fs3['G']['H']['H'] - True - >>> fs3['F'] is fs3['F']['H']['H']['H']['H']['H']['H']['H']['H'] - True - -Cyclic structure created w/ variables. - - >>> fs1 = FeatStruct('[F=[H=?x]]') - >>> fs2 = FeatStruct('[F=?x]') - >>> fs3 = fs1.unify(fs2, rename_vars=False) - >>> fs3 - [F=(1)[H->(1)]] - >>> fs3['F'] is fs3['F']['H'] - True - >>> fs3['F'] is fs3['F']['H']['H'] - True - >>> fs3['F'] is fs3['F']['H']['H']['H']['H']['H']['H']['H']['H'] - True - -Unifying w/ a cyclic feature structure. - - >>> fs4 = FeatStruct('[F=[H=[H=[H=(1)[]]]], K->(1)]') - >>> fs3.unify(fs4) - [F=(1)[H->(1)], K->(1)] - >>> fs4.unify(fs3) - [F=(1)[H->(1)], K->(1)] - -Variable bindings should preserve reentrance. 
- - >>> bindings = {} - >>> fs1 = FeatStruct("[a=?x]") - >>> fs2 = fs1.unify(FeatStruct("[a=[]]"), bindings) - >>> fs2['a'] is bindings[Variable('?x')] - True - >>> fs2.unify(FeatStruct("[b=?x]"), bindings) - [a=(1)[], b->(1)] - -Aliased variable tests - - >>> fs1 = FeatStruct("[a=?x, b=?x]") - >>> fs2 = FeatStruct("[b=?y, c=?y]") - >>> bindings = {} - >>> fs3 = fs1.unify(fs2, bindings) - >>> fs3 - [a=?x, b=?x, c=?x] - >>> bindings - {Variable('?y'): Variable('?x')} - >>> fs3.unify(FeatStruct("[a=1]")) - [a=1, b=1, c=1] - -If we keep track of the bindings, then we can use the same variable -over multiple calls to unify. - - >>> bindings = {} - >>> fs1 = FeatStruct('[a=?x]') - >>> fs2 = fs1.unify(FeatStruct('[a=[]]'), bindings) - >>> fs2.unify(FeatStruct('[b=?x]'), bindings) - [a=(1)[], b->(1)] - >>> bindings - {Variable('?x'): []} - -.. - >>> del fs1, fs2, fs3, fs4, fs5 # clean-up - -Unification Bindings --------------------- - - >>> bindings = {} - >>> fs1 = FeatStruct('[a=?x]') - >>> fs2 = FeatStruct('[a=12]') - >>> fs3 = FeatStruct('[b=?x]') - >>> fs1.unify(fs2, bindings) - [a=12] - >>> bindings - {Variable('?x'): 12} - >>> fs3.substitute_bindings(bindings) - [b=12] - >>> fs3 # substitute_bindings didn't mutate fs3. - [b=?x] - >>> fs2.unify(fs3, bindings) - [a=12, b=12] - - >>> bindings = {} - >>> fs1 = FeatStruct('[a=?x, b=1]') - >>> fs2 = FeatStruct('[a=5, b=?x]') - >>> fs1.unify(fs2, bindings) - [a=5, b=1] - >>> sorted(bindings.items()) - [(Variable('?x'), 5), (Variable('?x2'), 1)] - -.. - >>> del fs1, fs2, fs3 # clean-up - -Expressions ------------ - - >>> e = Expression.fromstring('\\P y.P(z,y)') - >>> fs1 = FeatStruct(x=e, y=Variable('z')) - >>> fs2 = FeatStruct(y=VariableExpression(Variable('John'))) - >>> fs1.unify(fs2) - [x=<\P y.P(John,y)>, y=] - -Remove Variables ----------------- - - >>> FeatStruct('[a=?x, b=12, c=[d=?y]]').remove_variables() - [b=12, c=[]] - >>> FeatStruct('(1)[a=[b=?x,c->(1)]]').remove_variables() - (1)[a=[c->(1)]] - -Equality & Hashing ------------------- -The `equal_values` method checks whether two feature structures assign -the same value to every feature. If the optional argument -``check_reentrances`` is supplied, then it also returns false if there -is any difference in the reentrances. - - >>> a = FeatStruct('(1)[x->(1)]') - >>> b = FeatStruct('(1)[x->(1)]') - >>> c = FeatStruct('(1)[x=[x->(1)]]') - >>> d = FeatStruct('[x=(1)[x->(1)]]') - >>> e = FeatStruct('(1)[x=[x->(1), y=1], y=1]') - >>> def compare(x,y): - ... assert x.equal_values(y, True) == y.equal_values(x, True) - ... assert x.equal_values(y, False) == y.equal_values(x, False) - ... if x.equal_values(y, True): - ... assert x.equal_values(y, False) - ... print('equal values, same reentrance') - ... elif x.equal_values(y, False): - ... print('equal values, different reentrance') - ... else: - ... print('different values') - - >>> compare(a, a) - equal values, same reentrance - >>> compare(a, b) - equal values, same reentrance - >>> compare(a, c) - equal values, different reentrance - >>> compare(a, d) - equal values, different reentrance - >>> compare(c, d) - equal values, different reentrance - >>> compare(a, e) - different values - >>> compare(c, e) - different values - >>> compare(d, e) - different values - >>> compare(e, e) - equal values, same reentrance - -Feature structures may not be hashed until they are frozen: - - >>> hash(a) - Traceback (most recent call last): - . . . - TypeError: FeatStructs must be frozen before they can be hashed. 
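Freezing is what makes hash-based collections usable; a short sketch (plain Python rather than doctest form) of frozen feature structures as set members, relying on the usual hash contract that equal values hash equally:

    from nltk.featstruct import FeatStruct

    fs1 = FeatStruct('[a=1, b=[c=2]]')
    fs2 = FeatStruct('[a=1, b=[c=2]]')
    fs1.freeze()
    fs2.freeze()

    # Equal frozen structures hash equally, so the set keeps one element.
    seen = {fs1, fs2}
    print(len(seen))      # 1
    print(fs1 == fs2)     # True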
- >>> a.freeze() - >>> v = hash(a) - -Feature structures define hash consistently. The following example -looks at the hash value for each (fs1,fs2) pair; if their hash values -are not equal, then they must not be equal. If their hash values are -equal, then display a message, and indicate whether their values are -indeed equal. Note that c and d currently have the same hash value, -even though they are not equal. That is not a bug, strictly speaking, -but it wouldn't be a bad thing if it changed. - - >>> for fstruct in (a, b, c, d, e): - ... fstruct.freeze() - >>> for fs1_name in 'abcde': - ... for fs2_name in 'abcde': - ... fs1 = locals()[fs1_name] - ... fs2 = locals()[fs2_name] - ... if hash(fs1) != hash(fs2): - ... assert fs1 != fs2 - ... else: - ... print('%s and %s have the same hash value,' % - ... (fs1_name, fs2_name)) - ... if fs1 == fs2: print('and are equal') - ... else: print('and are not equal') - a and a have the same hash value, and are equal - a and b have the same hash value, and are equal - b and a have the same hash value, and are equal - b and b have the same hash value, and are equal - c and c have the same hash value, and are equal - c and d have the same hash value, and are not equal - d and c have the same hash value, and are not equal - d and d have the same hash value, and are equal - e and e have the same hash value, and are equal - -.. - >>> del a, b, c, d, e, v # clean-up - -Tracing -------- - - >>> fs1 = FeatStruct('[a=[b=(1)[], c=?x], d->(1), e=[f=?x]]') - >>> fs2 = FeatStruct('[a=(1)[c="C"], e=[g->(1)]]') - >>> fs1.unify(fs2, trace=True) - - Unification trace: - / [a=[b=(1)[], c=?x], d->(1), e=[f=?x]] - |\ [a=(1)[c='C'], e=[g->(1)]] - | - | Unify feature: a - | / [b=[], c=?x] - | |\ [c='C'] - | | - | | Unify feature: a.c - | | / ?x - | | |\ 'C' - | | | - | | +-->Variable('?x') - | | - | +-->[b=[], c=?x] - | Bindings: {?x: 'C'} - | - | Unify feature: e - | / [f=?x] - | |\ [g=[c='C']] - | | - | +-->[f=?x, g=[b=[], c=?x]] - | Bindings: {?x: 'C'} - | - +-->[a=(1)[b=(2)[], c='C'], d->(2), e=[f='C', g->(1)]] - Bindings: {?x: 'C'} - [a=(1)[b=(2)[], c='C'], d->(2), e=[f='C', g->(1)]] - >>> - >>> fs1 = FeatStruct('[a=?x, b=?z, c=?z]') - >>> fs2 = FeatStruct('[a=?y, b=?y, c=?q]') - >>> #fs1.unify(fs2, trace=True) - >>> - -.. - >>> del fs1, fs2 # clean-up - -Unification on Dicts & Lists ----------------------------- -It's possible to do unification on dictionaries: - - >>> from nltk.featstruct import unify - >>> pprint(unify(dict(x=1, y=dict(z=2)), dict(x=1, q=5)), width=1) - {'q': 5, 'x': 1, 'y': {'z': 2}} - -It's possible to do unification on lists as well: - - >>> unify([1, 2, 3], [1, Variable('x'), 3]) - [1, 2, 3] - -Mixing dicts and lists is fine: - - >>> pprint(unify([dict(x=1, y=dict(z=2)),3], [dict(x=1, q=5),3]), - ... width=1) - [{'q': 5, 'x': 1, 'y': {'z': 2}}, 3] - -Mixing dicts and FeatStructs is discouraged: - - >>> unify(dict(x=1), FeatStruct(x=1)) - Traceback (most recent call last): - . . . - ValueError: Mixing FeatStruct objects with Python dicts and lists is not supported. - -But you can do it if you really want, by explicitly stating that both -dictionaries and FeatStructs should be treated as feature structures: - - >>> unify(dict(x=1), FeatStruct(x=1), fs_class=(dict, FeatStruct)) - {'x': 1} - -Finding Conflicts ------------------ - - >>> from nltk.featstruct import conflicts - >>> fs1 = FeatStruct('[a=[b=(1)[c=2], d->(1), e=[f->(1)]]]') - >>> fs2 = FeatStruct('[a=[b=[c=[x=5]], d=[c=2], e=[f=[c=3]]]]') - >>> for path in conflicts(fs1, fs2): - ... 
print('%-8s: %r vs %r' % ('.'.join(path), fs1[path], fs2[path])) - a.b.c : 2 vs [x=5] - a.e.f.c : 2 vs 3 - -.. - >>> del fs1, fs2 # clean-up - -Retracting Bindings -------------------- - - >>> from nltk.featstruct import retract_bindings - >>> bindings = {} - >>> fs1 = FeatStruct('[a=?x, b=[c=?y]]') - >>> fs2 = FeatStruct('[a=(1)[c=[d=1]], b->(1)]') - >>> fs3 = fs1.unify(fs2, bindings) - >>> print(fs3) - [ a = (1) [ c = [ d = 1 ] ] ] - [ ] - [ b -> (1) ] - >>> pprint(bindings) - {Variable('?x'): [c=[d=1]], Variable('?y'): [d=1]} - >>> retract_bindings(fs3, bindings) - [a=?x, b=?x] - >>> pprint(bindings) - {Variable('?x'): [c=?y], Variable('?y'): [d=1]} - -Squashed Bugs -~~~~~~~~~~~~~ -In svn rev 5167, unifying two feature structures that used the same -variable would cause those variables to become aliased in the output. - - >>> fs1 = FeatStruct('[a=?x]') - >>> fs2 = FeatStruct('[b=?x]') - >>> fs1.unify(fs2) - [a=?x, b=?x2] - -There was a bug in svn revision 5172 that caused `rename_variables` to -rename variables to names that are already used. - - >>> FeatStruct('[a=?x, b=?x2]').rename_variables( - ... vars=[Variable('?x')]) - [a=?x3, b=?x2] - >>> fs1 = FeatStruct('[a=?x]') - >>> fs2 = FeatStruct('[a=?x, b=?x2]') - >>> fs1.unify(fs2) - [a=?x, b=?x2] - -There was a bug in svn rev 5167 that caused us to get the following -example wrong. Basically the problem was that we only followed -'forward' pointers for other, not self, when unifying two feature -structures. (nb: this test assumes that features are unified in -alphabetical order -- if they are not, it might pass even if the bug -is present.) - - >>> fs1 = FeatStruct('[a=[x=1], b=?x, c=?x]') - >>> fs2 = FeatStruct('[a=(1)[], b->(1), c=[x=2]]') - >>> print(fs1.unify(fs2)) - None - -.. - >>> del fs1, fs2 # clean-up diff --git a/pipeline/nltk/test/framenet.doctest b/pipeline/nltk/test/framenet.doctest deleted file mode 100644 index 337c348b923a0d3a95c2576f10da6347e7085e7a..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/framenet.doctest +++ /dev/null @@ -1,288 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -======== -FrameNet -======== - -The FrameNet corpus is a lexical database of English that is both human- -and machine-readable, based on annotating examples of how words are used -in actual texts. FrameNet is based on a theory of meaning called Frame -Semantics, deriving from the work of Charles J. Fillmore and colleagues. -The basic idea is straightforward: that the meanings of most words can -best be understood on the basis of a semantic frame: a description of a -type of event, relation, or entity and the participants in it. For -example, the concept of cooking typically involves a person doing the -cooking (Cook), the food that is to be cooked (Food), something to hold -the food while cooking (Container) and a source of heat -(Heating_instrument). In the FrameNet project, this is represented as a -frame called Apply_heat, and the Cook, Food, Heating_instrument and -Container are called frame elements (FEs). Words that evoke this frame, -such as fry, bake, boil, and broil, are called lexical units (LUs) of -the Apply_heat frame. The job of FrameNet is to define the frames -and to annotate sentences to show how the FEs fit syntactically around -the word that evokes the frame. 
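The Apply_heat example above can also be explored programmatically; a brief sketch, assuming the ``framenet_v17`` data package has been installed (the exact FE and LU inventories depend on the FrameNet release):

    from nltk.corpus import framenet as fn

    # nltk.download('framenet_v17') is required the first time
    frame = fn.frame('Apply_heat')            # look up a frame by name or numeric ID
    print(frame.name)                         # 'Apply_heat'
    print(sorted(frame.FE))                   # frame elements: 'Container', 'Cook', 'Food', ...
    print(sorted(frame.lexUnit)[:5])          # lexical units: 'bake.v', 'boil.v', ...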
- ------- -Frames ------- - -A Frame is a script-like conceptual structure that describes a -particular type of situation, object, or event along with the -participants and props that are needed for that Frame. For -example, the "Apply_heat" frame describes a common situation -involving a Cook, some Food, and a Heating_Instrument, and is -evoked by words such as bake, blanch, boil, broil, brown, -simmer, steam, etc. - -We call the roles of a Frame "frame elements" (FEs) and the -frame-evoking words are called "lexical units" (LUs). - -FrameNet includes relations between Frames. Several types of -relations are defined, of which the most important are: - -- Inheritance: An IS-A relation. The child frame is a subtype - of the parent frame, and each FE in the parent is bound to - a corresponding FE in the child. An example is the - "Revenge" frame which inherits from the - "Rewards_and_punishments" frame. - -- Using: The child frame presupposes the parent frame as - background, e.g the "Speed" frame "uses" (or presupposes) - the "Motion" frame; however, not all parent FEs need to be - bound to child FEs. - -- Subframe: The child frame is a subevent of a complex event - represented by the parent, e.g. the "Criminal_process" frame - has subframes of "Arrest", "Arraignment", "Trial", and - "Sentencing". - -- Perspective_on: The child frame provides a particular - perspective on an un-perspectivized parent frame. A pair of - examples consists of the "Hiring" and "Get_a_job" frames, - which perspectivize the "Employment_start" frame from the - Employer's and the Employee's point of view, respectively. - -To get a list of all of the Frames in FrameNet, you can use the -`frames()` function. If you supply a regular expression pattern to the -`frames()` function, you will get a list of all Frames whose names match -that pattern: - - >>> from pprint import pprint - >>> from operator import itemgetter - >>> from nltk.corpus import framenet as fn - >>> from nltk.corpus.reader.framenet import PrettyList - >>> x = fn.frames(r'(?i)crim') - >>> x.sort(key=itemgetter('ID')) - >>> x - [, , ...] - >>> PrettyList(sorted(x, key=itemgetter('ID'))) - [, , ...] - -To get the details of a particular Frame, you can use the `frame()` -function passing in the frame number: - - >>> from pprint import pprint - >>> from nltk.corpus import framenet as fn - >>> f = fn.frame(202) - >>> f.ID - 202 - >>> f.name - 'Arrest' - >>> f.definition - "Authorities charge a Suspect, who is under suspicion of having committed a crime..." - >>> len(f.lexUnit) - 11 - >>> pprint(sorted([x for x in f.FE])) - ['Authorities', - 'Charges', - 'Co-participant', - 'Manner', - 'Means', - 'Offense', - 'Place', - 'Purpose', - 'Source_of_legal_authority', - 'Suspect', - 'Time', - 'Type'] - >>> pprint(f.frameRelations) - [ Child=Arrest>, Component=Arrest>, ...] - -The `frame()` function shown above returns a dict object containing -detailed information about the Frame. See the documentation on the -`frame()` function for the specifics. - -You can also search for Frames by their Lexical Units (LUs). The -`frames_by_lemma()` function returns a list of all frames that contain -LUs in which the 'name' attribute of the LU matches the given regular -expression. Note that LU names are composed of "lemma.POS", where the -"lemma" part can be made up of either a single lexeme (e.g. 'run') or -multiple lexemes (e.g. 'a little') (see below). 
- - >>> PrettyList(sorted(fn.frames_by_lemma(r'(?i)a little'), key=itemgetter('ID'))) - [, ] - -------------- -Lexical Units -------------- - -A lexical unit (LU) is a pairing of a word with a meaning. For -example, the "Apply_heat" Frame describes a common situation -involving a Cook, some Food, and a Heating Instrument, and is -_evoked_ by words such as bake, blanch, boil, broil, brown, -simmer, steam, etc. These frame-evoking words are the LUs in the -Apply_heat frame. Each sense of a polysemous word is a different -LU. - -We have used the word "word" in talking about LUs. The reality -is actually rather complex. When we say that the word "bake" is -polysemous, we mean that the lemma "bake.v" (which has the -word-forms "bake", "bakes", "baked", and "baking") is linked to -three different frames: - -- Apply_heat: "Michelle baked the potatoes for 45 minutes." - -- Cooking_creation: "Michelle baked her mother a cake for her birthday." - -- Absorb_heat: "The potatoes have to bake for more than 30 minutes." - -These constitute three different LUs, with different -definitions. - -Multiword expressions such as "given name" and hyphenated words -like "shut-eye" can also be LUs. Idiomatic phrases such as -"middle of nowhere" and "give the slip (to)" are also defined as -LUs in the appropriate frames ("Isolated_places" and "Evading", -respectively), and their internal structure is not analyzed. - -Framenet provides multiple annotated examples of each sense of a -word (i.e. each LU). Moreover, the set of examples -(approximately 20 per LU) illustrates all of the combinatorial -possibilities of the lexical unit. - -Each LU is linked to a Frame, and hence to the other words which -evoke that Frame. This makes the FrameNet database similar to a -thesaurus, grouping together semantically similar words. - -In the simplest case, frame-evoking words are verbs such as -"fried" in: - - "Matilde fried the catfish in a heavy iron skillet." - -Sometimes event nouns may evoke a Frame. For example, -"reduction" evokes "Cause_change_of_scalar_position" in: - - "...the reduction of debt levels to $665 million from $2.6 billion." - -Adjectives may also evoke a Frame. For example, "asleep" may -evoke the "Sleep" frame as in: - - "They were asleep for hours." - -Many common nouns, such as artifacts like "hat" or "tower", -typically serve as dependents rather than clearly evoking their -own frames. - -Details for a specific lexical unit can be obtained using this class's -`lus()` function, which takes an optional regular expression -pattern that will be matched against the name of the lexical unit: - - >>> from pprint import pprint - >>> PrettyList(sorted(fn.lus(r'(?i)a little'), key=itemgetter('ID'))) - [, , ...] - -You can obtain detailed information on a particular LU by calling the -`lu()` function and passing in an LU's 'ID' number: - - >>> from pprint import pprint - >>> from nltk.corpus import framenet as fn - >>> fn.lu(256).name - 'foresee.v' - >>> fn.lu(256).definition - 'COD: be aware of beforehand; predict.' - >>> fn.lu(256).frame.name - 'Expectation' - >>> fn.lu(256).lexemes[0].name - 'foresee' - -Note that LU names take the form of a dotted string (e.g. "run.v" or "a -little.adv") in which a lemma precedes the "." and a part of speech -(POS) follows the dot. The lemma may be composed of a single lexeme -(e.g. "run") or of multiple lexemes (e.g. "a little"). 
The list of -POSs used in the LUs is: - -v - verb -n - noun -a - adjective -adv - adverb -prep - preposition -num - numbers -intj - interjection -art - article -c - conjunction -scon - subordinating conjunction - -For more detailed information about the info that is contained in the -dict that is returned by the `lu()` function, see the documentation on -the `lu()` function. - -------------------- -Annotated Documents -------------------- - -The FrameNet corpus contains a small set of annotated documents. A list -of these documents can be obtained by calling the `docs()` function: - - >>> from pprint import pprint - >>> from nltk.corpus import framenet as fn - >>> d = fn.docs('BellRinging')[0] - >>> d.corpname - 'PropBank' - >>> d.sentence[49] - full-text sentence (...) in BellRinging: - - - [POS] 17 tags - - [POS_tagset] PENN - - [text] + [annotationSet] - - `` I live in hopes that the ringers themselves will be drawn into - ***** ******* ***** - Desir Cause_t Cause - [1] [3] [2] - - that fuller life . - ****** - Comple - [4] - (Desir=Desiring, Cause_t=Cause_to_make_noise, Cause=Cause_motion, Comple=Completeness) - - - >>> d.sentence[49].annotationSet[1] - annotation set (...): - - [status] MANUAL - - [LU] (6605) hope.n in Desiring - - [frame] (366) Desiring - - [GF] 2 relations - - [PT] 2 phrases - - [text] + [Target] + [FE] + [Noun] - - `` I live in hopes that the ringers themselves will be drawn into - - ^^^^ ^^ ***** ---------------------------------------------- - E supp su Event - - that fuller life . - ----------------- - - (E=Experiencer, su=supp) - - diff --git a/pipeline/nltk/test/generate.doctest b/pipeline/nltk/test/generate.doctest deleted file mode 100644 index eee322d6d7811e46c5d4c17e7d2daf0ef2e314c2..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/generate.doctest +++ /dev/null @@ -1,78 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -=============================================== -Generating sentences from context-free grammars -=============================================== - -An example grammar: - - >>> from nltk.parse.generate import generate, demo_grammar - >>> from nltk import CFG - >>> grammar = CFG.fromstring(demo_grammar) - >>> print(grammar) - Grammar with 13 productions (start state = S) - S -> NP VP - NP -> Det N - PP -> P NP - VP -> 'slept' - VP -> 'saw' NP - VP -> 'walked' PP - Det -> 'the' - Det -> 'a' - N -> 'man' - N -> 'park' - N -> 'dog' - P -> 'in' - P -> 'with' - -The first 10 generated sentences: - - >>> for sentence in generate(grammar, n=10): - ... print(' '.join(sentence)) - the man slept - the man saw the man - the man saw the park - the man saw the dog - the man saw a man - the man saw a park - the man saw a dog - the man walked in the man - the man walked in the park - the man walked in the dog - -All sentences of max depth 4: - - >>> for sentence in generate(grammar, depth=4): - ... print(' '.join(sentence)) - the man slept - the park slept - the dog slept - a man slept - a park slept - a dog slept - -The number of sentences of different max depths: - - >>> len(list(generate(grammar, depth=3))) - 0 - >>> len(list(generate(grammar, depth=4))) - 6 - >>> len(list(generate(grammar, depth=5))) - 42 - >>> len(list(generate(grammar, depth=6))) - 114 - >>> len(list(generate(grammar))) - 114 - -Infinite grammars will throw a RecursionError when not bounded by some ``depth``: - - >>> grammar = CFG.fromstring(""" - ... S -> A B - ... A -> B - ... B -> "b" | A - ... 
""") - >>> list(generate(grammar)) - Traceback (most recent call last): - ... - RuntimeError: The grammar has rule(s) that yield infinite recursion! diff --git a/pipeline/nltk/test/gensim.doctest b/pipeline/nltk/test/gensim.doctest deleted file mode 100644 index 65d0c6a53f4ac5d209a8557bc4cec37e98ca1e4d..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/gensim.doctest +++ /dev/null @@ -1,141 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -======================================= -Demonstrate word embedding using Gensim -======================================= - - >>> from nltk.test.gensim_fixt import setup_module - >>> setup_module() - -We demonstrate three functions: -- Train the word embeddings using brown corpus; -- Load the pre-trained model and perform simple tasks; and -- Pruning the pre-trained binary model. - - >>> import gensim - ---------------- -Train the model ---------------- - -Here we train a word embedding using the Brown Corpus: - - >>> from nltk.corpus import brown - >>> train_set = brown.sents()[:10000] - >>> model = gensim.models.Word2Vec(train_set) - -It might take some time to train the model. So, after it is trained, it can be saved as follows: - - >>> model.save('brown.embedding') - >>> new_model = gensim.models.Word2Vec.load('brown.embedding') - -The model will be the list of words with their embedding. We can easily get the vector representation of a word. - - >>> len(new_model.wv['university']) - 100 - -There are some supporting functions already implemented in Gensim to manipulate with word embeddings. -For example, to compute the cosine similarity between 2 words: - - >>> new_model.wv.similarity('university','school') > 0.3 - True - ---------------------------- -Using the pre-trained model ---------------------------- - -NLTK includes a pre-trained model which is part of a model that is trained on 100 billion words from the Google News Dataset. -The full model is from https://code.google.com/p/word2vec/ (about 3 GB). - - >>> from nltk.data import find - >>> word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt')) - >>> model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False) - -We pruned the model to only include the most common words (~44k words). - - >>> len(model) - 43981 - -Each word is represented in the space of 300 dimensions: - - >>> len(model['university']) - 300 - -Finding the top n words that are similar to a target word is simple. The result is the list of n words with the score. - - >>> model.most_similar(positive=['university'], topn = 3) - [('universities', 0.70039...), ('faculty', 0.67809...), ('undergraduate', 0.65870...)] - -Finding a word that is not in a list is also supported, although, implementing this by yourself is simple. - - >>> model.doesnt_match('breakfast cereal dinner lunch'.split()) - 'cereal' - -Mikolov et al. (2013) figured out that word embedding captures much of syntactic and semantic regularities. For example, -the vector 'King - Man + Woman' is close to 'Queen' and 'Germany - Berlin + Paris' is close to 'France'. - - >>> model.most_similar(positive=['woman','king'], negative=['man'], topn = 1) - [('queen', 0.71181...)] - - >>> model.most_similar(positive=['Paris','Germany'], negative=['Berlin'], topn = 1) - [('France', 0.78840...)] - -We can visualize the word embeddings using t-SNE (https://lvdmaaten.github.io/tsne/). For this demonstration, we visualize the first 1000 words. 
- -| import numpy as np -| labels = [] -| count = 0 -| max_count = 1000 -| X = np.zeros(shape=(max_count,len(model['university']))) -| -| for term in model.index_to_key: -| X[count] = model[term] -| labels.append(term) -| count+= 1 -| if count >= max_count: break -| -| # It is recommended to use PCA first to reduce to ~50 dimensions -| from sklearn.decomposition import PCA -| pca = PCA(n_components=50) -| X_50 = pca.fit_transform(X) -| -| # Using TSNE to further reduce to 2 dimensions -| from sklearn.manifold import TSNE -| model_tsne = TSNE(n_components=2, random_state=0) -| Y = model_tsne.fit_transform(X_50) -| -| # Show the scatter plot -| import matplotlib.pyplot as plt -| plt.scatter(Y[:,0], Y[:,1], 20) -| -| # Add labels -| for label, x, y in zip(labels, Y[:, 0], Y[:, 1]): -| plt.annotate(label, xy = (x,y), xytext = (0, 0), textcoords = 'offset points', size = 10) -| -| plt.show() - ------------------------------- -Prune the trained binary model ------------------------------- - -Here is the supporting code to extract part of the binary model (GoogleNews-vectors-negative300.bin.gz) from https://code.google.com/p/word2vec/ -We use this code to get the `word2vec_sample` model. - -| import gensim -| # Load the binary model -| model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary = True) -| -| # Only output word that appear in the Brown corpus -| from nltk.corpus import brown -| words = set(brown.words()) -| print(len(words)) -| -| # Output presented word to a temporary file -| out_file = 'pruned.word2vec.txt' -| with open(out_file,'w') as f: -| word_presented = words.intersection(model.index_to_key) -| f.write('{} {}\n'.format(len(word_presented),len(model['word']))) -| -| for word in word_presented: -| f.write('{} {}\n'.format(word, ' '.join(str(value) for value in model[word]))) diff --git a/pipeline/nltk/test/gensim_fixt.py b/pipeline/nltk/test/gensim_fixt.py deleted file mode 100644 index ee6855f3d46863f07f7e137be3f2a0fc37e7dcc3..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/gensim_fixt.py +++ /dev/null @@ -1,4 +0,0 @@ -def setup_module(): - import pytest - - pytest.importorskip("gensim") diff --git a/pipeline/nltk/test/gluesemantics.doctest b/pipeline/nltk/test/gluesemantics.doctest deleted file mode 100644 index db502c01a14004ebbeee6434ef388a939259c980..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/gluesemantics.doctest +++ /dev/null @@ -1,383 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. 
For license information, see LICENSE.TXT - -============================================================================== - Glue Semantics -============================================================================== - - - -====================== -Linear logic -====================== - - >>> from nltk.sem import logic - >>> from nltk.sem.glue import * - >>> from nltk.sem.linearlogic import * - - >>> from nltk.sem.linearlogic import Expression - >>> read_expr = Expression.fromstring - -Parser - - >>> print(read_expr(r'f')) - f - >>> print(read_expr(r'(g -o f)')) - (g -o f) - >>> print(read_expr(r'(g -o (h -o f))')) - (g -o (h -o f)) - >>> print(read_expr(r'((g -o G) -o G)')) - ((g -o G) -o G) - >>> print(read_expr(r'(g -o f)(g)')) - (g -o f)(g) - >>> print(read_expr(r'((g -o G) -o G)((g -o f))')) - ((g -o G) -o G)((g -o f)) - -Simplify - - >>> print(read_expr(r'f').simplify()) - f - >>> print(read_expr(r'(g -o f)').simplify()) - (g -o f) - >>> print(read_expr(r'((g -o G) -o G)').simplify()) - ((g -o G) -o G) - >>> print(read_expr(r'(g -o f)(g)').simplify()) - f - >>> try: read_expr(r'(g -o f)(f)').simplify() - ... except LinearLogicApplicationException as e: print(e) - ... - Cannot apply (g -o f) to f. Cannot unify g with f given {} - >>> print(read_expr(r'(G -o f)(g)').simplify()) - f - >>> print(read_expr(r'((g -o G) -o G)((g -o f))').simplify()) - f - -Test BindingDict - - >>> h = ConstantExpression('h') - >>> g = ConstantExpression('g') - >>> f = ConstantExpression('f') - - >>> H = VariableExpression('H') - >>> G = VariableExpression('G') - >>> F = VariableExpression('F') - - >>> d1 = BindingDict({H: h}) - >>> d2 = BindingDict({F: f, G: F}) - >>> d12 = d1 + d2 - >>> all12 = ['%s: %s' % (v, d12[v]) for v in d12.d] - >>> all12.sort() - >>> print(all12) - ['F: f', 'G: f', 'H: h'] - - >>> BindingDict([(F,f),(G,g),(H,h)]) == BindingDict({F:f, G:g, H:h}) - True - - >>> d4 = BindingDict({F: f}) - >>> try: d4[F] = g - ... except VariableBindingException as e: print(e) - Variable F already bound to another value - -Test Unify - - >>> try: f.unify(g, BindingDict()) - ... except UnificationException as e: print(e) - ... - Cannot unify f with g given {} - - >>> f.unify(G, BindingDict()) == BindingDict({G: f}) - True - >>> try: f.unify(G, BindingDict({G: h})) - ... except UnificationException as e: print(e) - ... - Cannot unify f with G given {G: h} - >>> f.unify(G, BindingDict({G: f})) == BindingDict({G: f}) - True - >>> f.unify(G, BindingDict({H: f})) == BindingDict({G: f, H: f}) - True - - >>> G.unify(f, BindingDict()) == BindingDict({G: f}) - True - >>> try: G.unify(f, BindingDict({G: h})) - ... except UnificationException as e: print(e) - ... - Cannot unify G with f given {G: h} - >>> G.unify(f, BindingDict({G: f})) == BindingDict({G: f}) - True - >>> G.unify(f, BindingDict({H: f})) == BindingDict({G: f, H: f}) - True - - >>> G.unify(F, BindingDict()) == BindingDict({G: F}) - True - >>> try: G.unify(F, BindingDict({G: H})) - ... except UnificationException as e: print(e) - ... 
- Cannot unify G with F given {G: H} - >>> G.unify(F, BindingDict({G: F})) == BindingDict({G: F}) - True - >>> G.unify(F, BindingDict({H: F})) == BindingDict({G: F, H: F}) - True - -Test Compile - - >>> print(read_expr('g').compile_pos(Counter(), GlueFormula)) - (, []) - >>> print(read_expr('(g -o f)').compile_pos(Counter(), GlueFormula)) - (, []) - >>> print(read_expr('(g -o (h -o f))').compile_pos(Counter(), GlueFormula)) - (, []) - - -====================== -Glue -====================== - -Demo of "John walks" --------------------- - - >>> john = GlueFormula("John", "g") - >>> print(john) - John : g - >>> walks = GlueFormula(r"\x.walks(x)", "(g -o f)") - >>> print(walks) - \x.walks(x) : (g -o f) - >>> print(walks.applyto(john)) - \x.walks(x)(John) : (g -o f)(g) - >>> print(walks.applyto(john).simplify()) - walks(John) : f - - -Demo of "A dog walks" ---------------------- - - >>> a = GlueFormula("\\P Q.some x.(P(x) and Q(x))", "((gv -o gr) -o ((g -o G) -o G))") - >>> print(a) - \P Q.exists x.(P(x) & Q(x)) : ((gv -o gr) -o ((g -o G) -o G)) - >>> man = GlueFormula(r"\x.man(x)", "(gv -o gr)") - >>> print(man) - \x.man(x) : (gv -o gr) - >>> walks = GlueFormula(r"\x.walks(x)", "(g -o f)") - >>> print(walks) - \x.walks(x) : (g -o f) - >>> a_man = a.applyto(man) - >>> print(a_man.simplify()) - \Q.exists x.(man(x) & Q(x)) : ((g -o G) -o G) - >>> a_man_walks = a_man.applyto(walks) - >>> print(a_man_walks.simplify()) - exists x.(man(x) & walks(x)) : f - - -Demo of 'every girl chases a dog' ---------------------------------- - -Individual words: - - >>> every = GlueFormula("\\P Q.all x.(P(x) -> Q(x))", "((gv -o gr) -o ((g -o G) -o G))") - >>> print(every) - \P Q.all x.(P(x) -> Q(x)) : ((gv -o gr) -o ((g -o G) -o G)) - >>> girl = GlueFormula(r"\x.girl(x)", "(gv -o gr)") - >>> print(girl) - \x.girl(x) : (gv -o gr) - >>> chases = GlueFormula(r"\x y.chases(x,y)", "(g -o (h -o f))") - >>> print(chases) - \x y.chases(x,y) : (g -o (h -o f)) - >>> a = GlueFormula("\\P Q.some x.(P(x) and Q(x))", "((hv -o hr) -o ((h -o H) -o H))") - >>> print(a) - \P Q.exists x.(P(x) & Q(x)) : ((hv -o hr) -o ((h -o H) -o H)) - >>> dog = GlueFormula(r"\x.dog(x)", "(hv -o hr)") - >>> print(dog) - \x.dog(x) : (hv -o hr) - -Noun Quantification can only be done one way: - - >>> every_girl = every.applyto(girl) - >>> print(every_girl.simplify()) - \Q.all x.(girl(x) -> Q(x)) : ((g -o G) -o G) - >>> a_dog = a.applyto(dog) - >>> print(a_dog.simplify()) - \Q.exists x.(dog(x) & Q(x)) : ((h -o H) -o H) - -The first reading is achieved by combining 'chases' with 'a dog' first. -Since 'a girl' requires something of the form '(h -o H)' we must -get rid of the 'g' in the glue of 'see'. We will do this with -the '-o elimination' rule. So, x1 will be our subject placeholder. - - >>> xPrime = GlueFormula("x1", "g") - >>> print(xPrime) - x1 : g - >>> xPrime_chases = chases.applyto(xPrime) - >>> print(xPrime_chases.simplify()) - \y.chases(x1,y) : (h -o f) - >>> xPrime_chases_a_dog = a_dog.applyto(xPrime_chases) - >>> print(xPrime_chases_a_dog.simplify()) - exists x.(dog(x) & chases(x1,x)) : f - -Now we can retract our subject placeholder using lambda-abstraction and -combine with the true subject. 
- - >>> chases_a_dog = xPrime_chases_a_dog.lambda_abstract(xPrime) - >>> print(chases_a_dog.simplify()) - \x1.exists x.(dog(x) & chases(x1,x)) : (g -o f) - >>> every_girl_chases_a_dog = every_girl.applyto(chases_a_dog) - >>> r1 = every_girl_chases_a_dog.simplify() - >>> r2 = GlueFormula(r'all x.(girl(x) -> exists z1.(dog(z1) & chases(x,z1)))', 'f') - >>> r1 == r2 - True - -The second reading is achieved by combining 'every girl' with 'chases' first. - - >>> xPrime = GlueFormula("x1", "g") - >>> print(xPrime) - x1 : g - >>> xPrime_chases = chases.applyto(xPrime) - >>> print(xPrime_chases.simplify()) - \y.chases(x1,y) : (h -o f) - >>> yPrime = GlueFormula("x2", "h") - >>> print(yPrime) - x2 : h - >>> xPrime_chases_yPrime = xPrime_chases.applyto(yPrime) - >>> print(xPrime_chases_yPrime.simplify()) - chases(x1,x2) : f - >>> chases_yPrime = xPrime_chases_yPrime.lambda_abstract(xPrime) - >>> print(chases_yPrime.simplify()) - \x1.chases(x1,x2) : (g -o f) - >>> every_girl_chases_yPrime = every_girl.applyto(chases_yPrime) - >>> print(every_girl_chases_yPrime.simplify()) - all x.(girl(x) -> chases(x,x2)) : f - >>> every_girl_chases = every_girl_chases_yPrime.lambda_abstract(yPrime) - >>> print(every_girl_chases.simplify()) - \x2.all x.(girl(x) -> chases(x,x2)) : (h -o f) - >>> every_girl_chases_a_dog = a_dog.applyto(every_girl_chases) - >>> r1 = every_girl_chases_a_dog.simplify() - >>> r2 = GlueFormula(r'exists x.(dog(x) & all z2.(girl(z2) -> chases(z2,x)))', 'f') - >>> r1 == r2 - True - - -Compilation ------------ - - >>> for cp in GlueFormula('m', '(b -o a)').compile(Counter()): print(cp) - m : (b -o a) : {1} - >>> for cp in GlueFormula('m', '((c -o b) -o a)').compile(Counter()): print(cp) - v1 : c : {1} - m : (b[1] -o a) : {2} - >>> for cp in GlueFormula('m', '((d -o (c -o b)) -o a)').compile(Counter()): print(cp) - v1 : c : {1} - v2 : d : {2} - m : (b[1, 2] -o a) : {3} - >>> for cp in GlueFormula('m', '((d -o e) -o ((c -o b) -o a))').compile(Counter()): print(cp) - v1 : d : {1} - v2 : c : {2} - m : (e[1] -o (b[2] -o a)) : {3} - >>> for cp in GlueFormula('m', '(((d -o c) -o b) -o a)').compile(Counter()): print(cp) - v1 : (d -o c) : {1} - m : (b[1] -o a) : {2} - >>> for cp in GlueFormula('m', '((((e -o d) -o c) -o b) -o a)').compile(Counter()): print(cp) - v1 : e : {1} - v2 : (d[1] -o c) : {2} - m : (b[2] -o a) : {3} - - -Demo of 'a man walks' using Compilation ---------------------------------------- - -Premises - - >>> a = GlueFormula('\\P Q.some x.(P(x) and Q(x))', '((gv -o gr) -o ((g -o G) -o G))') - >>> print(a) - \P Q.exists x.(P(x) & Q(x)) : ((gv -o gr) -o ((g -o G) -o G)) - - >>> man = GlueFormula('\\x.man(x)', '(gv -o gr)') - >>> print(man) - \x.man(x) : (gv -o gr) - - >>> walks = GlueFormula('\\x.walks(x)', '(g -o f)') - >>> print(walks) - \x.walks(x) : (g -o f) - -Compiled Premises: - - >>> counter = Counter() - >>> ahc = a.compile(counter) - >>> g1 = ahc[0] - >>> print(g1) - v1 : gv : {1} - >>> g2 = ahc[1] - >>> print(g2) - v2 : g : {2} - >>> g3 = ahc[2] - >>> print(g3) - \P Q.exists x.(P(x) & Q(x)) : (gr[1] -o (G[2] -o G)) : {3} - >>> g4 = man.compile(counter)[0] - >>> print(g4) - \x.man(x) : (gv -o gr) : {4} - >>> g5 = walks.compile(counter)[0] - >>> print(g5) - \x.walks(x) : (g -o f) : {5} - -Derivation: - - >>> g14 = g4.applyto(g1) - >>> print(g14.simplify()) - man(v1) : gr : {1, 4} - >>> g134 = g3.applyto(g14) - >>> print(g134.simplify()) - \Q.exists x.(man(x) & Q(x)) : (G[2] -o G) : {1, 3, 4} - >>> g25 = g5.applyto(g2) - >>> print(g25.simplify()) - walks(v2) : f : {2, 5} - >>> 
g12345 = g134.applyto(g25) - >>> print(g12345.simplify()) - exists x.(man(x) & walks(x)) : f : {1, 2, 3, 4, 5} - ---------------------------------- -Dependency Graph to Glue Formulas ---------------------------------- - >>> from nltk.corpus.reader.dependency import DependencyGraph - - >>> depgraph = DependencyGraph("""1 John _ NNP NNP _ 2 SUBJ _ _ - ... 2 sees _ VB VB _ 0 ROOT _ _ - ... 3 a _ ex_quant ex_quant _ 4 SPEC _ _ - ... 4 dog _ NN NN _ 2 OBJ _ _ - ... """) - >>> gfl = GlueDict('nltk:grammars/sample_grammars/glue.semtype').to_glueformula_list(depgraph) - >>> print(gfl) # doctest: +SKIP - [\x y.sees(x,y) : (f -o (i -o g)), - \x.dog(x) : (iv -o ir), - \P Q.exists x.(P(x) & Q(x)) : ((iv -o ir) -o ((i -o I3) -o I3)), - \P Q.exists x.(P(x) & Q(x)) : ((fv -o fr) -o ((f -o F4) -o F4)), - \x.John(x) : (fv -o fr)] - >>> glue = Glue() - >>> for r in sorted([r.simplify().normalize() for r in glue.get_readings(glue.gfl_to_compiled(gfl))], key=str): - ... print(r) - exists z1.(John(z1) & exists z2.(dog(z2) & sees(z1,z2))) - exists z1.(dog(z1) & exists z2.(John(z2) & sees(z2,z1))) - ------------------------------------ -Dependency Graph to LFG f-structure ------------------------------------ - >>> from nltk.sem.lfg import FStructure - - >>> fstruct = FStructure.read_depgraph(depgraph) - - >>> print(fstruct) # doctest: +SKIP - f:[pred 'sees' - obj h:[pred 'dog' - spec 'a'] - subj g:[pred 'John']] - - >>> fstruct.to_depgraph().tree().pprint() - (sees (dog a) John) - ---------------------------------- -LFG f-structure to Glue ---------------------------------- - >>> fstruct.to_glueformula_list(GlueDict('nltk:grammars/sample_grammars/glue.semtype')) # doctest: +SKIP - [\x y.sees(x,y) : (i -o (g -o f)), - \x.dog(x) : (gv -o gr), - \P Q.exists x.(P(x) & Q(x)) : ((gv -o gr) -o ((g -o G3) -o G3)), - \P Q.exists x.(P(x) & Q(x)) : ((iv -o ir) -o ((i -o I4) -o I4)), - \x.John(x) : (iv -o ir)] - -.. see gluesemantics_malt.doctest for more diff --git a/pipeline/nltk/test/gluesemantics_malt.doctest b/pipeline/nltk/test/gluesemantics_malt.doctest deleted file mode 100644 index 66bebd35c553e47457bba3b3e15c6f2233698d85..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/gluesemantics_malt.doctest +++ /dev/null @@ -1,69 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -.. see also: gluesemantics.doctest - -============================================================================== - Glue Semantics -============================================================================== - - >>> from nltk.test.gluesemantics_malt_fixt import setup_module - >>> setup_module() - - >>> from nltk.sem.glue import * - >>> nltk.sem.logic._counter._value = 0 - --------------------------------- -Initialize the Dependency Parser --------------------------------- - >>> from nltk.parse.malt import MaltParser - - >>> tagger = RegexpTagger( - ... [('^(John|Mary)$', 'NNP'), - ... ('^(sees|chases)$', 'VB'), - ... ('^(a)$', 'ex_quant'), - ... ('^(every)$', 'univ_quant'), - ... ('^(girl|dog)$', 'NN') - ... ]).tag - >>> depparser = MaltParser(tagger=tagger) - --------------------- -Automated Derivation --------------------- - >>> glue = Glue(depparser=depparser) - >>> readings = glue.parse_to_meaning('every girl chases a dog'.split()) - >>> for reading in sorted([r.simplify().normalize() for r in readings], key=str): - ... 
print(reading.normalize()) - all z1.(girl(z1) -> exists z2.(dog(z2) & chases(z1,z2))) - exists z1.(dog(z1) & all z2.(girl(z2) -> chases(z2,z1))) - - >>> drtglue = DrtGlue(depparser=depparser) - >>> readings = drtglue.parse_to_meaning('every girl chases a dog'.split()) - >>> for reading in sorted([r.simplify().normalize() for r in readings], key=str): - ... print(reading) - ([],[(([z1],[girl(z1)]) -> ([z2],[dog(z2), chases(z1,z2)]))]) - ([z1],[dog(z1), (([z2],[girl(z2)]) -> ([],[chases(z2,z1)]))]) - --------------- -With inference --------------- - -Checking for equality of two DRSs is very useful when generating readings of a sentence. -For example, the ``glue`` module generates two readings for the sentence -*John sees Mary*: - - >>> from nltk.sem.glue import DrtGlue - >>> readings = drtglue.parse_to_meaning('John sees Mary'.split()) - >>> for drs in sorted([r.simplify().normalize() for r in readings], key=str): - ... print(drs) - ([z1,z2],[John(z1), Mary(z2), sees(z1,z2)]) - ([z1,z2],[Mary(z1), John(z2), sees(z2,z1)]) - -However, it is easy to tell that these two readings are logically the -same, and therefore one of them is superfluous. We can use the theorem prover -to determine this equivalence, and then delete one of them. A particular -theorem prover may be specified, or the argument may be left off to use the -default. - - >>> readings[0].equiv(readings[1]) - True diff --git a/pipeline/nltk/test/gluesemantics_malt_fixt.py b/pipeline/nltk/test/gluesemantics_malt_fixt.py deleted file mode 100644 index ad278231a9c9798936f9c8236dc8c16ed4437a28..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/gluesemantics_malt_fixt.py +++ /dev/null @@ -1,9 +0,0 @@ -def setup_module(): - import pytest - - from nltk.parse.malt import MaltParser - - try: - depparser = MaltParser() - except (AssertionError, LookupError) as e: - pytest.skip("MaltParser is not available") diff --git a/pipeline/nltk/test/grammar.doctest b/pipeline/nltk/test/grammar.doctest deleted file mode 100644 index 5d8f96d5265a0a8d185edbdcff328b932d542343..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/grammar.doctest +++ /dev/null @@ -1,69 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -=============== -Grammar Parsing -=============== - -Grammars can be parsed from strings: - - >>> from nltk import CFG - >>> grammar = CFG.fromstring(""" - ... S -> NP VP - ... PP -> P NP - ... NP -> Det N | NP PP - ... VP -> V NP | VP PP - ... Det -> 'a' | 'the' - ... N -> 'dog' | 'cat' - ... V -> 'chased' | 'sat' - ... P -> 'on' | 'in' - ... """) - >>> grammar - - >>> grammar.start() - S - >>> grammar.productions() - [S -> NP VP, PP -> P NP, NP -> Det N, NP -> NP PP, VP -> V NP, VP -> VP PP, - Det -> 'a', Det -> 'the', N -> 'dog', N -> 'cat', V -> 'chased', V -> 'sat', - P -> 'on', P -> 'in'] - -Probabilistic CFGs: - - >>> from nltk import PCFG - >>> toy_pcfg1 = PCFG.fromstring(""" - ... S -> NP VP [1.0] - ... NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15] - ... Det -> 'the' [0.8] | 'my' [0.2] - ... N -> 'man' [0.5] | 'telescope' [0.5] - ... VP -> VP PP [0.1] | V NP [0.7] | V [0.2] - ... V -> 'ate' [0.35] | 'saw' [0.65] - ... PP -> P NP [1.0] - ... P -> 'with' [0.61] | 'under' [0.39] - ... 
""") - -Chomsky Normal Form grammar (Test for bug 474) - - >>> g = CFG.fromstring("VP^ -> VBP NP^") - >>> g.productions()[0].lhs() - VP^ - -Grammars can contain both empty strings and empty productions: - - >>> from nltk.grammar import CFG - >>> from nltk.parse.generate import generate - >>> grammar = CFG.fromstring(""" - ... S -> A B - ... A -> 'a' - ... # An empty string: - ... B -> 'b' | '' - ... """) - >>> list(generate(grammar)) - [['a', 'b'], ['a', '']] - >>> grammar = CFG.fromstring(""" - ... S -> A B - ... A -> 'a' - ... # An empty production: - ... B -> 'b' | - ... """) - >>> list(generate(grammar)) - [['a', 'b'], ['a']] diff --git a/pipeline/nltk/test/grammartestsuites.doctest b/pipeline/nltk/test/grammartestsuites.doctest deleted file mode 100644 index 2d008b70f6fedd55c537188f2b69f688df873201..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/grammartestsuites.doctest +++ /dev/null @@ -1,109 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -========================== - Test Suites for Grammars -========================== - -Sentences in the test suite are divided into two classes: - -- grammatical (*accept*) and -- ungrammatical (*reject*). - -If a sentence should parse according to the grammar, the value of -``trees`` will be a non-empty list. If a sentence should be rejected -according to the grammar, then the value of ``trees`` will be ``None``. - - >>> from nltk.parse import TestGrammar - >>> germantest1 = {} - >>> germantest1['doc'] = "Tests for person agreement" - >>> germantest1['accept'] = [ - ... 'ich komme', - ... 'ich sehe mich', - ... 'du kommst', - ... 'du siehst mich', - ... 'sie kommt', - ... 'sie sieht mich', - ... 'ihr kommt', - ... 'wir kommen', - ... 'sie kommen', - ... 'du magst mich', - ... 'er mag mich', - ... 'du folgst mir', - ... 'sie hilft mir', - ... ] - >>> germantest1['reject'] = [ - ... 'ich kommt', - ... 'ich kommst', - ... 'ich siehst mich', - ... 'du komme', - ... 'du sehe mich', - ... 'du kommt', - ... 'er komme', - ... 'er siehst mich', - ... 'wir komme', - ... 'wir kommst', - ... 'die Katzen kommst', - ... 'sie komme', - ... 'sie kommst', - ... 'du mag mich', - ... 'er magst mich', - ... 'du folgt mir', - ... 'sie hilfst mir', - ... ] - >>> germantest2 = {} - >>> germantest2['doc'] = "Tests for number agreement" - >>> germantest2['accept'] = [ - ... 'der Hund kommt', - ... 'die Hunde kommen', - ... 'ich komme', - ... 'wir kommen', - ... 'ich sehe die Katzen', - ... 'ich folge den Katzen', - ... 'ich sehe die Katzen', - ... 'ich folge den Katzen', - ... 'wir sehen die Katzen', - ... 'wir folgen den Katzen' - ... ] - >>> germantest2['reject'] = [ - ... 'ich kommen', - ... 'wir komme', - ... 'der Hunde kommt', - ... 'der Hunde kommen', - ... 'die Katzen kommt', - ... 'ich sehe der Hunde', - ... 'ich folge den Hund', - ... 'ich sehen der Hunde', - ... 'ich folgen den Hund', - ... 'wir sehe die Katzen', - ... 'wir folge den Katzen' - ... ] - >>> germantest3 = {} - >>> germantest3['doc'] = "Tests for case government and subcategorization" - >>> germantest3['accept'] = [ - ... 'der Hund sieht mich', - ... 'der Hund kommt', - ... 'ich sehe den Hund', - ... 'ich helfe dem Hund', - ... ] - >>> germantest3['reject'] = [ - ... 'ich sehe', - ... 'ich helfe', - ... 'ich komme den Hund', - ... 'ich sehe den Hund die Katzen', - ... 'du hilfst mich', - ... 'du siehst mir', - ... 'du siehst ich', - ... 'der Hunde kommt mich', - ... 'die Hunde sehe die Hunde', - ... 'der Hund sehe die Hunde', - ... 
'ich hilft den Hund', - ... 'ich hilft der Hund', - ... 'ich sehe dem Hund', - ... ] - >>> germantestsuites = [germantest1, germantest2, germantest3] - >>> tester = TestGrammar('grammars/book_grammars/german.fcfg', germantestsuites) - >>> tester.run() - Tests for person agreement: All tests passed! - Tests for number agreement: All tests passed! - Tests for case government and subcategorization: All tests passed! diff --git a/pipeline/nltk/test/index.doctest b/pipeline/nltk/test/index.doctest deleted file mode 100644 index b37310189cbd57495322fd2a5ac6bda89b3e2b3b..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/index.doctest +++ /dev/null @@ -1,100 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -.. _align howto: align.html -.. _ccg howto: ccg.html -.. _chat80 howto: chat80.html -.. _childes howto: childes.html -.. _chunk howto: chunk.html -.. _classify howto: classify.html -.. _collocations howto: collocations.html -.. _compat howto: compat.html -.. _corpus howto: corpus.html -.. _data howto: data.html -.. _dependency howto: dependency.html -.. _discourse howto: discourse.html -.. _drt howto: drt.html -.. _featgram howto: featgram.html -.. _featstruct howto: featstruct.html -.. _framenet howto: framenet.html -.. _generate howto: generate.html -.. _gluesemantics howto: gluesemantics.html -.. _gluesemantics_malt howto: gluesemantics_malt.html -.. _grammar howto: grammar.html -.. _grammartestsuites howto: grammartestsuites.html -.. _index howto: index.html -.. _inference howto: inference.html -.. _internals howto: internals.html -.. _japanese howto: japanese.html -.. _logic howto: logic.html -.. _metrics howto: metrics.html -.. _misc howto: misc.html -.. _nonmonotonic howto: nonmonotonic.html -.. _parse howto: parse.html -.. _portuguese_en howto: portuguese_en.html -.. _probability howto: probability.html -.. _propbank howto: propbank.html -.. _relextract howto: relextract.html -.. _resolution howto: resolution.html -.. _semantics howto: semantics.html -.. _simple howto: simple.html -.. _stem howto: stem.html -.. _tag howto: tag.html -.. _tokenize howto: tokenize.html -.. _toolbox howto: toolbox.html -.. _tree howto: tree.html -.. _treetransforms howto: treetransforms.html -.. _util howto: util.html -.. _wordnet howto: wordnet.html -.. 
_wordnet_lch howto: wordnet_lch.html - -=========== -NLTK HOWTOs -=========== - -* `align HOWTO`_ -* `ccg HOWTO`_ -* `chat80 HOWTO`_ -* `childes HOWTO`_ -* `chunk HOWTO`_ -* `classify HOWTO`_ -* `collocations HOWTO`_ -* `compat HOWTO`_ -* `corpus HOWTO`_ -* `data HOWTO`_ -* `dependency HOWTO`_ -* `discourse HOWTO`_ -* `drt HOWTO`_ -* `featgram HOWTO`_ -* `featstruct HOWTO`_ -* `framenet HOWTO`_ -* `generate HOWTO`_ -* `gluesemantics HOWTO`_ -* `gluesemantics_malt HOWTO`_ -* `grammar HOWTO`_ -* `grammartestsuites HOWTO`_ -* `index HOWTO`_ -* `inference HOWTO`_ -* `internals HOWTO`_ -* `japanese HOWTO`_ -* `logic HOWTO`_ -* `metrics HOWTO`_ -* `misc HOWTO`_ -* `nonmonotonic HOWTO`_ -* `parse HOWTO`_ -* `portuguese_en HOWTO`_ -* `probability HOWTO`_ -* `propbank HOWTO`_ -* `relextract HOWTO`_ -* `resolution HOWTO`_ -* `semantics HOWTO`_ -* `simple HOWTO`_ -* `stem HOWTO`_ -* `tag HOWTO`_ -* `tokenize HOWTO`_ -* `toolbox HOWTO`_ -* `tree HOWTO`_ -* `treetransforms HOWTO`_ -* `util HOWTO`_ -* `wordnet HOWTO`_ -* `wordnet_lch HOWTO`_ diff --git a/pipeline/nltk/test/inference.doctest b/pipeline/nltk/test/inference.doctest deleted file mode 100644 index 28dad36ef6f22cf31a7d31f6754903aaa6cdb3e0..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/inference.doctest +++ /dev/null @@ -1,536 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -==================================== -Logical Inference and Model Building -==================================== - - >>> from nltk.test.setup_fixt import check_binary - >>> check_binary('mace4') - - >>> from nltk import * - >>> from nltk.sem.drt import DrtParser - >>> from nltk.sem import logic - >>> logic._counter._value = 0 - ------------- -Introduction ------------- - -Within the area of automated reasoning, first order theorem proving -and model building (or model generation) have both received much -attention, and have given rise to highly sophisticated techniques. We -focus therefore on providing an NLTK interface to third party tools -for these tasks. In particular, the module ``nltk.inference`` can be -used to access both theorem provers and model builders. - ---------------------------------- -NLTK Interface to Theorem Provers ---------------------------------- - -The main class used to interface with a theorem prover is the ``Prover`` -class, found in ``nltk.api``. The ``prove()`` method takes three optional -arguments: a goal, a list of assumptions, and a ``verbose`` boolean to -indicate whether the proof should be printed to the console. The proof goal -and any assumptions need to be instances of the ``Expression`` class -specified by ``nltk.sem.logic``. There are currently three theorem provers -included with NLTK: ``Prover9``, ``TableauProver``, and -``ResolutionProver``. The first is an off-the-shelf prover, while the other -two are written in Python and included in the ``nltk.inference`` package. 
- - >>> from nltk.sem import Expression - >>> read_expr = Expression.fromstring - >>> p1 = read_expr('man(socrates)') - >>> p2 = read_expr('all x.(man(x) -> mortal(x))') - >>> c = read_expr('mortal(socrates)') - >>> Prover9().prove(c, [p1,p2]) - True - >>> TableauProver().prove(c, [p1,p2]) - True - >>> ResolutionProver().prove(c, [p1,p2], verbose=True) - [1] {-mortal(socrates)} A - [2] {man(socrates)} A - [3] {-man(z2), mortal(z2)} A - [4] {-man(socrates)} (1, 3) - [5] {mortal(socrates)} (2, 3) - [6] {} (1, 5) - - True - ---------------------- -The ``ProverCommand`` ---------------------- - -A ``ProverCommand`` is a stateful holder for a theorem -prover. The command stores a theorem prover instance (of type ``Prover``), -a goal, a list of assumptions, the result of the proof, and a string version -of the entire proof. Corresponding to the three included ``Prover`` -implementations, there are three ``ProverCommand`` implementations: -``Prover9Command``, ``TableauProverCommand``, and -``ResolutionProverCommand``. - -The ``ProverCommand``'s constructor takes its goal and assumptions. The -``prove()`` command executes the ``Prover`` and ``proof()`` -returns a String form of the proof -If the ``prove()`` method has not been called, -then the prover command will be unable to display a proof. - - >>> prover = ResolutionProverCommand(c, [p1,p2]) - >>> print(prover.proof()) - Traceback (most recent call last): - File "...", line 1212, in __run - compileflags, 1) in test.globs - File "", line 1, in - File "...", line ..., in proof - raise LookupError("You have to call prove() first to get a proof!") - LookupError: You have to call prove() first to get a proof! - >>> prover.prove() - True - >>> print(prover.proof()) - [1] {-mortal(socrates)} A - [2] {man(socrates)} A - [3] {-man(z4), mortal(z4)} A - [4] {-man(socrates)} (1, 3) - [5] {mortal(socrates)} (2, 3) - [6] {} (1, 5) - - -The prover command stores the result of proving so that if ``prove()`` is -called again, then the command can return the result without executing the -prover again. This allows the user to access the result of the proof without -wasting time re-computing what it already knows. - - >>> prover.prove() - True - >>> prover.prove() - True - -The assumptions and goal may be accessed using the ``assumptions()`` and -``goal()`` methods, respectively. - - >>> prover.assumptions() - [, mortal(x))>] - >>> prover.goal() - - -The assumptions list may be modified using the ``add_assumptions()`` and -``retract_assumptions()`` methods. Both methods take a list of ``Expression`` -objects. Since adding or removing assumptions may change the result of the -proof, the stored result is cleared when either of these methods are called. -That means that ``proof()`` will be unavailable until ``prove()`` is called and -a call to ``prove()`` will execute the theorem prover. - - >>> prover.retract_assumptions([read_expr('man(socrates)')]) - >>> print(prover.proof()) - Traceback (most recent call last): - File "...", line 1212, in __run - compileflags, 1) in test.globs - File "", line 1, in - File "...", line ..., in proof - raise LookupError("You have to call prove() first to get a proof!") - LookupError: You have to call prove() first to get a proof! 
- >>> prover.prove() - False - >>> print(prover.proof()) - [1] {-mortal(socrates)} A - [2] {-man(z6), mortal(z6)} A - [3] {-man(socrates)} (1, 2) - - >>> prover.add_assumptions([read_expr('man(socrates)')]) - >>> prover.prove() - True - -------- -Prover9 -------- - -Prover9 Installation -~~~~~~~~~~~~~~~~~~~~ - -You can download Prover9 from https://www.cs.unm.edu/~mccune/prover9/. - -Extract the source code into a suitable directory and follow the -instructions in the Prover9 ``README.make`` file to compile the executables. -Install these into an appropriate location; the -``prover9_search`` variable is currently configured to look in the -following locations: - - >>> p = Prover9() - >>> p.binary_locations() - ['/usr/local/bin/prover9', - '/usr/local/bin/prover9/bin', - '/usr/local/bin', - '/usr/bin', - '/usr/local/prover9', - '/usr/local/share/prover9'] - -Alternatively, the environment variable ``PROVER9HOME`` may be configured with -the binary's location. - -The path to the correct directory can be set manually in the following -manner: - - >>> config_prover9(path='/usr/local/bin') # doctest: +SKIP - [Found prover9: /usr/local/bin/prover9] - -If the executables cannot be found, ``Prover9`` will issue a warning message: - - >>> p.prove() # doctest: +SKIP - Traceback (most recent call last): - ... - LookupError: - =========================================================================== - NLTK was unable to find the prover9 executable! Use config_prover9() or - set the PROVER9HOME environment variable. - - >> config_prover9('/path/to/prover9') - - For more information, on prover9, see: - - =========================================================================== - - -Using Prover9 -~~~~~~~~~~~~~ - -The general case in theorem proving is to determine whether ``S |- g`` -holds, where ``S`` is a possibly empty set of assumptions, and ``g`` -is a proof goal. - -As mentioned earlier, NLTK input to ``Prover9`` must be -``Expression``\ s of ``nltk.sem.logic``. A ``Prover9`` instance is -initialized with a proof goal and, possibly, some assumptions. The -``prove()`` method attempts to find a proof of the goal, given the -list of assumptions (in this case, none). - - >>> goal = read_expr('(man(x) <-> --man(x))') - >>> prover = Prover9Command(goal) - >>> prover.prove() - True - -Given a ``ProverCommand`` instance ``prover``, the method -``prover.proof()`` will return a String of the extensive proof information -provided by Prover9, shown in abbreviated form here:: - - ============================== Prover9 =============================== - Prover9 (32) version ... - Process ... was started by ... on ... - ... - The command was ".../prover9 -f ...". - ============================== end of head =========================== - - ============================== INPUT ================================= - - % Reading from file /var/... - - - formulas(goals). - (all x (man(x) -> man(x))). - end_of_list. - - ... - ============================== end of search ========================= - - THEOREM PROVED - - Exiting with 1 proof. - - Process 6317 exit (max_proofs) Mon Jan 21 15:23:28 2008 - - -As mentioned earlier, we may want to list some assumptions for -the proof, as shown here. 
- - >>> g = read_expr('mortal(socrates)') - >>> a1 = read_expr('all x.(man(x) -> mortal(x))') - >>> prover = Prover9Command(g, assumptions=[a1]) - >>> prover.print_assumptions() - all x.(man(x) -> mortal(x)) - -However, the assumptions are not sufficient to derive the goal: - - >>> print(prover.prove()) - False - -So let's add another assumption: - - >>> a2 = read_expr('man(socrates)') - >>> prover.add_assumptions([a2]) - >>> prover.print_assumptions() - all x.(man(x) -> mortal(x)) - man(socrates) - >>> print(prover.prove()) - True - -We can also show the assumptions in ``Prover9`` format. - - >>> prover.print_assumptions(output_format='Prover9') - all x (man(x) -> mortal(x)) - man(socrates) - - >>> prover.print_assumptions(output_format='Spass') - Traceback (most recent call last): - . . . - NameError: Unrecognized value for 'output_format': Spass - -Assumptions can be retracted from the list of assumptions. - - >>> prover.retract_assumptions([a1]) - >>> prover.print_assumptions() - man(socrates) - >>> prover.retract_assumptions([a1]) - -Statements can be loaded from a file and parsed. We can then add these -statements as new assumptions. - - >>> g = read_expr('all x.(boxer(x) -> -boxerdog(x))') - >>> prover = Prover9Command(g) - >>> prover.prove() - False - >>> import nltk.data - >>> new = nltk.data.load('grammars/sample_grammars/background0.fol') - >>> for a in new: - ... print(a) - all x.(boxerdog(x) -> dog(x)) - all x.(boxer(x) -> person(x)) - all x.-(dog(x) & person(x)) - exists x.boxer(x) - exists x.boxerdog(x) - >>> prover.add_assumptions(new) - >>> print(prover.prove()) - True - >>> print(prover.proof()) - ============================== prooftrans ============================ - Prover9 (...) version ... - Process ... was started by ... on ... - ... - The command was ".../prover9". - ============================== end of head =========================== - - ============================== end of input ========================== - - ============================== PROOF ================================= - - % -------- Comments from original proof -------- - % Proof 1 at ... seconds. - % Length of proof is 13. - % Level of proof is 4. - % Maximum clause weight is 0. - % Given clauses 0. - - 1 (all x (boxerdog(x) -> dog(x))). [assumption]. - 2 (all x (boxer(x) -> person(x))). [assumption]. - 3 (all x -(dog(x) & person(x))). [assumption]. - 6 (all x (boxer(x) -> -boxerdog(x))). [goal]. - 8 -boxerdog(x) | dog(x). [clausify(1)]. - 9 boxerdog(c3). [deny(6)]. - 11 -boxer(x) | person(x). [clausify(2)]. - 12 boxer(c3). [deny(6)]. - 14 -dog(x) | -person(x). [clausify(3)]. - 15 dog(c3). [resolve(9,a,8,a)]. - 18 person(c3). [resolve(12,a,11,a)]. - 19 -person(c3). [resolve(15,a,14,a)]. - 20 $F. [resolve(19,a,18,a)]. - - ============================== end of proof ========================== - ----------------------- -The equiv() method ----------------------- - -One application of the theorem prover functionality is to check if -two Expressions have the same meaning. -The ``equiv()`` method calls a theorem prover to determine whether two -Expressions are logically equivalent. - - >>> a = read_expr(r'exists x.(man(x) & walks(x))') - >>> b = read_expr(r'exists x.(walks(x) & man(x))') - >>> print(a.equiv(b)) - True - -The same method can be used on Discourse Representation Structures (DRSs). -In this case, each DRS is converted to a first order logic form, and then -passed to the theorem prover. 
- - >>> dp = DrtParser() - >>> a = dp.parse(r'([x],[man(x), walks(x)])') - >>> b = dp.parse(r'([x],[walks(x), man(x)])') - >>> print(a.equiv(b)) - True - - --------------------------------- -NLTK Interface to Model Builders --------------------------------- - -The top-level to model builders is parallel to that for -theorem-provers. The ``ModelBuilder`` interface is located -in ``nltk.inference.api``. It is currently only implemented by -``Mace``, which interfaces with the Mace4 model builder. - -Typically we use a model builder to show that some set of formulas has -a model, and is therefore consistent. One way of doing this is by -treating our candidate set of sentences as assumptions, and leaving -the goal unspecified. -Thus, the following interaction shows how both ``{a, c1}`` and ``{a, c2}`` -are consistent sets, since Mace succeeds in a building a -model for each of them, while ``{c1, c2}`` is inconsistent. - - >>> a3 = read_expr('exists x.(man(x) and walks(x))') - >>> c1 = read_expr('mortal(socrates)') - >>> c2 = read_expr('-mortal(socrates)') - >>> mace = Mace() - >>> print(mace.build_model(None, [a3, c1])) - True - >>> print(mace.build_model(None, [a3, c2])) - True - -We can also use the model builder as an adjunct to theorem prover. -Let's suppose we are trying to prove ``S |- g``, i.e. that ``g`` -is logically entailed by assumptions ``S = {s1, s2, ..., sn}``. -We can this same input to Mace4, and the model builder will try to -find a counterexample, that is, to show that ``g`` does *not* follow -from ``S``. So, given this input, Mace4 will try to find a model for -the set ``S' = {s1, s2, ..., sn, (not g)}``. If ``g`` fails to follow -from ``S``, then Mace4 may well return with a counterexample faster -than Prover9 concludes that it cannot find the required proof. -Conversely, if ``g`` *is* provable from ``S``, Mace4 may take a long -time unsuccessfully trying to find a counter model, and will eventually give up. - -In the following example, we see that the model builder does succeed -in building a model of the assumptions together with the negation of -the goal. That is, it succeeds in finding a model -where there is a woman that every man loves; Adam is a man; Eve is a -woman; but Adam does not love Eve. - - >>> a4 = read_expr('exists y. (woman(y) & all x. (man(x) -> love(x,y)))') - >>> a5 = read_expr('man(adam)') - >>> a6 = read_expr('woman(eve)') - >>> g = read_expr('love(adam,eve)') - >>> print(mace.build_model(g, [a4, a5, a6])) - True - -The Model Builder will fail to find a model if the assumptions do entail -the goal. Mace will continue to look for models of ever-increasing sizes -until the end_size number is reached. By default, end_size is 500, -but it can be set manually for quicker response time. - - >>> a7 = read_expr('all x.(man(x) -> mortal(x))') - >>> a8 = read_expr('man(socrates)') - >>> g2 = read_expr('mortal(socrates)') - >>> print(Mace(end_size=50).build_model(g2, [a7, a8])) - False - -There is also a ``ModelBuilderCommand`` class that, like ``ProverCommand``, -stores a ``ModelBuilder``, a goal, assumptions, a result, and a model. The -only implementation in NLTK is ``MaceCommand``. - - ------ -Mace4 ------ - -Mace4 Installation -~~~~~~~~~~~~~~~~~~ - -Mace4 is packaged with Prover9, and can be downloaded from the same -source, namely https://www.cs.unm.edu/~mccune/prover9/. It is installed -in the same manner as Prover9. - -Using Mace4 -~~~~~~~~~~~ - -Check whether Mace4 can find a model. 
- - >>> a = read_expr('(see(mary,john) & -(mary = john))') - >>> mb = MaceCommand(assumptions=[a]) - >>> mb.build_model() - True - -Show the model in 'tabular' format. - - >>> print(mb.model(format='tabular')) - % number = 1 - % seconds = 0 - - % Interpretation of size 2 - - john : 0 - - mary : 1 - - see : - | 0 1 - ---+---- - 0 | 0 0 - 1 | 1 0 - - -Show the model in 'tabular' format. - - >>> print(mb.model(format='cooked')) - % number = 1 - % seconds = 0 - - % Interpretation of size 2 - - john = 0. - - mary = 1. - - - see(0,0). - - see(0,1). - see(1,0). - - see(1,1). - - -The property ``valuation`` accesses the stored ``Valuation``. - - >>> print(mb.valuation) - {'john': 'a', 'mary': 'b', 'see': {('b', 'a')}} - -We can return to our earlier example and inspect the model: - - >>> mb = MaceCommand(g, assumptions=[a4, a5, a6]) - >>> m = mb.build_model() - >>> print(mb.model(format='cooked')) - % number = 1 - % seconds = 0 - - % Interpretation of size 2 - - adam = 0. - - eve = 0. - - c1 = 1. - - man(0). - - man(1). - - woman(0). - woman(1). - - - love(0,0). - love(0,1). - - love(1,0). - - love(1,1). - - -Here, we can see that ``adam`` and ``eve`` have been assigned the same -individual, namely ``0`` as value; ``0`` is both a man and a woman; a second -individual ``1`` is also a woman; and ``0`` loves ``1``. Thus, this is -an interpretation in which there is a woman that every man loves but -Adam doesn't love Eve. - -Mace can also be used with propositional logic. - - >>> p = read_expr('P') - >>> q = read_expr('Q') - >>> mb = MaceCommand(q, [p, p>-q]) - >>> mb.build_model() - True - >>> mb.valuation['P'] - True - >>> mb.valuation['Q'] - False diff --git a/pipeline/nltk/test/internals.doctest b/pipeline/nltk/test/internals.doctest deleted file mode 100644 index 2d09a0b698ea626c1098d5fb374c478d4b73c0fd..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/internals.doctest +++ /dev/null @@ -1,161 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -========================================== - Unit tests for the nltk.utilities module -========================================== - -overridden() -~~~~~~~~~~~~ - >>> from nltk.internals import overridden - -The typical use case is in defining methods for an interface or -abstract base class, in such a way that subclasses don't have to -implement all of the methods: - - >>> class EaterI(object): - ... '''Subclass must define eat() or batch_eat().''' - ... def eat(self, food): - ... if overridden(self.batch_eat): - ... return self.batch_eat([food])[0] - ... else: - ... raise NotImplementedError() - ... def batch_eat(self, foods): - ... return [self.eat(food) for food in foods] - -As long as a subclass implements one method, it will be used to -perform the other method: - - >>> class GoodEater1(EaterI): - ... def eat(self, food): - ... return 'yum' - >>> GoodEater1().eat('steak') - 'yum' - >>> GoodEater1().batch_eat(['steak', 'peas']) - ['yum', 'yum'] - - >>> class GoodEater2(EaterI): - ... def batch_eat(self, foods): - ... return ['yum' for food in foods] - >>> GoodEater2().eat('steak') - 'yum' - >>> GoodEater2().batch_eat(['steak', 'peas']) - ['yum', 'yum'] - -But if a subclass doesn't implement either one, then they'll get an -error when they try to call them. (nb this is better than infinite -recursion): - - >>> class BadEater1(EaterI): - ... pass - >>> BadEater1().eat('steak') - Traceback (most recent call last): - . . . 
- NotImplementedError - >>> BadEater1().batch_eat(['steak', 'peas']) - Traceback (most recent call last): - . . . - NotImplementedError - -Trying to use the abstract base class itself will also result in an -error: - - >>> class EaterI(EaterI): - ... pass - >>> EaterI().eat('steak') - Traceback (most recent call last): - . . . - NotImplementedError - >>> EaterI().batch_eat(['steak', 'peas']) - Traceback (most recent call last): - . . . - NotImplementedError - -It's ok to use intermediate abstract classes: - - >>> class AbstractEater(EaterI): - ... pass - - >>> class GoodEater3(AbstractEater): - ... def eat(self, food): - ... return 'yum' - ... - >>> GoodEater3().eat('steak') - 'yum' - >>> GoodEater3().batch_eat(['steak', 'peas']) - ['yum', 'yum'] - - >>> class GoodEater4(AbstractEater): - ... def batch_eat(self, foods): - ... return ['yum' for food in foods] - >>> GoodEater4().eat('steak') - 'yum' - >>> GoodEater4().batch_eat(['steak', 'peas']) - ['yum', 'yum'] - - >>> class BadEater2(AbstractEater): - ... pass - >>> BadEater2().eat('steak') - Traceback (most recent call last): - . . . - NotImplementedError - >>> BadEater2().batch_eat(['steak', 'peas']) - Traceback (most recent call last): - . . . - NotImplementedError - -Here's some extra tests: - - >>> class A(object): - ... def f(x): pass - >>> class B(A): - ... def f(x): pass - >>> class C(A): pass - >>> class D(B): pass - - >>> overridden(A().f) - False - >>> overridden(B().f) - True - >>> overridden(C().f) - False - >>> overridden(D().f) - True - -It works for classic classes, too: - - >>> class A: - ... def f(x): pass - >>> class B(A): - ... def f(x): pass - >>> class C(A): pass - >>> class D(B): pass - >>> overridden(A().f) - False - >>> overridden(B().f) - True - >>> overridden(C().f) - False - >>> overridden(D().f) - True - - -read_str() -~~~~~~~~~~~~ - >>> from nltk.internals import read_str - -Test valid scenarios - - >>> read_str("'valid string'", 0) - ('valid string', 14) - -Now test invalid scenarios - - >>> read_str("should error", 0) - Traceback (most recent call last): - ... - nltk.internals.ReadError: Expected open quote at 0 - >>> read_str("'should error", 0) - Traceback (most recent call last): - ... - nltk.internals.ReadError: Expected close quote at 1 diff --git a/pipeline/nltk/test/japanese.doctest b/pipeline/nltk/test/japanese.doctest deleted file mode 100644 index 61bbc06a4a65311695f61e2b2891a03a58181d04..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/japanese.doctest +++ /dev/null @@ -1,48 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -============================ -Japanese Language Processing -============================ - - >>> from nltk import * - -------------- -Corpus Access -------------- - -KNB Corpus ----------- - - >>> from nltk.corpus import knbc - -Access the words: this should produce a list of strings: - - >>> type(knbc.words()[0]) is not bytes - True - -Access the sentences: this should produce a list of lists of strings: - - >>> type(knbc.sents()[0][0]) is not bytes - True - -Access the tagged words: this should produce a list of word, tag pairs: - - >>> type(knbc.tagged_words()[0]) - <... 'tuple'> - -Access the tagged sentences: this should produce a list of lists of word, tag pairs: - - >>> type(knbc.tagged_sents()[0][0]) - <... 
'tuple'> - - -JEITA Corpus ------------- - - >>> from nltk.corpus import jeita - -Access the tagged words: this should produce a list of word, tag pairs, where a tag is a string: - - >>> type(jeita.tagged_words()[0][1]) is not bytes - True diff --git a/pipeline/nltk/test/lm.doctest b/pipeline/nltk/test/lm.doctest deleted file mode 100644 index 9668582b3f90f8bcc5ee48f72b0a41d2da9660e5..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/lm.doctest +++ /dev/null @@ -1,135 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -.. -*- coding: utf-8 -*- - - -Regression Tests -================ - - -Issue 167 ---------- -https://github.com/nltk/nltk/issues/167 - - >>> from nltk.corpus import brown - >>> from nltk.lm.preprocessing import padded_everygram_pipeline - >>> ngram_order = 3 - >>> train_data, vocab_data = padded_everygram_pipeline( - ... ngram_order, - ... brown.sents(categories="news") - ... ) - - >>> from nltk.lm import WittenBellInterpolated - >>> lm = WittenBellInterpolated(ngram_order) - >>> lm.fit(train_data, vocab_data) - - - - -Sentence containing an unseen word should result in infinite entropy because -Witten-Bell is based ultimately on MLE, which cannot handle unseen ngrams. -Crucially, it shouldn't raise any exceptions for unseen words. - - >>> from nltk.util import ngrams - >>> sent = ngrams("This is a sentence with the word aaddvark".split(), 3) - >>> lm.entropy(sent) - inf - -If we remove all unseen ngrams from the sentence, we'll get a non-infinite value -for the entropy. - - >>> sent = ngrams("This is a sentence".split(), 3) - >>> round(lm.entropy(sent), 14) - 10.23701322869105 - - -Issue 367 ---------- -https://github.com/nltk/nltk/issues/367 - -Reproducing Dan Blanchard's example: -https://github.com/nltk/nltk/issues/367#issuecomment-14646110 - - >>> from nltk.lm import Lidstone, Vocabulary - >>> word_seq = list('aaaababaaccbacb') - >>> ngram_order = 2 - >>> from nltk.util import everygrams - >>> train_data = [everygrams(word_seq, max_len=ngram_order)] - >>> V = Vocabulary(['a', 'b', 'c', '']) - >>> lm = Lidstone(0.2, ngram_order, vocabulary=V) - >>> lm.fit(train_data) - -For doctest to work we have to sort the vocabulary keys. - - >>> V_keys = sorted(V) - >>> round(sum(lm.score(w, ("b",)) for w in V_keys), 6) - 1.0 - >>> round(sum(lm.score(w, ("a",)) for w in V_keys), 6) - 1.0 - - >>> [lm.score(w, ("b",)) for w in V_keys] - [0.05, 0.05, 0.8, 0.05, 0.05] - >>> [round(lm.score(w, ("a",)), 4) for w in V_keys] - [0.0222, 0.0222, 0.4667, 0.2444, 0.2444] - - -Here's reproducing @afourney's comment: -https://github.com/nltk/nltk/issues/367#issuecomment-15686289 - - >>> sent = ['foo', 'foo', 'foo', 'foo', 'bar', 'baz'] - >>> ngram_order = 3 - >>> from nltk.lm.preprocessing import padded_everygram_pipeline - >>> train_data, vocab_data = padded_everygram_pipeline(ngram_order, [sent]) - >>> from nltk.lm import Lidstone - >>> lm = Lidstone(0.2, ngram_order) - >>> lm.fit(train_data, vocab_data) - -The vocabulary includes the "UNK" symbol as well as two padding symbols. - - >>> len(lm.vocab) - 6 - >>> word = "foo" - >>> context = ("bar", "baz") - -The raw counts. - - >>> lm.context_counts(context)[word] - 0 - >>> lm.context_counts(context).N() - 1 - -Counts with Lidstone smoothing. 
- - >>> lm.context_counts(context)[word] + lm.gamma - 0.2 - >>> lm.context_counts(context).N() + len(lm.vocab) * lm.gamma - 2.2 - -Without any backoff, just using Lidstone smoothing, P("foo" | "bar", "baz") should be: -0.2 / 2.2 ~= 0.090909 - - >>> round(lm.score(word, context), 6) - 0.090909 - - -Issue 380 ---------- -https://github.com/nltk/nltk/issues/380 - -Reproducing setup akin to this comment: -https://github.com/nltk/nltk/issues/380#issue-12879030 - -For speed take only the first 100 sentences of reuters. Shouldn't affect the test. - - >>> from nltk.corpus import reuters - >>> sents = reuters.sents()[:100] - >>> ngram_order = 3 - >>> from nltk.lm.preprocessing import padded_everygram_pipeline - >>> train_data, vocab_data = padded_everygram_pipeline(ngram_order, sents) - - >>> from nltk.lm import Lidstone - >>> lm = Lidstone(0.2, ngram_order) - >>> lm.fit(train_data, vocab_data) - >>> lm.score("said", ("",)) < 1 - True diff --git a/pipeline/nltk/test/logic.doctest b/pipeline/nltk/test/logic.doctest deleted file mode 100644 index 1c08675f1ca21ae8826851f7299cb482bf812910..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/logic.doctest +++ /dev/null @@ -1,1096 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -======================= -Logic & Lambda Calculus -======================= - -The `nltk.logic` package allows expressions of First-Order Logic (FOL) to be -parsed into ``Expression`` objects. In addition to FOL, the parser -handles lambda-abstraction with variables of higher order. - --------- -Overview --------- - - >>> from nltk.sem.logic import * - -The default inventory of logical constants is the following: - - >>> boolean_ops() - negation - - conjunction & - disjunction | - implication -> - equivalence <-> - >>> equality_preds() - equality = - inequality != - >>> binding_ops() - existential exists - universal all - lambda \ - ----------------- -Regression Tests ----------------- - - -Untyped Logic -+++++++++++++ - -Process logical expressions conveniently: - - >>> read_expr = Expression.fromstring - -Test for equality under alpha-conversion -======================================== - - >>> e1 = read_expr('exists x.P(x)') - >>> print(e1) - exists x.P(x) - >>> e2 = e1.alpha_convert(Variable('z')) - >>> print(e2) - exists z.P(z) - >>> e1 == e2 - True - - - >>> l = read_expr(r'\X.\X.X(X)(1)').simplify() - >>> id = read_expr(r'\X.X(X)') - >>> l == id - True - -Test numerals -============= - - >>> zero = read_expr(r'\F x.x') - >>> one = read_expr(r'\F x.F(x)') - >>> two = read_expr(r'\F x.F(F(x))') - >>> three = read_expr(r'\F x.F(F(F(x)))') - >>> four = read_expr(r'\F x.F(F(F(F(x))))') - >>> succ = read_expr(r'\N F x.F(N(F,x))') - >>> plus = read_expr(r'\M N F x.M(F,N(F,x))') - >>> mult = read_expr(r'\M N F.M(N(F))') - >>> pred = read_expr(r'\N F x.(N(\G H.H(G(F)))(\u.x)(\u.u))') - >>> v1 = ApplicationExpression(succ, zero).simplify() - >>> v1 == one - True - >>> v2 = ApplicationExpression(succ, v1).simplify() - >>> v2 == two - True - >>> v3 = ApplicationExpression(ApplicationExpression(plus, v1), v2).simplify() - >>> v3 == three - True - >>> v4 = ApplicationExpression(ApplicationExpression(mult, v2), v2).simplify() - >>> v4 == four - True - >>> v5 = ApplicationExpression(pred, ApplicationExpression(pred, v4)).simplify() - >>> v5 == two - True - -Overloaded operators also exist, for convenience. 
- - >>> print(succ(zero).simplify() == one) - True - >>> print(plus(one,two).simplify() == three) - True - >>> print(mult(two,two).simplify() == four) - True - >>> print(pred(pred(four)).simplify() == two) - True - - >>> john = read_expr(r'john') - >>> man = read_expr(r'\x.man(x)') - >>> walk = read_expr(r'\x.walk(x)') - >>> man(john).simplify() - - >>> print(-walk(john).simplify()) - -walk(john) - >>> print((man(john) & walk(john)).simplify()) - (man(john) & walk(john)) - >>> print((man(john) | walk(john)).simplify()) - (man(john) | walk(john)) - >>> print((man(john) > walk(john)).simplify()) - (man(john) -> walk(john)) - >>> print((man(john) < walk(john)).simplify()) - (man(john) <-> walk(john)) - -Python's built-in lambda operator can also be used with Expressions - - >>> john = VariableExpression(Variable('john')) - >>> run_var = VariableExpression(Variable('run')) - >>> run = lambda x: run_var(x) - >>> run(john) - - - -``betaConversionTestSuite.pl`` ------------------------------- - -Tests based on Blackburn & Bos' book, *Representation and Inference -for Natural Language*. - - >>> x1 = read_expr(r'\P.P(mia)(\x.walk(x))').simplify() - >>> x2 = read_expr(r'walk(mia)').simplify() - >>> x1 == x2 - True - - >>> x1 = read_expr(r'exists x.(man(x) & ((\P.exists x.(woman(x) & P(x)))(\y.love(x,y))))').simplify() - >>> x2 = read_expr(r'exists x.(man(x) & exists y.(woman(y) & love(x,y)))').simplify() - >>> x1 == x2 - True - >>> x1 = read_expr(r'\a.sleep(a)(mia)').simplify() - >>> x2 = read_expr(r'sleep(mia)').simplify() - >>> x1 == x2 - True - >>> x1 = read_expr(r'\a.\b.like(b,a)(mia)').simplify() - >>> x2 = read_expr(r'\b.like(b,mia)').simplify() - >>> x1 == x2 - True - >>> x1 = read_expr(r'\a.(\b.like(b,a)(vincent))').simplify() - >>> x2 = read_expr(r'\a.like(vincent,a)').simplify() - >>> x1 == x2 - True - >>> x1 = read_expr(r'\a.((\b.like(b,a)(vincent)) & sleep(a))').simplify() - >>> x2 = read_expr(r'\a.(like(vincent,a) & sleep(a))').simplify() - >>> x1 == x2 - True - - >>> x1 = read_expr(r'(\a.\b.like(b,a)(mia)(vincent))').simplify() - >>> x2 = read_expr(r'like(vincent,mia)').simplify() - >>> x1 == x2 - True - - >>> x1 = read_expr(r'P((\a.sleep(a)(vincent)))').simplify() - >>> x2 = read_expr(r'P(sleep(vincent))').simplify() - >>> x1 == x2 - True - - >>> x1 = read_expr(r'\A.A((\b.sleep(b)(vincent)))').simplify() - >>> x2 = read_expr(r'\A.A(sleep(vincent))').simplify() - >>> x1 == x2 - True - - >>> x1 = read_expr(r'\A.A(sleep(vincent))').simplify() - >>> x2 = read_expr(r'\A.A(sleep(vincent))').simplify() - >>> x1 == x2 - True - - >>> x1 = read_expr(r'(\A.A(vincent)(\b.sleep(b)))').simplify() - >>> x2 = read_expr(r'sleep(vincent)').simplify() - >>> x1 == x2 - True - - >>> x1 = read_expr(r'\A.believe(mia,A(vincent))(\b.sleep(b))').simplify() - >>> x2 = read_expr(r'believe(mia,sleep(vincent))').simplify() - >>> x1 == x2 - True - - >>> x1 = read_expr(r'(\A.(A(vincent) & A(mia)))(\b.sleep(b))').simplify() - >>> x2 = read_expr(r'(sleep(vincent) & sleep(mia))').simplify() - >>> x1 == x2 - True - - >>> x1 = read_expr(r'\A.\B.(\C.C(A(vincent))(\d.probably(d)) & (\C.C(B(mia))(\d.improbably(d))))(\f.walk(f))(\f.talk(f))').simplify() - >>> x2 = read_expr(r'(probably(walk(vincent)) & improbably(talk(mia)))').simplify() - >>> x1 == x2 - True - - >>> x1 = read_expr(r'(\a.\b.(\C.C(a,b)(\d.\f.love(d,f))))(jules)(mia)').simplify() - >>> x2 = read_expr(r'love(jules,mia)').simplify() - >>> x1 == x2 - True - - >>> x1 = read_expr(r'(\A.\B.exists c.(A(c) & B(c)))(\d.boxer(d),\d.sleep(d))').simplify() - >>> x2 = 
read_expr(r'exists c.(boxer(c) & sleep(c))').simplify() - >>> x1 == x2 - True - - >>> x1 = read_expr(r'\A.Z(A)(\c.\a.like(a,c))').simplify() - >>> x2 = read_expr(r'Z(\c.\a.like(a,c))').simplify() - >>> x1 == x2 - True - - >>> x1 = read_expr(r'\A.\b.A(b)(\c.\b.like(b,c))').simplify() - >>> x2 = read_expr(r'\b.(\c.\b.like(b,c)(b))').simplify() - >>> x1 == x2 - True - - >>> x1 = read_expr(r'(\a.\b.(\C.C(a,b)(\b.\a.loves(b,a))))(jules)(mia)').simplify() - >>> x2 = read_expr(r'loves(jules,mia)').simplify() - >>> x1 == x2 - True - - >>> x1 = read_expr(r'(\A.\b.(exists b.A(b) & A(b)))(\c.boxer(c))(vincent)').simplify() - >>> x2 = read_expr(r'((exists b.boxer(b)) & boxer(vincent))').simplify() - >>> x1 == x2 - True - -Test Parser -=========== - - >>> print(read_expr(r'john')) - john - >>> print(read_expr(r'x')) - x - >>> print(read_expr(r'-man(x)')) - -man(x) - >>> print(read_expr(r'--man(x)')) - --man(x) - >>> print(read_expr(r'(man(x))')) - man(x) - >>> print(read_expr(r'((man(x)))')) - man(x) - >>> print(read_expr(r'man(x) <-> tall(x)')) - (man(x) <-> tall(x)) - >>> print(read_expr(r'(man(x) <-> tall(x))')) - (man(x) <-> tall(x)) - >>> print(read_expr(r'(man(x) & tall(x) & walks(x))')) - (man(x) & tall(x) & walks(x)) - >>> print(read_expr(r'(man(x) & tall(x) & walks(x))').first) - (man(x) & tall(x)) - >>> print(read_expr(r'man(x) | tall(x) & walks(x)')) - (man(x) | (tall(x) & walks(x))) - >>> print(read_expr(r'((man(x) & tall(x)) | walks(x))')) - ((man(x) & tall(x)) | walks(x)) - >>> print(read_expr(r'man(x) & (tall(x) | walks(x))')) - (man(x) & (tall(x) | walks(x))) - >>> print(read_expr(r'(man(x) & (tall(x) | walks(x)))')) - (man(x) & (tall(x) | walks(x))) - >>> print(read_expr(r'P(x) -> Q(x) <-> R(x) | S(x) & T(x)')) - ((P(x) -> Q(x)) <-> (R(x) | (S(x) & T(x)))) - >>> print(read_expr(r'exists x.man(x)')) - exists x.man(x) - >>> print(read_expr(r'exists x.(man(x) & tall(x))')) - exists x.(man(x) & tall(x)) - >>> print(read_expr(r'exists x.(man(x) & tall(x) & walks(x))')) - exists x.(man(x) & tall(x) & walks(x)) - >>> print(read_expr(r'-P(x) & Q(x)')) - (-P(x) & Q(x)) - >>> read_expr(r'-P(x) & Q(x)') == read_expr(r'(-P(x)) & Q(x)') - True - >>> print(read_expr(r'\x.man(x)')) - \x.man(x) - >>> print(read_expr(r'\x.man(x)(john)')) - \x.man(x)(john) - >>> print(read_expr(r'\x.man(x)(john) & tall(x)')) - (\x.man(x)(john) & tall(x)) - >>> print(read_expr(r'\x.\y.sees(x,y)')) - \x y.sees(x,y) - >>> print(read_expr(r'\x y.sees(x,y)')) - \x y.sees(x,y) - >>> print(read_expr(r'\x.\y.sees(x,y)(a)')) - (\x y.sees(x,y))(a) - >>> print(read_expr(r'\x y.sees(x,y)(a)')) - (\x y.sees(x,y))(a) - >>> print(read_expr(r'\x.\y.sees(x,y)(a)(b)')) - ((\x y.sees(x,y))(a))(b) - >>> print(read_expr(r'\x y.sees(x,y)(a)(b)')) - ((\x y.sees(x,y))(a))(b) - >>> print(read_expr(r'\x.\y.sees(x,y)(a,b)')) - ((\x y.sees(x,y))(a))(b) - >>> print(read_expr(r'\x y.sees(x,y)(a,b)')) - ((\x y.sees(x,y))(a))(b) - >>> print(read_expr(r'((\x.\y.sees(x,y))(a))(b)')) - ((\x y.sees(x,y))(a))(b) - >>> print(read_expr(r'P(x)(y)(z)')) - P(x,y,z) - >>> print(read_expr(r'P(Q)')) - P(Q) - >>> print(read_expr(r'P(Q(x))')) - P(Q(x)) - >>> print(read_expr(r'(\x.exists y.walks(x,y))(x)')) - (\x.exists y.walks(x,y))(x) - >>> print(read_expr(r'exists x.(x = john)')) - exists x.(x = john) - >>> print(read_expr(r'((\P.\Q.exists x.(P(x) & Q(x)))(\x.dog(x)))(\x.bark(x))')) - ((\P Q.exists x.(P(x) & Q(x)))(\x.dog(x)))(\x.bark(x)) - >>> a = read_expr(r'exists c.exists b.A(b,c) & A(b,c)') - >>> b = read_expr(r'(exists c.(exists b.A(b,c))) & A(b,c)') - >>> 
print(a == b) - True - >>> a = read_expr(r'exists c.(exists b.A(b,c) & A(b,c))') - >>> b = read_expr(r'exists c.((exists b.A(b,c)) & A(b,c))') - >>> print(a == b) - True - >>> print(read_expr(r'exists x.x = y')) - exists x.(x = y) - >>> print(read_expr('A(B)(C)')) - A(B,C) - >>> print(read_expr('(A(B))(C)')) - A(B,C) - >>> print(read_expr('A((B)(C))')) - A(B(C)) - >>> print(read_expr('A(B(C))')) - A(B(C)) - >>> print(read_expr('(A)(B(C))')) - A(B(C)) - >>> print(read_expr('(((A)))(((B))(((C))))')) - A(B(C)) - >>> print(read_expr(r'A != B')) - -(A = B) - >>> print(read_expr('P(x) & x=y & P(y)')) - (P(x) & (x = y) & P(y)) - >>> try: print(read_expr(r'\walk.walk(x)')) - ... except LogicalExpressionException as e: print(e) - 'walk' is an illegal variable name. Constants may not be abstracted. - \walk.walk(x) - ^ - >>> try: print(read_expr(r'all walk.walk(john)')) - ... except LogicalExpressionException as e: print(e) - 'walk' is an illegal variable name. Constants may not be quantified. - all walk.walk(john) - ^ - >>> try: print(read_expr(r'x(john)')) - ... except LogicalExpressionException as e: print(e) - 'x' is an illegal predicate name. Individual variables may not be used as predicates. - x(john) - ^ - - >>> from nltk.sem.logic import LogicParser # hack to give access to custom quote chars - >>> lpq = LogicParser() - >>> lpq.quote_chars = [("'", "'", "\\", False)] - >>> print(lpq.parse(r"(man(x) & 'tall\'s,' (x) & walks (x) )")) - (man(x) & tall's,(x) & walks(x)) - >>> lpq.quote_chars = [("'", "'", "\\", True)] - >>> print(lpq.parse(r"'tall\'s,'")) - 'tall\'s,' - >>> print(lpq.parse(r"'spaced name(x)'")) - 'spaced name(x)' - >>> print(lpq.parse(r"-'tall\'s,'(x)")) - -'tall\'s,'(x) - >>> print(lpq.parse(r"(man(x) & 'tall\'s,' (x) & walks (x) )")) - (man(x) & 'tall\'s,'(x) & walks(x)) - - -Simplify -======== - - >>> print(read_expr(r'\x.man(x)(john)').simplify()) - man(john) - >>> print(read_expr(r'\x.((man(x)))(john)').simplify()) - man(john) - >>> print(read_expr(r'\x.\y.sees(x,y)(john, mary)').simplify()) - sees(john,mary) - >>> print(read_expr(r'\x y.sees(x,y)(john, mary)').simplify()) - sees(john,mary) - >>> print(read_expr(r'\x.\y.sees(x,y)(john)(mary)').simplify()) - sees(john,mary) - >>> print(read_expr(r'\x y.sees(x,y)(john)(mary)').simplify()) - sees(john,mary) - >>> print(read_expr(r'\x.\y.sees(x,y)(john)').simplify()) - \y.sees(john,y) - >>> print(read_expr(r'\x y.sees(x,y)(john)').simplify()) - \y.sees(john,y) - >>> print(read_expr(r'(\x.\y.sees(x,y)(john))(mary)').simplify()) - sees(john,mary) - >>> print(read_expr(r'(\x y.sees(x,y)(john))(mary)').simplify()) - sees(john,mary) - >>> print(read_expr(r'exists x.(man(x) & (\x.exists y.walks(x,y))(x))').simplify()) - exists x.(man(x) & exists y.walks(x,y)) - >>> e1 = read_expr(r'exists x.(man(x) & (\x.exists y.walks(x,y))(y))').simplify() - >>> e2 = read_expr(r'exists x.(man(x) & exists z1.walks(y,z1))') - >>> e1 == e2 - True - >>> print(read_expr(r'(\P Q.exists x.(P(x) & Q(x)))(\x.dog(x))').simplify()) - \Q.exists x.(dog(x) & Q(x)) - >>> print(read_expr(r'((\P.\Q.exists x.(P(x) & Q(x)))(\x.dog(x)))(\x.bark(x))').simplify()) - exists x.(dog(x) & bark(x)) - >>> print(read_expr(r'\P.(P(x)(y))(\a b.Q(a,b))').simplify()) - Q(x,y) - -Replace -======= - - >>> a = read_expr(r'a') - >>> x = read_expr(r'x') - >>> y = read_expr(r'y') - >>> z = read_expr(r'z') - - >>> print(read_expr(r'man(x)').replace(x.variable, a, False)) - man(a) - >>> print(read_expr(r'(man(x) & tall(x))').replace(x.variable, a, False)) - (man(a) & tall(a)) - >>> 
print(read_expr(r'exists x.man(x)').replace(x.variable, a, False)) - exists x.man(x) - >>> print(read_expr(r'exists x.man(x)').replace(x.variable, a, True)) - exists a.man(a) - >>> print(read_expr(r'exists x.give(x,y,z)').replace(y.variable, a, False)) - exists x.give(x,a,z) - >>> print(read_expr(r'exists x.give(x,y,z)').replace(y.variable, a, True)) - exists x.give(x,a,z) - >>> e1 = read_expr(r'exists x.give(x,y,z)').replace(y.variable, x, False) - >>> e2 = read_expr(r'exists z1.give(z1,x,z)') - >>> e1 == e2 - True - >>> e1 = read_expr(r'exists x.give(x,y,z)').replace(y.variable, x, True) - >>> e2 = read_expr(r'exists z1.give(z1,x,z)') - >>> e1 == e2 - True - >>> print(read_expr(r'\x y z.give(x,y,z)').replace(y.variable, a, False)) - \x y z.give(x,y,z) - >>> print(read_expr(r'\x y z.give(x,y,z)').replace(y.variable, a, True)) - \x a z.give(x,a,z) - >>> print(read_expr(r'\x.\y.give(x,y,z)').replace(z.variable, a, False)) - \x y.give(x,y,a) - >>> print(read_expr(r'\x.\y.give(x,y,z)').replace(z.variable, a, True)) - \x y.give(x,y,a) - >>> e1 = read_expr(r'\x.\y.give(x,y,z)').replace(z.variable, x, False) - >>> e2 = read_expr(r'\z1.\y.give(z1,y,x)') - >>> e1 == e2 - True - >>> e1 = read_expr(r'\x.\y.give(x,y,z)').replace(z.variable, x, True) - >>> e2 = read_expr(r'\z1.\y.give(z1,y,x)') - >>> e1 == e2 - True - >>> print(read_expr(r'\x.give(x,y,z)').replace(z.variable, y, False)) - \x.give(x,y,y) - >>> print(read_expr(r'\x.give(x,y,z)').replace(z.variable, y, True)) - \x.give(x,y,y) - - >>> from nltk.sem import logic - >>> logic._counter._value = 0 - >>> e1 = read_expr('e1') - >>> e2 = read_expr('e2') - >>> print(read_expr('exists e1 e2.(walk(e1) & talk(e2))').replace(e1.variable, e2, True)) - exists e2 e01.(walk(e2) & talk(e01)) - - -Variables / Free -================ - - >>> examples = [r'walk(john)', - ... r'walk(x)', - ... r'?vp(?np)', - ... r'see(john,mary)', - ... r'exists x.walk(x)', - ... r'\x.see(john,x)', - ... r'\x.see(john,x)(mary)', - ... r'P(x)', - ... r'\P.P(x)', - ... r'aa(x,bb(y),cc(z),P(w),u)', - ... r'bo(?det(?n),@x)'] - >>> examples = [read_expr(e) for e in examples] - - >>> for e in examples: - ... print('%-25s' % e, sorted(e.free())) - walk(john) [] - walk(x) [Variable('x')] - ?vp(?np) [] - see(john,mary) [] - exists x.walk(x) [] - \x.see(john,x) [] - (\x.see(john,x))(mary) [] - P(x) [Variable('P'), Variable('x')] - \P.P(x) [Variable('x')] - aa(x,bb(y),cc(z),P(w),u) [Variable('P'), Variable('u'), Variable('w'), Variable('x'), Variable('y'), Variable('z')] - bo(?det(?n),@x) [] - - >>> for e in examples: - ... print('%-25s' % e, sorted(e.constants())) - walk(john) [Variable('john')] - walk(x) [] - ?vp(?np) [Variable('?np')] - see(john,mary) [Variable('john'), Variable('mary')] - exists x.walk(x) [] - \x.see(john,x) [Variable('john')] - (\x.see(john,x))(mary) [Variable('john'), Variable('mary')] - P(x) [] - \P.P(x) [] - aa(x,bb(y),cc(z),P(w),u) [] - bo(?det(?n),@x) [Variable('?n'), Variable('@x')] - - >>> for e in examples: - ... print('%-25s' % e, sorted(e.predicates())) - walk(john) [Variable('walk')] - walk(x) [Variable('walk')] - ?vp(?np) [Variable('?vp')] - see(john,mary) [Variable('see')] - exists x.walk(x) [Variable('walk')] - \x.see(john,x) [Variable('see')] - (\x.see(john,x))(mary) [Variable('see')] - P(x) [] - \P.P(x) [] - aa(x,bb(y),cc(z),P(w),u) [Variable('aa'), Variable('bb'), Variable('cc')] - bo(?det(?n),@x) [Variable('?det'), Variable('bo')] - - >>> for e in examples: - ... 
print('%-25s' % e, sorted(e.variables())) - walk(john) [] - walk(x) [Variable('x')] - ?vp(?np) [Variable('?np'), Variable('?vp')] - see(john,mary) [] - exists x.walk(x) [] - \x.see(john,x) [] - (\x.see(john,x))(mary) [] - P(x) [Variable('P'), Variable('x')] - \P.P(x) [Variable('x')] - aa(x,bb(y),cc(z),P(w),u) [Variable('P'), Variable('u'), Variable('w'), Variable('x'), Variable('y'), Variable('z')] - bo(?det(?n),@x) [Variable('?det'), Variable('?n'), Variable('@x')] - - - -`normalize` - >>> print(read_expr(r'\e083.(walk(e083, z472) & talk(e092, z938))').normalize()) - \e01.(walk(e01,z3) & talk(e02,z4)) - -Typed Logic -+++++++++++ - - >>> from nltk.sem.logic import LogicParser - >>> tlp = LogicParser(True) - >>> print(tlp.parse(r'man(x)').type) - ? - >>> print(tlp.parse(r'walk(angus)').type) - ? - >>> print(tlp.parse(r'-man(x)').type) - t - >>> print(tlp.parse(r'(man(x) <-> tall(x))').type) - t - >>> print(tlp.parse(r'exists x.(man(x) & tall(x))').type) - t - >>> print(tlp.parse(r'\x.man(x)').type) - - >>> print(tlp.parse(r'john').type) - e - >>> print(tlp.parse(r'\x y.sees(x,y)').type) - > - >>> print(tlp.parse(r'\x.man(x)(john)').type) - ? - >>> print(tlp.parse(r'\x.\y.sees(x,y)(john)').type) - - >>> print(tlp.parse(r'\x.\y.sees(x,y)(john)(mary)').type) - ? - >>> print(tlp.parse(r'\P.\Q.exists x.(P(x) & Q(x))').type) - <,<,t>> - >>> print(tlp.parse(r'\x.y').type) - - >>> print(tlp.parse(r'\P.P(x)').type) - <,?> - - >>> parsed = tlp.parse('see(john,mary)') - >>> print(parsed.type) - ? - >>> print(parsed.function) - see(john) - >>> print(parsed.function.type) - - >>> print(parsed.function.function) - see - >>> print(parsed.function.function.type) - > - - >>> parsed = tlp.parse('P(x,y)') - >>> print(parsed) - P(x,y) - >>> print(parsed.type) - ? - >>> print(parsed.function) - P(x) - >>> print(parsed.function.type) - - >>> print(parsed.function.function) - P - >>> print(parsed.function.function.type) - > - - >>> print(tlp.parse(r'P').type) - ? - - >>> print(tlp.parse(r'P', {'P': 't'}).type) - t - - >>> a = tlp.parse(r'P(x)') - >>> print(a.type) - ? 
- >>> print(a.function.type) - - >>> print(a.argument.type) - e - - >>> a = tlp.parse(r'-P(x)') - >>> print(a.type) - t - >>> print(a.term.type) - t - >>> print(a.term.function.type) - - >>> print(a.term.argument.type) - e - - >>> a = tlp.parse(r'P & Q') - >>> print(a.type) - t - >>> print(a.first.type) - t - >>> print(a.second.type) - t - - >>> a = tlp.parse(r'(P(x) & Q(x))') - >>> print(a.type) - t - >>> print(a.first.type) - t - >>> print(a.first.function.type) - - >>> print(a.first.argument.type) - e - >>> print(a.second.type) - t - >>> print(a.second.function.type) - - >>> print(a.second.argument.type) - e - - >>> a = tlp.parse(r'\x.P(x)') - >>> print(a.type) - - >>> print(a.term.function.type) - - >>> print(a.term.argument.type) - e - - >>> a = tlp.parse(r'\P.P(x)') - >>> print(a.type) - <,?> - >>> print(a.term.function.type) - - >>> print(a.term.argument.type) - e - - >>> a = tlp.parse(r'(\x.P(x)(john)) & Q(x)') - >>> print(a.type) - t - >>> print(a.first.type) - t - >>> print(a.first.function.type) - - >>> print(a.first.function.term.function.type) - - >>> print(a.first.function.term.argument.type) - e - >>> print(a.first.argument.type) - e - - >>> a = tlp.parse(r'\x y.P(x,y)(john)(mary) & Q(x)') - >>> print(a.type) - t - >>> print(a.first.type) - t - >>> print(a.first.function.type) - - >>> print(a.first.function.function.type) - > - - >>> a = tlp.parse(r'--P') - >>> print(a.type) - t - >>> print(a.term.type) - t - >>> print(a.term.term.type) - t - - >>> tlp.parse(r'\x y.P(x,y)').type - > - >>> tlp.parse(r'\x y.P(x,y)', {'P': '>'}).type - > - - >>> a = tlp.parse(r'\P y.P(john,y)(\x y.see(x,y))') - >>> a.type - - >>> a.function.type - <>,> - >>> a.function.term.term.function.function.type - > - >>> a.argument.type - > - - >>> a = tlp.parse(r'exists c f.(father(c) = f)') - >>> a.type - t - >>> a.term.term.type - t - >>> a.term.term.first.type - e - >>> a.term.term.first.function.type - - >>> a.term.term.second.type - e - -typecheck() - - >>> a = tlp.parse('P(x)') - >>> b = tlp.parse('Q(x)') - >>> a.type - ? - >>> c = a & b - >>> c.first.type - ? - >>> c.typecheck() - {...} - >>> c.first.type - t - - >>> a = tlp.parse('P(x)') - >>> b = tlp.parse('P(x) & Q(x)') - >>> a.type - ? - >>> typecheck([a,b]) - {...} - >>> a.type - t - - >>> e = tlp.parse(r'man(x)') - >>> print(dict((k,str(v)) for k,v in e.typecheck().items()) == {'x': 'e', 'man': ''}) - True - >>> sig = {'man': ''} - >>> e = tlp.parse(r'man(x)', sig) - >>> print(e.function.type) - - >>> print(dict((k,str(v)) for k,v in e.typecheck().items()) == {'x': 'e', 'man': ''}) - True - >>> print(e.function.type) - - >>> print(dict((k,str(v)) for k,v in e.typecheck(sig).items()) == {'x': 'e', 'man': ''}) - True - -findtype() - - >>> print(tlp.parse(r'man(x)').findtype(Variable('man'))) - - >>> print(tlp.parse(r'see(x,y)').findtype(Variable('see'))) - > - >>> print(tlp.parse(r'P(Q(R(x)))').findtype(Variable('Q'))) - ? 
- -reading types from strings - - >>> Type.fromstring('e') - e - >>> Type.fromstring('') - - >>> Type.fromstring('<,>') - <,> - >>> Type.fromstring('<,?>') - <,?> - -alternative type format - - >>> Type.fromstring('e').str() - 'IND' - >>> Type.fromstring('').str() - '(IND -> ANY)' - >>> Type.fromstring('<,t>').str() - '((IND -> BOOL) -> BOOL)' - -Type.__eq__() - - >>> from nltk.sem.logic import * - - >>> e = ENTITY_TYPE - >>> t = TRUTH_TYPE - >>> a = ANY_TYPE - >>> et = ComplexType(e,t) - >>> eet = ComplexType(e,ComplexType(e,t)) - >>> at = ComplexType(a,t) - >>> ea = ComplexType(e,a) - >>> aa = ComplexType(a,a) - - >>> e == e - True - >>> t == t - True - >>> e == t - False - >>> a == t - False - >>> t == a - False - >>> a == a - True - >>> et == et - True - >>> a == et - False - >>> et == a - False - >>> a == ComplexType(a,aa) - True - >>> ComplexType(a,aa) == a - True - -matches() - - >>> e.matches(t) - False - >>> a.matches(t) - True - >>> t.matches(a) - True - >>> a.matches(et) - True - >>> et.matches(a) - True - >>> ea.matches(eet) - True - >>> eet.matches(ea) - True - >>> aa.matches(et) - True - >>> aa.matches(t) - True - -Type error during parsing -========================= - - >>> try: print(tlp.parse(r'exists x y.(P(x) & P(x,y))')) - ... except InconsistentTypeHierarchyException as e: print(e) - The variable 'P' was found in multiple places with different types. - >>> try: tlp.parse(r'\x y.see(x,y)(\x.man(x))') - ... except TypeException as e: print(e) - The function '\x y.see(x,y)' is of type '>' and cannot be applied to '\x.man(x)' of type ''. Its argument must match type 'e'. - >>> try: tlp.parse(r'\P x y.-P(x,y)(\x.-man(x))') - ... except TypeException as e: print(e) - The function '\P x y.-P(x,y)' is of type '<>,>>' and cannot be applied to '\x.-man(x)' of type ''. Its argument must match type '>'. - - >>> a = tlp.parse(r'-talk(x)') - >>> signature = a.typecheck() - >>> try: print(tlp.parse(r'-talk(x,y)', signature)) - ... except InconsistentTypeHierarchyException as e: print(e) - The variable 'talk' was found in multiple places with different types. - - >>> a = tlp.parse(r'-P(x)') - >>> b = tlp.parse(r'-P(x,y)') - >>> a.typecheck() - {...} - >>> b.typecheck() - {...} - >>> try: typecheck([a,b]) - ... except InconsistentTypeHierarchyException as e: print(e) - The variable 'P' was found in multiple places with different types. - - >>> a = tlp.parse(r'P(x)') - >>> b = tlp.parse(r'P(x,y)') - >>> signature = {'P': ''} - >>> a.typecheck(signature) - {...} - >>> try: typecheck([a,b], signature) - ... except InconsistentTypeHierarchyException as e: print(e) - The variable 'P' was found in multiple places with different types. - -Parse errors -============ - - >>> try: read_expr(r'') - ... except LogicalExpressionException as e: print(e) - End of input found. Expression expected. - - ^ - >>> try: read_expr(r'(') - ... except LogicalExpressionException as e: print(e) - End of input found. Expression expected. - ( - ^ - >>> try: read_expr(r')') - ... except LogicalExpressionException as e: print(e) - Unexpected token: ')'. Expression expected. - ) - ^ - >>> try: read_expr(r'()') - ... except LogicalExpressionException as e: print(e) - Unexpected token: ')'. Expression expected. - () - ^ - >>> try: read_expr(r'(P(x) & Q(x)') - ... except LogicalExpressionException as e: print(e) - End of input found. Expected token ')'. - (P(x) & Q(x) - ^ - >>> try: read_expr(r'(P(x) &') - ... except LogicalExpressionException as e: print(e) - End of input found. Expression expected. 
- (P(x) & - ^ - >>> try: read_expr(r'(P(x) | )') - ... except LogicalExpressionException as e: print(e) - Unexpected token: ')'. Expression expected. - (P(x) | ) - ^ - >>> try: read_expr(r'P(x) ->') - ... except LogicalExpressionException as e: print(e) - End of input found. Expression expected. - P(x) -> - ^ - >>> try: read_expr(r'P(x') - ... except LogicalExpressionException as e: print(e) - End of input found. Expected token ')'. - P(x - ^ - >>> try: read_expr(r'P(x,') - ... except LogicalExpressionException as e: print(e) - End of input found. Expression expected. - P(x, - ^ - >>> try: read_expr(r'P(x,)') - ... except LogicalExpressionException as e: print(e) - Unexpected token: ')'. Expression expected. - P(x,) - ^ - >>> try: read_expr(r'exists') - ... except LogicalExpressionException as e: print(e) - End of input found. Variable and Expression expected following quantifier 'exists'. - exists - ^ - >>> try: read_expr(r'exists x') - ... except LogicalExpressionException as e: print(e) - End of input found. Expression expected. - exists x - ^ - >>> try: read_expr(r'exists x.') - ... except LogicalExpressionException as e: print(e) - End of input found. Expression expected. - exists x. - ^ - >>> try: read_expr(r'\ ') - ... except LogicalExpressionException as e: print(e) - End of input found. Variable and Expression expected following lambda operator. - \ - ^ - >>> try: read_expr(r'\ x') - ... except LogicalExpressionException as e: print(e) - End of input found. Expression expected. - \ x - ^ - >>> try: read_expr(r'\ x y') - ... except LogicalExpressionException as e: print(e) - End of input found. Expression expected. - \ x y - ^ - >>> try: read_expr(r'\ x.') - ... except LogicalExpressionException as e: print(e) - End of input found. Expression expected. - \ x. - ^ - >>> try: read_expr(r'P(x)Q(x)') - ... except LogicalExpressionException as e: print(e) - Unexpected token: 'Q'. - P(x)Q(x) - ^ - >>> try: read_expr(r'(P(x)Q(x)') - ... except LogicalExpressionException as e: print(e) - Unexpected token: 'Q'. Expected token ')'. - (P(x)Q(x) - ^ - >>> try: read_expr(r'exists x y') - ... except LogicalExpressionException as e: print(e) - End of input found. Expression expected. - exists x y - ^ - >>> try: read_expr(r'exists x y.') - ... except LogicalExpressionException as e: print(e) - End of input found. Expression expected. - exists x y. - ^ - >>> try: read_expr(r'exists x -> y') - ... except LogicalExpressionException as e: print(e) - Unexpected token: '->'. Expression expected. - exists x -> y - ^ - - - >>> try: read_expr(r'A -> ((P(x) & Q(x)) -> Z') - ... except LogicalExpressionException as e: print(e) - End of input found. Expected token ')'. - A -> ((P(x) & Q(x)) -> Z - ^ - >>> try: read_expr(r'A -> ((P(x) &) -> Z') - ... except LogicalExpressionException as e: print(e) - Unexpected token: ')'. Expression expected. - A -> ((P(x) &) -> Z - ^ - >>> try: read_expr(r'A -> ((P(x) | )) -> Z') - ... except LogicalExpressionException as e: print(e) - Unexpected token: ')'. Expression expected. - A -> ((P(x) | )) -> Z - ^ - >>> try: read_expr(r'A -> (P(x) ->) -> Z') - ... except LogicalExpressionException as e: print(e) - Unexpected token: ')'. Expression expected. - A -> (P(x) ->) -> Z - ^ - >>> try: read_expr(r'A -> (P(x) -> Z') - ... except LogicalExpressionException as e: print(e) - End of input found. Expected token ')'. - A -> (P(x) -> Z - ^ - >>> try: read_expr(r'A -> (P(x,) -> Z') - ... except LogicalExpressionException as e: print(e) - Unexpected token: ')'. Expression expected. 
- A -> (P(x,) -> Z - ^ - >>> try: read_expr(r'A -> (P(x,)) -> Z') - ... except LogicalExpressionException as e: print(e) - Unexpected token: ')'. Expression expected. - A -> (P(x,)) -> Z - ^ - >>> try: read_expr(r'A -> (exists) -> Z') - ... except LogicalExpressionException as e: print(e) - ')' is an illegal variable name. Constants may not be quantified. - A -> (exists) -> Z - ^ - >>> try: read_expr(r'A -> (exists x) -> Z') - ... except LogicalExpressionException as e: print(e) - Unexpected token: ')'. Expression expected. - A -> (exists x) -> Z - ^ - >>> try: read_expr(r'A -> (exists x.) -> Z') - ... except LogicalExpressionException as e: print(e) - Unexpected token: ')'. Expression expected. - A -> (exists x.) -> Z - ^ - >>> try: read_expr(r'A -> (\ ) -> Z') - ... except LogicalExpressionException as e: print(e) - ')' is an illegal variable name. Constants may not be abstracted. - A -> (\ ) -> Z - ^ - >>> try: read_expr(r'A -> (\ x) -> Z') - ... except LogicalExpressionException as e: print(e) - Unexpected token: ')'. Expression expected. - A -> (\ x) -> Z - ^ - >>> try: read_expr(r'A -> (\ x y) -> Z') - ... except LogicalExpressionException as e: print(e) - Unexpected token: ')'. Expression expected. - A -> (\ x y) -> Z - ^ - >>> try: read_expr(r'A -> (\ x.) -> Z') - ... except LogicalExpressionException as e: print(e) - Unexpected token: ')'. Expression expected. - A -> (\ x.) -> Z - ^ - >>> try: read_expr(r'A -> (P(x)Q(x)) -> Z') - ... except LogicalExpressionException as e: print(e) - Unexpected token: 'Q'. Expected token ')'. - A -> (P(x)Q(x)) -> Z - ^ - >>> try: read_expr(r'A -> ((P(x)Q(x)) -> Z') - ... except LogicalExpressionException as e: print(e) - Unexpected token: 'Q'. Expected token ')'. - A -> ((P(x)Q(x)) -> Z - ^ - >>> try: read_expr(r'A -> (all x y) -> Z') - ... except LogicalExpressionException as e: print(e) - Unexpected token: ')'. Expression expected. - A -> (all x y) -> Z - ^ - >>> try: read_expr(r'A -> (exists x y.) -> Z') - ... except LogicalExpressionException as e: print(e) - Unexpected token: ')'. Expression expected. - A -> (exists x y.) -> Z - ^ - >>> try: read_expr(r'A -> (exists x -> y) -> Z') - ... except LogicalExpressionException as e: print(e) - Unexpected token: '->'. Expression expected. - A -> (exists x -> y) -> Z - ^ diff --git a/pipeline/nltk/test/meteor.doctest b/pipeline/nltk/test/meteor.doctest deleted file mode 100644 index d7d924004601091811d6b58d52aa549849ada659..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/meteor.doctest +++ /dev/null @@ -1,54 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -.. -*- coding: utf-8 -*- - -============= -METEOR tests -============= - -No Alignment test ------------------- - - >>> from nltk.translate import meteor - >>> from nltk import word_tokenize - -If the candidate has no alignment to any of the references, the METEOR score is 0. - - >>> round(meteor( - ... [word_tokenize('The candidate has no alignment to any of the references')], - ... word_tokenize('John loves Mary') - ... ), 4) - 0.0 - -Tests based on wikipedia examples ---------------------------------- - -Testing on `wikipedia examples `_ - - >>> same_res = round(meteor( - ... [word_tokenize('The cat sat on the mat')], - ... word_tokenize('The cat sat on the mat') - ... ), 4) - >>> abs(same_res - 0.9977) < 1e-2 - True - - >>> meteor( - ... [word_tokenize('The cat sat on the mat')], - ... word_tokenize('on the mat sat the cat') - ... ) - 0.5 - - >>> round(meteor( - ... 
[word_tokenize('The cat sat on the mat')], - ... word_tokenize('The cat was sat on the mat') - ... ), 4) - 0.9654 - -Test corresponding to issue #2751, where METEOR score > 1 - - >>> round(meteor( - ... [word_tokenize('create or update a vm set')], - ... word_tokenize('creates or updates a virtual machine scale set') - ... ), 4) - 0.7806 diff --git a/pipeline/nltk/test/metrics.doctest b/pipeline/nltk/test/metrics.doctest deleted file mode 100644 index 9c51fd82bfdf6cb91185fbb4b40d1a01fdf72086..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/metrics.doctest +++ /dev/null @@ -1,321 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -======= -Metrics -======= - ------ -Setup ------ - - >>> import pytest - >>> _ = pytest.importorskip("numpy") - - -The `nltk.metrics` package provides a variety of *evaluation measures* -which can be used for a wide variety of NLP tasks. - - >>> from nltk.metrics import * - ------------------- -Standard IR Scores ------------------- - -We can use standard scores from information retrieval to test the -performance of taggers, chunkers, etc. - - >>> reference = 'DET NN VB DET JJ NN NN IN DET NN'.split() - >>> test = 'DET VB VB DET NN NN NN IN DET NN'.split() - >>> print(accuracy(reference, test)) - 0.8 - - -The following measures apply to sets: - - >>> reference_set = set(reference) - >>> test_set = set(test) - >>> precision(reference_set, test_set) - 1.0 - >>> print(recall(reference_set, test_set)) - 0.8 - >>> print(f_measure(reference_set, test_set)) - 0.88888888888... - -Measuring the likelihood of the data, given probability distributions: - - >>> from nltk import FreqDist, MLEProbDist - >>> pdist1 = MLEProbDist(FreqDist("aldjfalskfjaldsf")) - >>> pdist2 = MLEProbDist(FreqDist("aldjfalssjjlldss")) - >>> print(log_likelihood(['a', 'd'], [pdist1, pdist2])) - -2.7075187496... - - ----------------- -Distance Metrics ----------------- - -String edit distance (Levenshtein): - - >>> edit_distance("rain", "shine") - 3 - >>> edit_distance_align("shine", "shine") - [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)] - >>> edit_distance_align("rain", "brainy") - [(0, 0), (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (4, 6)] - >>> edit_distance_align("", "brainy") - [(0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6)] - >>> edit_distance_align("", "") - [(0, 0)] - -Other distance measures: - - >>> s1 = set([1,2,3,4]) - >>> s2 = set([3,4,5]) - >>> binary_distance(s1, s2) - 1.0 - >>> print(jaccard_distance(s1, s2)) - 0.6 - >>> print(masi_distance(s1, s2)) - 0.868 - ----------------------- -Miscellaneous Measures ----------------------- - -Rank Correlation works with two dictionaries mapping keys to ranks. -The dictionaries should have the same set of keys. - - >>> spearman_correlation({'e':1, 't':2, 'a':3}, {'e':1, 'a':2, 't':3}) - 0.5 - -Windowdiff uses a sliding window in comparing two segmentations of the same input (e.g. tokenizations, chunkings). -Segmentations are represented using strings of zeros and ones. - - >>> s1 = "000100000010" - >>> s2 = "000010000100" - >>> s3 = "100000010000" - >>> s4 = "000000000000" - >>> s5 = "111111111111" - >>> windowdiff(s1, s1, 3) - 0.0 - >>> abs(windowdiff(s1, s2, 3) - 0.3) < 1e-6 # windowdiff(s1, s2, 3) == 0.3 - True - >>> abs(windowdiff(s2, s3, 3) - 0.8) < 1e-6 # windowdiff(s2, s3, 3) == 0.8 - True - >>> windowdiff(s1, s4, 3) - 0.5 - >>> windowdiff(s1, s5, 3) - 1.0 - ----------------- -Confusion Matrix ----------------- - - >>> reference = 'This is the reference data. 
Testing 123. aoaeoeoe' - >>> test = 'Thos iz_the rifirenci data. Testeng 123. aoaeoeoe' - >>> print(ConfusionMatrix(reference, test)) - | . 1 2 3 T _ a c d e f g h i n o r s t z | - --+-------------------------------------------+ - |<8>. . . . . 1 . . . . . . . . . . . . . . | - . | .<2>. . . . . . . . . . . . . . . . . . . | - 1 | . .<1>. . . . . . . . . . . . . . . . . . | - 2 | . . .<1>. . . . . . . . . . . . . . . . . | - 3 | . . . .<1>. . . . . . . . . . . . . . . . | - T | . . . . .<2>. . . . . . . . . . . . . . . | - _ | . . . . . .<.>. . . . . . . . . . . . . . | - a | . . . . . . .<4>. . . . . . . . . . . . . | - c | . . . . . . . .<1>. . . . . . . . . . . . | - d | . . . . . . . . .<1>. . . . . . . . . . . | - e | . . . . . . . . . .<6>. . . 3 . . . . . . | - f | . . . . . . . . . . .<1>. . . . . . . . . | - g | . . . . . . . . . . . .<1>. . . . . . . . | - h | . . . . . . . . . . . . .<2>. . . . . . . | - i | . . . . . . . . . . 1 . . .<1>. 1 . . . . | - n | . . . . . . . . . . . . . . .<2>. . . . . | - o | . . . . . . . . . . . . . . . .<3>. . . . | - r | . . . . . . . . . . . . . . . . .<2>. . . | - s | . . . . . . . . . . . . . . . . . .<2>. 1 | - t | . . . . . . . . . . . . . . . . . . .<3>. | - z | . . . . . . . . . . . . . . . . . . . .<.>| - --+-------------------------------------------+ - (row = reference; col = test) - - - >>> cm = ConfusionMatrix(reference, test) - >>> print(cm.pretty_format(sort_by_count=True)) - | e a i o s t . T h n r 1 2 3 c d f g _ z | - --+-------------------------------------------+ - |<8>. . . . . . . . . . . . . . . . . . 1 . | - e | .<6>. 3 . . . . . . . . . . . . . . . . . | - a | . .<4>. . . . . . . . . . . . . . . . . . | - i | . 1 .<1>1 . . . . . . . . . . . . . . . . | - o | . . . .<3>. . . . . . . . . . . . . . . . | - s | . . . . .<2>. . . . . . . . . . . . . . 1 | - t | . . . . . .<3>. . . . . . . . . . . . . . | - . | . . . . . . .<2>. . . . . . . . . . . . . | - T | . . . . . . . .<2>. . . . . . . . . . . . | - h | . . . . . . . . .<2>. . . . . . . . . . . | - n | . . . . . . . . . .<2>. . . . . . . . . . | - r | . . . . . . . . . . .<2>. . . . . . . . . | - 1 | . . . . . . . . . . . .<1>. . . . . . . . | - 2 | . . . . . . . . . . . . .<1>. . . . . . . | - 3 | . . . . . . . . . . . . . .<1>. . . . . . | - c | . . . . . . . . . . . . . . .<1>. . . . . | - d | . . . . . . . . . . . . . . . .<1>. . . . | - f | . . . . . . . . . . . . . . . . .<1>. . . | - g | . . . . . . . . . . . . . . . . . .<1>. . | - _ | . . . . . . . . . . . . . . . . . . .<.>. | - z | . . . . . . . . . . . . . . . . . . . .<.>| - --+-------------------------------------------+ - (row = reference; col = test) - - - >>> print(cm.pretty_format(sort_by_count=True, truncate=10)) - | e a i o s t . T h | - --+---------------------+ - |<8>. . . . . . . . . | - e | .<6>. 3 . . . . . . | - a | . .<4>. . . . . . . | - i | . 1 .<1>1 . . . . . | - o | . . . .<3>. . . . . | - s | . . . . .<2>. . . . | - t | . . . . . .<3>. . . | - . | . . . . . . .<2>. . | - T | . . . . . . . .<2>. | - h | . . . . . . . . .<2>| - --+---------------------+ - (row = reference; col = test) - - - >>> print(cm.pretty_format(sort_by_count=True, truncate=10, values_in_chart=False)) - | 1 | - | 1 2 3 4 5 6 7 8 9 0 | - ---+---------------------+ - 1 |<8>. . . . . . . . . | - 2 | .<6>. 3 . . . . . . | - 3 | . .<4>. . . . . . . | - 4 | . 1 .<1>1 . . . . . | - 5 | . . . .<3>. . . . . | - 6 | . . . . .<2>. . . . | - 7 | . . . . . .<3>. . . | - 8 | . . . . . . .<2>. . | - 9 | . . . . . . . .<2>. | - 10 | . 
. . . . . . . .<2>| - ---+---------------------+ - (row = reference; col = test) - Value key: - 1: - 2: e - 3: a - 4: i - 5: o - 6: s - 7: t - 8: . - 9: T - 10: h - - -For "e", the number of true positives should be 6, while the number of false negatives is 3. -So, the recall ought to be 6 / (6 + 3): - - >>> cm.recall("e") # doctest: +ELLIPSIS - 0.666666... - -For "e", the false positive is just 1, so the precision should be 6 / (6 + 1): - - >>> cm.precision("e") # doctest: +ELLIPSIS - 0.857142... - -The f-measure with default value of ``alpha = 0.5`` should then be: - -* *1/(alpha/p + (1-alpha)/r) =* -* *1/(0.5/p + 0.5/r) =* -* *2pr / (p + r) =* -* *2 * 0.857142... * 0.666666... / (0.857142... + 0.666666...) =* -* *0.749999...* - - >>> cm.f_measure("e") # doctest: +ELLIPSIS - 0.749999... - --------------------- -Association measures --------------------- - -These measures are useful to determine whether the coocurrence of two random -events is meaningful. They are used, for instance, to distinguish collocations -from other pairs of adjacent words. - -We bring some examples of bigram association calculations from Manning and -Schutze's SNLP, 2nd Ed. chapter 5. - - >>> n_new_companies, n_new, n_companies, N = 8, 15828, 4675, 14307668 - >>> bam = BigramAssocMeasures - >>> bam.raw_freq(20, (42, 20), N) == 20. / N - True - >>> bam.student_t(n_new_companies, (n_new, n_companies), N) - 0.999... - >>> bam.chi_sq(n_new_companies, (n_new, n_companies), N) - 1.54... - >>> bam.likelihood_ratio(150, (12593, 932), N) - 1291... - -For other associations, we ensure the ordering of the measures: - - >>> bam.mi_like(20, (42, 20), N) > bam.mi_like(20, (41, 27), N) - True - >>> bam.pmi(20, (42, 20), N) > bam.pmi(20, (41, 27), N) - True - >>> bam.phi_sq(20, (42, 20), N) > bam.phi_sq(20, (41, 27), N) - True - >>> bam.poisson_stirling(20, (42, 20), N) > bam.poisson_stirling(20, (41, 27), N) - True - >>> bam.jaccard(20, (42, 20), N) > bam.jaccard(20, (41, 27), N) - True - >>> bam.dice(20, (42, 20), N) > bam.dice(20, (41, 27), N) - True - >>> bam.fisher(20, (42, 20), N) > bam.fisher(20, (41, 27), N) # doctest: +SKIP - False - -For trigrams, we have to provide more count information: - - >>> n_w1_w2_w3 = 20 - >>> n_w1_w2, n_w1_w3, n_w2_w3 = 35, 60, 40 - >>> pair_counts = (n_w1_w2, n_w1_w3, n_w2_w3) - >>> n_w1, n_w2, n_w3 = 100, 200, 300 - >>> uni_counts = (n_w1, n_w2, n_w3) - >>> N = 14307668 - >>> tam = TrigramAssocMeasures - >>> tam.raw_freq(n_w1_w2_w3, pair_counts, uni_counts, N) == 1. 
* n_w1_w2_w3 / N - True - >>> uni_counts2 = (n_w1, n_w2, 100) - >>> tam.student_t(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.student_t(n_w1_w2_w3, pair_counts, uni_counts, N) - True - >>> tam.chi_sq(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.chi_sq(n_w1_w2_w3, pair_counts, uni_counts, N) - True - >>> tam.mi_like(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.mi_like(n_w1_w2_w3, pair_counts, uni_counts, N) - True - >>> tam.pmi(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.pmi(n_w1_w2_w3, pair_counts, uni_counts, N) - True - >>> tam.likelihood_ratio(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.likelihood_ratio(n_w1_w2_w3, pair_counts, uni_counts, N) - True - >>> tam.poisson_stirling(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.poisson_stirling(n_w1_w2_w3, pair_counts, uni_counts, N) - True - >>> tam.jaccard(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.jaccard(n_w1_w2_w3, pair_counts, uni_counts, N) - True - - -For fourgrams, we have to provide more count information: - - >>> n_w1_w2_w3_w4 = 5 - >>> n_w1_w2, n_w1_w3, n_w2_w3 = 35, 60, 40 - >>> n_w1_w2_w3, n_w2_w3_w4 = 20, 10 - >>> pair_counts = (n_w1_w2, n_w1_w3, n_w2_w3) - >>> triplet_counts = (n_w1_w2_w3, n_w2_w3_w4) - >>> n_w1, n_w2, n_w3, n_w4 = 100, 200, 300, 400 - >>> uni_counts = (n_w1, n_w2, n_w3, n_w4) - >>> N = 14307668 - >>> qam = QuadgramAssocMeasures - >>> qam.raw_freq(n_w1_w2_w3_w4, pair_counts, triplet_counts, uni_counts, N) == 1. * n_w1_w2_w3_w4 / N - True diff --git a/pipeline/nltk/test/misc.doctest b/pipeline/nltk/test/misc.doctest deleted file mode 100644 index 7b88b6d742e0917ee841187e3c792754c88dfbf2..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/misc.doctest +++ /dev/null @@ -1,118 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - --------------------------------------------------------------------------------- -Unit tests for the miscellaneous sort functions. --------------------------------------------------------------------------------- - - >>> from copy import deepcopy - >>> from nltk.misc.sort import * - -A (very) small list of unsorted integers. - - >>> test_data = [12, 67, 7, 28, 92, 56, 53, 720, 91, 57, 20, 20] - -Test each sorting method - each method returns the number of operations -required to sort the data, and sorts in-place (desctructively - hence the need -for multiple copies). - - >>> sorted_data = deepcopy(test_data) - >>> selection(sorted_data) - 66 - - >>> sorted_data - [7, 12, 20, 20, 28, 53, 56, 57, 67, 91, 92, 720] - - >>> sorted_data = deepcopy(test_data) - >>> bubble(sorted_data) - 30 - - >>> sorted_data - [7, 12, 20, 20, 28, 53, 56, 57, 67, 91, 92, 720] - - >>> sorted_data = deepcopy(test_data) - >>> merge(sorted_data) - 30 - - >>> sorted_data - [7, 12, 20, 20, 28, 53, 56, 57, 67, 91, 92, 720] - - >>> sorted_data = deepcopy(test_data) - >>> quick(sorted_data) - 13 - - >>> sorted_data - [7, 12, 20, 20, 28, 53, 56, 57, 67, 91, 92, 720] - --------------------------------------------------------------------------------- -Unit tests for Wordfinder class --------------------------------------------------------------------------------- - - >>> import random - - >>> # The following is not enough for reproducibility under Python 2/3 - >>> # (see https://bugs.python.org/issue9025) so this test is skipped. 
- >>> random.seed(12345) - - >>> from nltk.misc import wordfinder - >>> wordfinder.word_finder() # doctest: +SKIP - Word Finder - - J V L A I R O T A T I S I V O D E R E T - H U U B E A R O E P O C S O R E T N E P - A D A U Z E E S R A P P A L L M E N T R - C X A D Q S Z T P E O R S N G P J A D E - I G Y K K T I A A R G F I D T E L C N S - R E C N B H T R L T N N B W N T A O A I - A Y I L O E I A M E I A A Y U R P L L D - G L T V S T S F E A D I P H D O O H N I - R L S E C I N I L R N N M E C G R U E A - A A Y G I C E N L L E O I G Q R T A E L - M R C E T I S T A E T L L E U A E N R L - O U O T A S E E C S O O N H Y P A T G Y - E M H O M M D R E S F P U L T H C F N V - L A C A I M A M A N L B R U T E D O M I - O R I L N E E E E E U A R S C R Y L I P - H T R K E S N N M S I L A S R E V I N U - T X T A A O U T K S E T A R R E S I B J - A E D L E L J I F O O R P E L K N I R W - K H A I D E Q O P R I C K T I M B E R P - Z K D O O H G N I H T U R V E Y D R O P - - 1: INTERCHANGER - 2: TEARLESSNESS - 3: UNIVERSALISM - 4: DESENSITIZER - 5: INTERMENTION - 6: TRICHOCYSTIC - 7: EXTRAMURALLY - 8: VEGETOALKALI - 9: PALMELLACEAE - 10: AESTHETICISM - 11: PETROGRAPHER - 12: VISITATORIAL - 13: OLEOMARGARIC - 14: WRINKLEPROOF - 15: PRICKTIMBER - 16: PRESIDIALLY - 17: SCITAMINEAE - 18: ENTEROSCOPE - 19: APPALLMENT - 20: TURVEYDROP - 21: THINGHOOD - 22: BISERRATE - 23: GREENLAND - 24: BRUTEDOM - 25: POLONIAN - 26: ACOLHUAN - 27: LAPORTEA - 28: TENDING - 29: TEREDO - 30: MESOLE - 31: UNLIMP - 32: OSTARA - 33: PILY - 34: DUNT - 35: ONYX - 36: KATH - 37: JUNE diff --git a/pipeline/nltk/test/nonmonotonic.doctest b/pipeline/nltk/test/nonmonotonic.doctest deleted file mode 100644 index a570e05119733733829fd7db7813f7edfdc92f2e..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/nonmonotonic.doctest +++ /dev/null @@ -1,293 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -====================== -Nonmonotonic Reasoning -====================== - - >>> from nltk.test.setup_fixt import check_binary - >>> check_binary('mace4') - - >>> from nltk import * - >>> from nltk.inference.nonmonotonic import * - >>> from nltk.sem import logic - >>> logic._counter._value = 0 - >>> read_expr = logic.Expression.fromstring - ------------------------- -Closed Domain Assumption ------------------------- - -The only entities in the domain are those found in the assumptions or goal. -If the domain only contains "A" and "B", then the expression "exists x.P(x)" can -be replaced with "P(A) | P(B)" and an expression "all x.P(x)" can be replaced -with "P(A) & P(B)". 
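As a minimal sketch of the expansion itself (independent of the Prover9-based ClosedDomainProver examples below, and assuming only nltk.sem; the entity names A and B are hypothetical), the quantifiers can be grounded by hand over a known finite domain:

    from nltk.sem import Expression

    read_expr = Expression.fromstring

    # Hypothetical two-entity domain. Under the Closed Domain Assumption,
    # "exists x.P(x)" becomes a finite disjunction and "all x.P(x)" a
    # finite conjunction over the named entities.
    domain = ['A', 'B']
    print(read_expr(' | '.join(f'P({e})' for e in domain)))   # (P(A) | P(B))
    print(read_expr(' & '.join(f'P({e})' for e in domain)))   # (P(A) & P(B))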
- - >>> p1 = read_expr(r'all x.(man(x) -> mortal(x))') - >>> p2 = read_expr(r'man(Socrates)') - >>> c = read_expr(r'mortal(Socrates)') - >>> prover = Prover9Command(c, [p1,p2]) - >>> prover.prove() - True - >>> cdp = ClosedDomainProver(prover) - >>> for a in cdp.assumptions(): print(a) # doctest: +SKIP - (man(Socrates) -> mortal(Socrates)) - man(Socrates) - >>> cdp.prove() - True - - >>> p1 = read_expr(r'exists x.walk(x)') - >>> p2 = read_expr(r'man(Socrates)') - >>> c = read_expr(r'walk(Socrates)') - >>> prover = Prover9Command(c, [p1,p2]) - >>> prover.prove() - False - >>> cdp = ClosedDomainProver(prover) - >>> for a in cdp.assumptions(): print(a) # doctest: +SKIP - walk(Socrates) - man(Socrates) - >>> cdp.prove() - True - - >>> p1 = read_expr(r'exists x.walk(x)') - >>> p2 = read_expr(r'man(Socrates)') - >>> p3 = read_expr(r'-walk(Bill)') - >>> c = read_expr(r'walk(Socrates)') - >>> prover = Prover9Command(c, [p1,p2,p3]) - >>> prover.prove() - False - >>> cdp = ClosedDomainProver(prover) - >>> for a in cdp.assumptions(): print(a) # doctest: +SKIP - (walk(Socrates) | walk(Bill)) - man(Socrates) - -walk(Bill) - >>> cdp.prove() - True - - >>> p1 = read_expr(r'walk(Socrates)') - >>> p2 = read_expr(r'walk(Bill)') - >>> c = read_expr(r'all x.walk(x)') - >>> prover = Prover9Command(c, [p1,p2]) - >>> prover.prove() - False - >>> cdp = ClosedDomainProver(prover) - >>> for a in cdp.assumptions(): print(a) # doctest: +SKIP - walk(Socrates) - walk(Bill) - >>> print(cdp.goal()) # doctest: +SKIP - (walk(Socrates) & walk(Bill)) - >>> cdp.prove() - True - - >>> p1 = read_expr(r'girl(mary)') - >>> p2 = read_expr(r'dog(rover)') - >>> p3 = read_expr(r'all x.(girl(x) -> -dog(x))') - >>> p4 = read_expr(r'all x.(dog(x) -> -girl(x))') - >>> p5 = read_expr(r'chase(mary, rover)') - >>> c = read_expr(r'exists y.(dog(y) & all x.(girl(x) -> chase(x,y)))') - >>> prover = Prover9Command(c, [p1,p2,p3,p4,p5]) - >>> print(prover.prove()) - False - >>> cdp = ClosedDomainProver(prover) - >>> for a in cdp.assumptions(): print(a) # doctest: +SKIP - girl(mary) - dog(rover) - ((girl(rover) -> -dog(rover)) & (girl(mary) -> -dog(mary))) - ((dog(rover) -> -girl(rover)) & (dog(mary) -> -girl(mary))) - chase(mary,rover) - >>> print(cdp.goal()) # doctest: +SKIP - ((dog(rover) & (girl(rover) -> chase(rover,rover)) & (girl(mary) -> chase(mary,rover))) | (dog(mary) & (girl(rover) -> chase(rover,mary)) & (girl(mary) -> chase(mary,mary)))) - >>> print(cdp.prove()) - True - ------------------------ -Unique Names Assumption ------------------------ - -No two entities in the domain represent the same entity unless it can be -explicitly proven that they do. Therefore, if the domain contains "A" and "B", -then add the assumption "-(A = B)" if it is not the case that -" \|- (A = B)". 
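A minimal sketch of what these added assumptions look like (only the pairwise distinctness formulas; the provability check that UniqueNamesProver performs before adding each one is omitted, and the constant names are hypothetical):

    from itertools import combinations

    from nltk.sem import Expression

    read_expr = Expression.fromstring

    # Hypothetical domain constants. Under the Unique Names Assumption,
    # every distinct pair of names is assumed to denote distinct entities.
    constants = ['Socrates', 'Bill', 'William']
    for a, b in combinations(constants, 2):
        print(read_expr(f'-({a} = {b})'))
    # -(Socrates = Bill)
    # -(Socrates = William)
    # -(Bill = William)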
- - >>> p1 = read_expr(r'man(Socrates)') - >>> p2 = read_expr(r'man(Bill)') - >>> c = read_expr(r'exists x.exists y.-(x = y)') - >>> prover = Prover9Command(c, [p1,p2]) - >>> prover.prove() - False - >>> unp = UniqueNamesProver(prover) - >>> for a in unp.assumptions(): print(a) # doctest: +SKIP - man(Socrates) - man(Bill) - -(Socrates = Bill) - >>> unp.prove() - True - - >>> p1 = read_expr(r'all x.(walk(x) -> (x = Socrates))') - >>> p2 = read_expr(r'Bill = William') - >>> p3 = read_expr(r'Bill = Billy') - >>> c = read_expr(r'-walk(William)') - >>> prover = Prover9Command(c, [p1,p2,p3]) - >>> prover.prove() - False - >>> unp = UniqueNamesProver(prover) - >>> for a in unp.assumptions(): print(a) # doctest: +SKIP - all x.(walk(x) -> (x = Socrates)) - (Bill = William) - (Bill = Billy) - -(William = Socrates) - -(Billy = Socrates) - -(Socrates = Bill) - >>> unp.prove() - True - ------------------------ -Closed World Assumption ------------------------ - -The only entities that have certain properties are those that is it stated -have the properties. We accomplish this assumption by "completing" predicates. - -If the assumptions contain "P(A)", then "all x.(P(x) -> (x=A))" is the completion -of "P". If the assumptions contain "all x.(ostrich(x) -> bird(x))", then -"all x.(bird(x) -> ostrich(x))" is the completion of "bird". If the -assumptions don't contain anything that are "P", then "all x.-P(x)" is the -completion of "P". - - >>> p1 = read_expr(r'walk(Socrates)') - >>> p2 = read_expr(r'-(Socrates = Bill)') - >>> c = read_expr(r'-walk(Bill)') - >>> prover = Prover9Command(c, [p1,p2]) - >>> prover.prove() - False - >>> cwp = ClosedWorldProver(prover) - >>> for a in cwp.assumptions(): print(a) # doctest: +SKIP - walk(Socrates) - -(Socrates = Bill) - all z1.(walk(z1) -> (z1 = Socrates)) - >>> cwp.prove() - True - - >>> p1 = read_expr(r'see(Socrates, John)') - >>> p2 = read_expr(r'see(John, Mary)') - >>> p3 = read_expr(r'-(Socrates = John)') - >>> p4 = read_expr(r'-(John = Mary)') - >>> c = read_expr(r'-see(Socrates, Mary)') - >>> prover = Prover9Command(c, [p1,p2,p3,p4]) - >>> prover.prove() - False - >>> cwp = ClosedWorldProver(prover) - >>> for a in cwp.assumptions(): print(a) # doctest: +SKIP - see(Socrates,John) - see(John,Mary) - -(Socrates = John) - -(John = Mary) - all z3 z4.(see(z3,z4) -> (((z3 = Socrates) & (z4 = John)) | ((z3 = John) & (z4 = Mary)))) - >>> cwp.prove() - True - - >>> p1 = read_expr(r'all x.(ostrich(x) -> bird(x))') - >>> p2 = read_expr(r'bird(Tweety)') - >>> p3 = read_expr(r'-ostrich(Sam)') - >>> p4 = read_expr(r'Sam != Tweety') - >>> c = read_expr(r'-bird(Sam)') - >>> prover = Prover9Command(c, [p1,p2,p3,p4]) - >>> prover.prove() - False - >>> cwp = ClosedWorldProver(prover) - >>> for a in cwp.assumptions(): print(a) # doctest: +SKIP - all x.(ostrich(x) -> bird(x)) - bird(Tweety) - -ostrich(Sam) - -(Sam = Tweety) - all z7.-ostrich(z7) - all z8.(bird(z8) -> ((z8 = Tweety) | ostrich(z8))) - >>> print(cwp.prove()) - True - ------------------------ -Multi-Decorator Example ------------------------ - -Decorators can be nested to utilize multiple assumptions. 
- - >>> p1 = read_expr(r'see(Socrates, John)') - >>> p2 = read_expr(r'see(John, Mary)') - >>> c = read_expr(r'-see(Socrates, Mary)') - >>> prover = Prover9Command(c, [p1,p2]) - >>> print(prover.prove()) - False - >>> cmd = ClosedDomainProver(UniqueNamesProver(ClosedWorldProver(prover))) - >>> print(cmd.prove()) - True - ------------------ -Default Reasoning ------------------ - >>> logic._counter._value = 0 - >>> premises = [] - -define the taxonomy - - >>> premises.append(read_expr(r'all x.(elephant(x) -> animal(x))')) - >>> premises.append(read_expr(r'all x.(bird(x) -> animal(x))')) - >>> premises.append(read_expr(r'all x.(dove(x) -> bird(x))')) - >>> premises.append(read_expr(r'all x.(ostrich(x) -> bird(x))')) - >>> premises.append(read_expr(r'all x.(flying_ostrich(x) -> ostrich(x))')) - -default the properties using abnormalities - - >>> premises.append(read_expr(r'all x.((animal(x) & -Ab1(x)) -> -fly(x))')) #normal animals don't fly - >>> premises.append(read_expr(r'all x.((bird(x) & -Ab2(x)) -> fly(x))')) #normal birds fly - >>> premises.append(read_expr(r'all x.((ostrich(x) & -Ab3(x)) -> -fly(x))')) #normal ostriches don't fly - -specify abnormal entities - - >>> premises.append(read_expr(r'all x.(bird(x) -> Ab1(x))')) #flight - >>> premises.append(read_expr(r'all x.(ostrich(x) -> Ab2(x))')) #non-flying bird - >>> premises.append(read_expr(r'all x.(flying_ostrich(x) -> Ab3(x))')) #flying ostrich - -define entities - - >>> premises.append(read_expr(r'elephant(el)')) - >>> premises.append(read_expr(r'dove(do)')) - >>> premises.append(read_expr(r'ostrich(os)')) - -print the augmented assumptions list - - >>> prover = Prover9Command(None, premises) - >>> command = UniqueNamesProver(ClosedWorldProver(prover)) - >>> for a in command.assumptions(): print(a) # doctest: +SKIP - all x.(elephant(x) -> animal(x)) - all x.(bird(x) -> animal(x)) - all x.(dove(x) -> bird(x)) - all x.(ostrich(x) -> bird(x)) - all x.(flying_ostrich(x) -> ostrich(x)) - all x.((animal(x) & -Ab1(x)) -> -fly(x)) - all x.((bird(x) & -Ab2(x)) -> fly(x)) - all x.((ostrich(x) & -Ab3(x)) -> -fly(x)) - all x.(bird(x) -> Ab1(x)) - all x.(ostrich(x) -> Ab2(x)) - all x.(flying_ostrich(x) -> Ab3(x)) - elephant(el) - dove(do) - ostrich(os) - all z1.(animal(z1) -> (elephant(z1) | bird(z1))) - all z2.(Ab1(z2) -> bird(z2)) - all z3.(bird(z3) -> (dove(z3) | ostrich(z3))) - all z4.(dove(z4) -> (z4 = do)) - all z5.(Ab2(z5) -> ostrich(z5)) - all z6.(Ab3(z6) -> flying_ostrich(z6)) - all z7.(ostrich(z7) -> ((z7 = os) | flying_ostrich(z7))) - all z8.-flying_ostrich(z8) - all z9.(elephant(z9) -> (z9 = el)) - -(el = os) - -(el = do) - -(os = do) - - >>> UniqueNamesProver(ClosedWorldProver(Prover9Command(read_expr('-fly(el)'), premises))).prove() - True - >>> UniqueNamesProver(ClosedWorldProver(Prover9Command(read_expr('fly(do)'), premises))).prove() - True - >>> UniqueNamesProver(ClosedWorldProver(Prover9Command(read_expr('-fly(os)'), premises))).prove() - True diff --git a/pipeline/nltk/test/paice.doctest b/pipeline/nltk/test/paice.doctest deleted file mode 100644 index 3759a44bd17ae6234b970b87ee39d6424e6d6f2c..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/paice.doctest +++ /dev/null @@ -1,35 +0,0 @@ - -===================================================== -PAICE's evaluation statistics for stemming algorithms -===================================================== - -Given a list of words with their real lemmas and stems according to stemming algorithm under evaluation, -counts Understemming Index (UI), Overstemming Index 
(OI), Stemming Weight (SW) and Error-rate relative to truncation (ERRT). - - >>> from nltk.metrics import Paice - - -------------------------------------- -Understemming and Overstemming values -------------------------------------- - - >>> lemmas = {'kneel': ['kneel', 'knelt'], - ... 'range': ['range', 'ranged'], - ... 'ring': ['ring', 'rang', 'rung']} - >>> stems = {'kneel': ['kneel'], - ... 'knelt': ['knelt'], - ... 'rang': ['rang', 'range', 'ranged'], - ... 'ring': ['ring'], - ... 'rung': ['rung']} - >>> p = Paice(lemmas, stems) - >>> p.gumt, p.gdmt, p.gwmt, p.gdnt - (4.0, 5.0, 2.0, 16.0) - - >>> p.ui, p.oi, p.sw - (0.8..., 0.125..., 0.15625...) - - >>> p.errt - 1.0 - - >>> [('{0:.3f}'.format(a), '{0:.3f}'.format(b)) for a, b in p.coords] - [('0.000', '1.000'), ('0.000', '0.375'), ('0.600', '0.125'), ('0.800', '0.125')] diff --git a/pipeline/nltk/test/parse.doctest b/pipeline/nltk/test/parse.doctest deleted file mode 100644 index 13e107e3faa103ea345da5733aa84fe09163e10d..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/parse.doctest +++ /dev/null @@ -1,933 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -========= - Parsing -========= - -Unit tests for the Context Free Grammar class ---------------------------------------------- - - >>> import pickle - >>> import subprocess - >>> import sys - >>> from nltk import Nonterminal, nonterminals, Production, CFG - - >>> nt1 = Nonterminal('NP') - >>> nt2 = Nonterminal('VP') - - >>> nt1.symbol() - 'NP' - - >>> nt1 == Nonterminal('NP') - True - - >>> nt1 == nt2 - False - - >>> S, NP, VP, PP = nonterminals('S, NP, VP, PP') - >>> N, V, P, DT = nonterminals('N, V, P, DT') - - >>> prod1 = Production(S, [NP, VP]) - >>> prod2 = Production(NP, [DT, NP]) - - >>> prod1.lhs() - S - - >>> prod1.rhs() - (NP, VP) - - >>> prod1 == Production(S, [NP, VP]) - True - - >>> prod1 == prod2 - False - - >>> grammar = CFG.fromstring(""" - ... S -> NP VP - ... PP -> P NP - ... NP -> 'the' N | N PP | 'the' N PP - ... VP -> V NP | V PP | V NP PP - ... N -> 'cat' - ... N -> 'dog' - ... N -> 'rug' - ... V -> 'chased' - ... V -> 'sat' - ... P -> 'in' - ... P -> 'on' - ... """) - - >>> cmd = """import pickle - ... from nltk import Production - ... p = Production('S', ['NP', 'VP']) - ... print(pickle.dumps(p)) - ... """ - - >>> # Start a subprocess to simulate pickling in another process - >>> proc = subprocess.run([sys.executable, '-c', cmd], stdout=subprocess.PIPE) - >>> p1 = pickle.loads(eval(proc.stdout)) - >>> p2 = Production('S', ['NP', 'VP']) - >>> print(hash(p1) == hash(p2)) - True - -Unit tests for the rd (Recursive Descent Parser) class ------------------------------------------------------- - -Create and run a recursive descent parser over both a syntactically ambiguous -and unambiguous sentence. - - >>> from nltk.parse import RecursiveDescentParser - >>> rd = RecursiveDescentParser(grammar) - - >>> sentence1 = 'the cat chased the dog'.split() - >>> sentence2 = 'the cat chased the dog on the rug'.split() - - >>> for t in rd.parse(sentence1): - ... print(t) - (S (NP the (N cat)) (VP (V chased) (NP the (N dog)))) - - >>> for t in rd.parse(sentence2): - ... 
print(t) - (S - (NP the (N cat)) - (VP (V chased) (NP the (N dog) (PP (P on) (NP the (N rug)))))) - (S - (NP the (N cat)) - (VP (V chased) (NP the (N dog)) (PP (P on) (NP the (N rug))))) - - -(dolist (expr doctest-font-lock-keywords) - (add-to-list 'font-lock-keywords expr)) - - font-lock-keywords -(add-to-list 'font-lock-keywords - (car doctest-font-lock-keywords)) - - -Unit tests for the sr (Shift Reduce Parser) class -------------------------------------------------- - -Create and run a shift reduce parser over both a syntactically ambiguous -and unambiguous sentence. Note that unlike the recursive descent parser, one -and only one parse is ever returned. - - >>> from nltk.parse import ShiftReduceParser - >>> sr = ShiftReduceParser(grammar) - - >>> sentence1 = 'the cat chased the dog'.split() - >>> sentence2 = 'the cat chased the dog on the rug'.split() - - >>> for t in sr.parse(sentence1): - ... print(t) - (S (NP the (N cat)) (VP (V chased) (NP the (N dog)))) - - -The shift reduce parser uses heuristics to decide what to do when there are -multiple possible shift or reduce operations available - for the supplied -grammar clearly the wrong operation is selected. - - >>> for t in sr.parse(sentence2): - ... print(t) - - -Unit tests for the Chart Parser class -------------------------------------- - -We use the demo() function for testing. -We must turn off showing of times. - - >>> import nltk - -First we test tracing with a short sentence - - >>> nltk.parse.chart.demo(2, print_times=False, trace=1, - ... sent='I saw a dog', numparses=1) - * Sentence: - I saw a dog - ['I', 'saw', 'a', 'dog'] - - * Strategy: Bottom-up - - |. I . saw . a . dog .| - |[---------] . . .| [0:1] 'I' - |. [---------] . .| [1:2] 'saw' - |. . [---------] .| [2:3] 'a' - |. . . [---------]| [3:4] 'dog' - |> . . . .| [0:0] NP -> * 'I' - |[---------] . . .| [0:1] NP -> 'I' * - |> . . . .| [0:0] S -> * NP VP - |> . . . .| [0:0] NP -> * NP PP - |[---------> . . .| [0:1] S -> NP * VP - |[---------> . . .| [0:1] NP -> NP * PP - |. > . . .| [1:1] Verb -> * 'saw' - |. [---------] . .| [1:2] Verb -> 'saw' * - |. > . . .| [1:1] VP -> * Verb NP - |. > . . .| [1:1] VP -> * Verb - |. [---------> . .| [1:2] VP -> Verb * NP - |. [---------] . .| [1:2] VP -> Verb * - |. > . . .| [1:1] VP -> * VP PP - |[-------------------] . .| [0:2] S -> NP VP * - |. [---------> . .| [1:2] VP -> VP * PP - |. . > . .| [2:2] Det -> * 'a' - |. . [---------] .| [2:3] Det -> 'a' * - |. . > . .| [2:2] NP -> * Det Noun - |. . [---------> .| [2:3] NP -> Det * Noun - |. . . > .| [3:3] Noun -> * 'dog' - |. . . [---------]| [3:4] Noun -> 'dog' * - |. . [-------------------]| [2:4] NP -> Det Noun * - |. . > . .| [2:2] S -> * NP VP - |. . > . .| [2:2] NP -> * NP PP - |. [-----------------------------]| [1:4] VP -> Verb NP * - |. . [------------------->| [2:4] S -> NP * VP - |. . [------------------->| [2:4] NP -> NP * PP - |[=======================================]| [0:4] S -> NP VP * - |. [----------------------------->| [1:4] VP -> VP * PP - Nr edges in chart: 33 - (S (NP I) (VP (Verb saw) (NP (Det a) (Noun dog)))) - - -Then we test the different parsing Strategies. -Note that the number of edges differ between the strategies. - -Top-down - - >>> nltk.parse.chart.demo(1, print_times=False, trace=0, - ... 
sent='I saw John with a dog', numparses=2) - * Sentence: - I saw John with a dog - ['I', 'saw', 'John', 'with', 'a', 'dog'] - - * Strategy: Top-down - - Nr edges in chart: 48 - (S - (NP I) - (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog)))))) - (S - (NP I) - (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog))))) - - -Bottom-up - - >>> nltk.parse.chart.demo(2, print_times=False, trace=0, - ... sent='I saw John with a dog', numparses=2) - * Sentence: - I saw John with a dog - ['I', 'saw', 'John', 'with', 'a', 'dog'] - - * Strategy: Bottom-up - - Nr edges in chart: 53 - (S - (NP I) - (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog))))) - (S - (NP I) - (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog)))))) - - -Bottom-up Left-Corner - - >>> nltk.parse.chart.demo(3, print_times=False, trace=0, - ... sent='I saw John with a dog', numparses=2) - * Sentence: - I saw John with a dog - ['I', 'saw', 'John', 'with', 'a', 'dog'] - - * Strategy: Bottom-up left-corner - - Nr edges in chart: 36 - (S - (NP I) - (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog))))) - (S - (NP I) - (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog)))))) - - -Left-Corner with Bottom-Up Filter - - >>> nltk.parse.chart.demo(4, print_times=False, trace=0, - ... sent='I saw John with a dog', numparses=2) - * Sentence: - I saw John with a dog - ['I', 'saw', 'John', 'with', 'a', 'dog'] - - * Strategy: Filtered left-corner - - Nr edges in chart: 28 - (S - (NP I) - (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog))))) - (S - (NP I) - (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog)))))) - - -The stepping chart parser - - >>> nltk.parse.chart.demo(5, print_times=False, trace=1, - ... sent='I saw John with a dog', numparses=2) - * Sentence: - I saw John with a dog - ['I', 'saw', 'John', 'with', 'a', 'dog'] - - * Strategy: Stepping (top-down vs bottom-up) - - *** SWITCH TO TOP DOWN - |[------] . . . . .| [0:1] 'I' - |. [------] . . . .| [1:2] 'saw' - |. . [------] . . .| [2:3] 'John' - |. . . [------] . .| [3:4] 'with' - |. . . . [------] .| [4:5] 'a' - |. . . . . [------]| [5:6] 'dog' - |> . . . . . .| [0:0] S -> * NP VP - |> . . . . . .| [0:0] NP -> * NP PP - |> . . . . . .| [0:0] NP -> * Det Noun - |> . . . . . .| [0:0] NP -> * 'I' - |[------] . . . . .| [0:1] NP -> 'I' * - |[------> . . . . .| [0:1] S -> NP * VP - |[------> . . . . .| [0:1] NP -> NP * PP - |. > . . . . .| [1:1] VP -> * VP PP - |. > . . . . .| [1:1] VP -> * Verb NP - |. > . . . . .| [1:1] VP -> * Verb - |. > . . . . .| [1:1] Verb -> * 'saw' - |. [------] . . . .| [1:2] Verb -> 'saw' * - |. [------> . . . .| [1:2] VP -> Verb * NP - |. [------] . . . .| [1:2] VP -> Verb * - |[-------------] . . . .| [0:2] S -> NP VP * - |. [------> . . . .| [1:2] VP -> VP * PP - *** SWITCH TO BOTTOM UP - |. . > . . . .| [2:2] NP -> * 'John' - |. . . > . . .| [3:3] PP -> * 'with' NP - |. . . > . . .| [3:3] Prep -> * 'with' - |. . . . > . .| [4:4] Det -> * 'a' - |. . . . . > .| [5:5] Noun -> * 'dog' - |. . [------] . . .| [2:3] NP -> 'John' * - |. . . [------> . .| [3:4] PP -> 'with' * NP - |. . . [------] . .| [3:4] Prep -> 'with' * - |. . . . [------] .| [4:5] Det -> 'a' * - |. . . . . [------]| [5:6] Noun -> 'dog' * - |. [-------------] . . .| [1:3] VP -> Verb NP * - |[--------------------] . . .| [0:3] S -> NP VP * - |. [-------------> . . .| [1:3] VP -> VP * PP - |. . > . . . .| [2:2] S -> * NP VP - |. . > . . . .| [2:2] NP -> * NP PP - |. . . . > . .| [4:4] NP -> * Det Noun - |. . 
[------> . . .| [2:3] S -> NP * VP - |. . [------> . . .| [2:3] NP -> NP * PP - |. . . . [------> .| [4:5] NP -> Det * Noun - |. . . . [-------------]| [4:6] NP -> Det Noun * - |. . . [--------------------]| [3:6] PP -> 'with' NP * - |. [----------------------------------]| [1:6] VP -> VP PP * - *** SWITCH TO TOP DOWN - |. . > . . . .| [2:2] NP -> * Det Noun - |. . . . > . .| [4:4] NP -> * NP PP - |. . . > . . .| [3:3] VP -> * VP PP - |. . . > . . .| [3:3] VP -> * Verb NP - |. . . > . . .| [3:3] VP -> * Verb - |[=========================================]| [0:6] S -> NP VP * - |. [---------------------------------->| [1:6] VP -> VP * PP - |. . [---------------------------]| [2:6] NP -> NP PP * - |. . . . [------------->| [4:6] NP -> NP * PP - |. [----------------------------------]| [1:6] VP -> Verb NP * - |. . [--------------------------->| [2:6] S -> NP * VP - |. . [--------------------------->| [2:6] NP -> NP * PP - |[=========================================]| [0:6] S -> NP VP * - |. [---------------------------------->| [1:6] VP -> VP * PP - |. . . . . . >| [6:6] VP -> * VP PP - |. . . . . . >| [6:6] VP -> * Verb NP - |. . . . . . >| [6:6] VP -> * Verb - *** SWITCH TO BOTTOM UP - |. . . . > . .| [4:4] S -> * NP VP - |. . . . [------------->| [4:6] S -> NP * VP - *** SWITCH TO TOP DOWN - *** SWITCH TO BOTTOM UP - *** SWITCH TO TOP DOWN - *** SWITCH TO BOTTOM UP - *** SWITCH TO TOP DOWN - *** SWITCH TO BOTTOM UP - Nr edges in chart: 61 - (S - (NP I) - (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog))))) - (S - (NP I) - (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog)))))) - - - -Unit tests for the Incremental Chart Parser class -------------------------------------------------- - -The incremental chart parsers are defined in earleychart.py. -We use the demo() function for testing. We must turn off showing of times. - - >>> import nltk - -Earley Chart Parser - - >>> nltk.parse.earleychart.demo(print_times=False, trace=1, - ... sent='I saw John with a dog', numparses=2) - * Sentence: - I saw John with a dog - ['I', 'saw', 'John', 'with', 'a', 'dog'] - - |. I . saw . John . with . a . dog .| - |[------] . . . . .| [0:1] 'I' - |. [------] . . . .| [1:2] 'saw' - |. . [------] . . .| [2:3] 'John' - |. . . [------] . .| [3:4] 'with' - |. . . . [------] .| [4:5] 'a' - |. . . . . [------]| [5:6] 'dog' - |> . . . . . .| [0:0] S -> * NP VP - |> . . . . . .| [0:0] NP -> * NP PP - |> . . . . . .| [0:0] NP -> * Det Noun - |> . . . . . .| [0:0] NP -> * 'I' - |[------] . . . . .| [0:1] NP -> 'I' * - |[------> . . . . .| [0:1] S -> NP * VP - |[------> . . . . .| [0:1] NP -> NP * PP - |. > . . . . .| [1:1] VP -> * VP PP - |. > . . . . .| [1:1] VP -> * Verb NP - |. > . . . . .| [1:1] VP -> * Verb - |. > . . . . .| [1:1] Verb -> * 'saw' - |. [------] . . . .| [1:2] Verb -> 'saw' * - |. [------> . . . .| [1:2] VP -> Verb * NP - |. [------] . . . .| [1:2] VP -> Verb * - |[-------------] . . . .| [0:2] S -> NP VP * - |. [------> . . . .| [1:2] VP -> VP * PP - |. . > . . . .| [2:2] NP -> * NP PP - |. . > . . . .| [2:2] NP -> * Det Noun - |. . > . . . .| [2:2] NP -> * 'John' - |. . [------] . . .| [2:3] NP -> 'John' * - |. [-------------] . . .| [1:3] VP -> Verb NP * - |. . [------> . . .| [2:3] NP -> NP * PP - |. . . > . . .| [3:3] PP -> * 'with' NP - |[--------------------] . . .| [0:3] S -> NP VP * - |. [-------------> . . .| [1:3] VP -> VP * PP - |. . . [------> . .| [3:4] PP -> 'with' * NP - |. . . . > . .| [4:4] NP -> * NP PP - |. . . . > . .| [4:4] NP -> * Det Noun - |. . . . > . 
.| [4:4] Det -> * 'a' - |. . . . [------] .| [4:5] Det -> 'a' * - |. . . . [------> .| [4:5] NP -> Det * Noun - |. . . . . > .| [5:5] Noun -> * 'dog' - |. . . . . [------]| [5:6] Noun -> 'dog' * - |. . . . [-------------]| [4:6] NP -> Det Noun * - |. . . [--------------------]| [3:6] PP -> 'with' NP * - |. . . . [------------->| [4:6] NP -> NP * PP - |. . [---------------------------]| [2:6] NP -> NP PP * - |. [----------------------------------]| [1:6] VP -> VP PP * - |[=========================================]| [0:6] S -> NP VP * - |. [---------------------------------->| [1:6] VP -> VP * PP - |. [----------------------------------]| [1:6] VP -> Verb NP * - |. . [--------------------------->| [2:6] NP -> NP * PP - |[=========================================]| [0:6] S -> NP VP * - |. [---------------------------------->| [1:6] VP -> VP * PP - (S - (NP I) - (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog))))) - (S - (NP I) - (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog)))))) - - -Unit tests for LARGE context-free grammars ------------------------------------------- - -Reading the ATIS grammar. - - >>> grammar = nltk.data.load('grammars/large_grammars/atis.cfg') - >>> grammar - - -Reading the test sentences. - - >>> sentences = nltk.data.load('grammars/large_grammars/atis_sentences.txt') - >>> sentences = nltk.parse.util.extract_test_sentences(sentences) - >>> len(sentences) - 98 - >>> testsentence = sentences[22] - >>> testsentence[0] - ['show', 'me', 'northwest', 'flights', 'to', 'detroit', '.'] - >>> testsentence[1] - 17 - >>> sentence = testsentence[0] - -Now we test all different parsing strategies. -Note that the number of edges differ between the strategies. - -Bottom-up parsing. - - >>> parser = nltk.parse.BottomUpChartParser(grammar) - >>> chart = parser.chart_parse(sentence) - >>> print((chart.num_edges())) - 7661 - >>> print((len(list(chart.parses(grammar.start()))))) - 17 - -Bottom-up Left-corner parsing. - - >>> parser = nltk.parse.BottomUpLeftCornerChartParser(grammar) - >>> chart = parser.chart_parse(sentence) - >>> print((chart.num_edges())) - 4986 - >>> print((len(list(chart.parses(grammar.start()))))) - 17 - -Left-corner parsing with bottom-up filter. - - >>> parser = nltk.parse.LeftCornerChartParser(grammar) - >>> chart = parser.chart_parse(sentence) - >>> print((chart.num_edges())) - 1342 - >>> print((len(list(chart.parses(grammar.start()))))) - 17 - -Top-down parsing. - - >>> parser = nltk.parse.TopDownChartParser(grammar) - >>> chart = parser.chart_parse(sentence) - >>> print((chart.num_edges())) - 28352 - >>> print((len(list(chart.parses(grammar.start()))))) - 17 - -Incremental Bottom-up parsing. - - >>> parser = nltk.parse.IncrementalBottomUpChartParser(grammar) - >>> chart = parser.chart_parse(sentence) - >>> print((chart.num_edges())) - 7661 - >>> print((len(list(chart.parses(grammar.start()))))) - 17 - -Incremental Bottom-up Left-corner parsing. - - >>> parser = nltk.parse.IncrementalBottomUpLeftCornerChartParser(grammar) - >>> chart = parser.chart_parse(sentence) - >>> print((chart.num_edges())) - 4986 - >>> print((len(list(chart.parses(grammar.start()))))) - 17 - -Incremental Left-corner parsing with bottom-up filter. - - >>> parser = nltk.parse.IncrementalLeftCornerChartParser(grammar) - >>> chart = parser.chart_parse(sentence) - >>> print((chart.num_edges())) - 1342 - >>> print((len(list(chart.parses(grammar.start()))))) - 17 - -Incremental Top-down parsing. 
- - >>> parser = nltk.parse.IncrementalTopDownChartParser(grammar) - >>> chart = parser.chart_parse(sentence) - >>> print((chart.num_edges())) - 28352 - >>> print((len(list(chart.parses(grammar.start()))))) - 17 - -Earley parsing. This is similar to the incremental top-down algorithm. - - >>> parser = nltk.parse.EarleyChartParser(grammar) - >>> chart = parser.chart_parse(sentence) - >>> print((chart.num_edges())) - 28352 - >>> print((len(list(chart.parses(grammar.start()))))) - 17 - - -Unit tests for the Probabilistic CFG class ------------------------------------------- - - >>> from nltk.corpus import treebank - >>> from itertools import islice - >>> from nltk.grammar import PCFG, induce_pcfg - >>> toy_pcfg1 = PCFG.fromstring(""" - ... S -> NP VP [1.0] - ... NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15] - ... Det -> 'the' [0.8] | 'my' [0.2] - ... N -> 'man' [0.5] | 'telescope' [0.5] - ... VP -> VP PP [0.1] | V NP [0.7] | V [0.2] - ... V -> 'ate' [0.35] | 'saw' [0.65] - ... PP -> P NP [1.0] - ... P -> 'with' [0.61] | 'under' [0.39] - ... """) - - >>> toy_pcfg2 = PCFG.fromstring(""" - ... S -> NP VP [1.0] - ... VP -> V NP [.59] - ... VP -> V [.40] - ... VP -> VP PP [.01] - ... NP -> Det N [.41] - ... NP -> Name [.28] - ... NP -> NP PP [.31] - ... PP -> P NP [1.0] - ... V -> 'saw' [.21] - ... V -> 'ate' [.51] - ... V -> 'ran' [.28] - ... N -> 'boy' [.11] - ... N -> 'cookie' [.12] - ... N -> 'table' [.13] - ... N -> 'telescope' [.14] - ... N -> 'hill' [.5] - ... Name -> 'Jack' [.52] - ... Name -> 'Bob' [.48] - ... P -> 'with' [.61] - ... P -> 'under' [.39] - ... Det -> 'the' [.41] - ... Det -> 'a' [.31] - ... Det -> 'my' [.28] - ... """) - -Create a set of PCFG productions. - - >>> grammar = PCFG.fromstring(""" - ... A -> B B [.3] | C B C [.7] - ... B -> B D [.5] | C [.5] - ... C -> 'a' [.1] | 'b' [0.9] - ... D -> 'b' [1.0] - ... """) - >>> prod = grammar.productions()[0] - >>> prod - A -> B B [0.3] - - >>> prod.lhs() - A - - >>> prod.rhs() - (B, B) - - >>> print((prod.prob())) - 0.3 - - >>> grammar.start() - A - - >>> grammar.productions() - [A -> B B [0.3], A -> C B C [0.7], B -> B D [0.5], B -> C [0.5], C -> 'a' [0.1], C -> 'b' [0.9], D -> 'b' [1.0]] - -Induce some productions using parsed Treebank data. - - >>> productions = [] - >>> for fileid in treebank.fileids()[:2]: - ... for t in treebank.parsed_sents(fileid): - ... 
productions += t.productions() - - >>> grammar = induce_pcfg(S, productions) - >>> grammar - - - >>> sorted(grammar.productions(lhs=Nonterminal('PP')))[:2] - [PP -> IN NP [1.0]] - >>> sorted(grammar.productions(lhs=Nonterminal('NNP')))[:2] - [NNP -> 'Agnew' [0.0714286], NNP -> 'Consolidated' [0.0714286]] - >>> sorted(grammar.productions(lhs=Nonterminal('JJ')))[:2] - [JJ -> 'British' [0.142857], JJ -> 'former' [0.142857]] - >>> sorted(grammar.productions(lhs=Nonterminal('NP')))[:2] - [NP -> CD NNS [0.133333], NP -> DT JJ JJ NN [0.0666667]] - -Unit tests for the Probabilistic Chart Parse classes ----------------------------------------------------- - - >>> tokens = "Jack saw Bob with my cookie".split() - >>> grammar = toy_pcfg2 - >>> print(grammar) - Grammar with 23 productions (start state = S) - S -> NP VP [1.0] - VP -> V NP [0.59] - VP -> V [0.4] - VP -> VP PP [0.01] - NP -> Det N [0.41] - NP -> Name [0.28] - NP -> NP PP [0.31] - PP -> P NP [1.0] - V -> 'saw' [0.21] - V -> 'ate' [0.51] - V -> 'ran' [0.28] - N -> 'boy' [0.11] - N -> 'cookie' [0.12] - N -> 'table' [0.13] - N -> 'telescope' [0.14] - N -> 'hill' [0.5] - Name -> 'Jack' [0.52] - Name -> 'Bob' [0.48] - P -> 'with' [0.61] - P -> 'under' [0.39] - Det -> 'the' [0.41] - Det -> 'a' [0.31] - Det -> 'my' [0.28] - -Create several parsers using different queuing strategies and show the -resulting parses. - - >>> from nltk.parse import pchart - - >>> parser = pchart.InsideChartParser(grammar) - >>> for t in parser.parse(tokens): - ... print(t) - (S - (NP (Name Jack)) - (VP - (V saw) - (NP - (NP (Name Bob)) - (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06) - (S - (NP (Name Jack)) - (VP - (VP (V saw) (NP (Name Bob))) - (PP (P with) (NP (Det my) (N cookie))))) (p=2.03744e-07) - - >>> parser = pchart.RandomChartParser(grammar) - >>> for t in parser.parse(tokens): - ... print(t) - (S - (NP (Name Jack)) - (VP - (V saw) - (NP - (NP (Name Bob)) - (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06) - (S - (NP (Name Jack)) - (VP - (VP (V saw) (NP (Name Bob))) - (PP (P with) (NP (Det my) (N cookie))))) (p=2.03744e-07) - - >>> parser = pchart.UnsortedChartParser(grammar) - >>> for t in parser.parse(tokens): - ... print(t) - (S - (NP (Name Jack)) - (VP - (V saw) - (NP - (NP (Name Bob)) - (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06) - (S - (NP (Name Jack)) - (VP - (VP (V saw) (NP (Name Bob))) - (PP (P with) (NP (Det my) (N cookie))))) (p=2.03744e-07) - - >>> parser = pchart.LongestChartParser(grammar) - >>> for t in parser.parse(tokens): - ... print(t) - (S - (NP (Name Jack)) - (VP - (V saw) - (NP - (NP (Name Bob)) - (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06) - (S - (NP (Name Jack)) - (VP - (VP (V saw) (NP (Name Bob))) - (PP (P with) (NP (Det my) (N cookie))))) (p=2.03744e-07) - - >>> parser = pchart.InsideChartParser(grammar, beam_size = len(tokens)+1) - >>> for t in parser.parse(tokens): - ... print(t) - - -Unit tests for the Viterbi Parse classes ----------------------------------------- - - >>> from nltk.parse import ViterbiParser - >>> tokens = "Jack saw Bob with my cookie".split() - >>> grammar = toy_pcfg2 - -Parse the tokenized sentence. - - >>> parser = ViterbiParser(grammar) - >>> for t in parser.parse(tokens): - ... 
print(t) - (S - (NP (Name Jack)) - (VP - (V saw) - (NP - (NP (Name Bob)) - (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06) - - -Unit tests for the FeatStructNonterminal class ----------------------------------------------- - - >>> from nltk.grammar import FeatStructNonterminal - >>> FeatStructNonterminal( - ... pos='n', agr=FeatStructNonterminal(number='pl', gender='f')) - [agr=[gender='f', number='pl'], pos='n'] - - >>> FeatStructNonterminal('VP[+fin]/NP[+pl]') - VP[+fin]/NP[+pl] - - -Tracing the Feature Chart Parser --------------------------------- - -We use the featurechart.demo() function for tracing the Feature Chart Parser. - - >>> nltk.parse.featurechart.demo(print_times=False, - ... print_grammar=True, - ... parser=nltk.parse.featurechart.FeatureChartParser, - ... sent='I saw John with a dog') - - Grammar with 18 productions (start state = S[]) - S[] -> NP[] VP[] - PP[] -> Prep[] NP[] - NP[] -> NP[] PP[] - VP[] -> VP[] PP[] - VP[] -> Verb[] NP[] - VP[] -> Verb[] - NP[] -> Det[pl=?x] Noun[pl=?x] - NP[] -> 'John' - NP[] -> 'I' - Det[] -> 'the' - Det[] -> 'my' - Det[-pl] -> 'a' - Noun[-pl] -> 'dog' - Noun[-pl] -> 'cookie' - Verb[] -> 'ate' - Verb[] -> 'saw' - Prep[] -> 'with' - Prep[] -> 'under' - - * FeatureChartParser - Sentence: I saw John with a dog - |.I.s.J.w.a.d.| - |[-] . . . . .| [0:1] 'I' - |. [-] . . . .| [1:2] 'saw' - |. . [-] . . .| [2:3] 'John' - |. . . [-] . .| [3:4] 'with' - |. . . . [-] .| [4:5] 'a' - |. . . . . [-]| [5:6] 'dog' - |[-] . . . . .| [0:1] NP[] -> 'I' * - |[-> . . . . .| [0:1] S[] -> NP[] * VP[] {} - |[-> . . . . .| [0:1] NP[] -> NP[] * PP[] {} - |. [-] . . . .| [1:2] Verb[] -> 'saw' * - |. [-> . . . .| [1:2] VP[] -> Verb[] * NP[] {} - |. [-] . . . .| [1:2] VP[] -> Verb[] * - |. [-> . . . .| [1:2] VP[] -> VP[] * PP[] {} - |[---] . . . .| [0:2] S[] -> NP[] VP[] * - |. . [-] . . .| [2:3] NP[] -> 'John' * - |. . [-> . . .| [2:3] S[] -> NP[] * VP[] {} - |. . [-> . . .| [2:3] NP[] -> NP[] * PP[] {} - |. [---] . . .| [1:3] VP[] -> Verb[] NP[] * - |. [---> . . .| [1:3] VP[] -> VP[] * PP[] {} - |[-----] . . .| [0:3] S[] -> NP[] VP[] * - |. . . [-] . .| [3:4] Prep[] -> 'with' * - |. . . [-> . .| [3:4] PP[] -> Prep[] * NP[] {} - |. . . . [-] .| [4:5] Det[-pl] -> 'a' * - |. . . . [-> .| [4:5] NP[] -> Det[pl=?x] * Noun[pl=?x] {?x: False} - |. . . . . [-]| [5:6] Noun[-pl] -> 'dog' * - |. . . . [---]| [4:6] NP[] -> Det[-pl] Noun[-pl] * - |. . . . [--->| [4:6] S[] -> NP[] * VP[] {} - |. . . . [--->| [4:6] NP[] -> NP[] * PP[] {} - |. . . [-----]| [3:6] PP[] -> Prep[] NP[] * - |. . [-------]| [2:6] NP[] -> NP[] PP[] * - |. [---------]| [1:6] VP[] -> VP[] PP[] * - |. [--------->| [1:6] VP[] -> VP[] * PP[] {} - |[===========]| [0:6] S[] -> NP[] VP[] * - |. . [------->| [2:6] S[] -> NP[] * VP[] {} - |. . [------->| [2:6] NP[] -> NP[] * PP[] {} - |. [---------]| [1:6] VP[] -> Verb[] NP[] * - |. [--------->| [1:6] VP[] -> VP[] * PP[] {} - |[===========]| [0:6] S[] -> NP[] VP[] * - (S[] - (NP[] I) - (VP[] - (VP[] (Verb[] saw) (NP[] John)) - (PP[] (Prep[] with) (NP[] (Det[-pl] a) (Noun[-pl] dog))))) - (S[] - (NP[] I) - (VP[] - (Verb[] saw) - (NP[] - (NP[] John) - (PP[] (Prep[] with) (NP[] (Det[-pl] a) (Noun[-pl] dog)))))) - - -Unit tests for the Feature Chart Parser classes ------------------------------------------------ - -The list of parsers we want to test. - - >>> parsers = [nltk.parse.featurechart.FeatureChartParser, - ... nltk.parse.featurechart.FeatureTopDownChartParser, - ... nltk.parse.featurechart.FeatureBottomUpChartParser, - ... 
nltk.parse.featurechart.FeatureBottomUpLeftCornerChartParser, - ... nltk.parse.earleychart.FeatureIncrementalChartParser, - ... nltk.parse.earleychart.FeatureEarleyChartParser, - ... nltk.parse.earleychart.FeatureIncrementalTopDownChartParser, - ... nltk.parse.earleychart.FeatureIncrementalBottomUpChartParser, - ... nltk.parse.earleychart.FeatureIncrementalBottomUpLeftCornerChartParser, - ... ] - -A helper function that tests each parser on the given grammar and sentence. -We check that the number of trees are correct, and that all parsers -return the same trees. Otherwise an error is printed. - - >>> def unittest(grammar, sentence, nr_trees): - ... sentence = sentence.split() - ... trees = None - ... for P in parsers: - ... result = P(grammar).parse(sentence) - ... result = set(tree.freeze() for tree in result) - ... if len(result) != nr_trees: - ... print("Wrong nr of trees:", len(result)) - ... elif trees is None: - ... trees = result - ... elif result != trees: - ... print("Trees differ for parser:", P.__name__) - -The demo grammar from before, with an ambiguous sentence. - - >>> isawjohn = nltk.parse.featurechart.demo_grammar() - >>> unittest(isawjohn, "I saw John with a dog with my cookie", 5) - -This grammar tests that variables in different grammar rules are renamed -before unification. (The problematic variable is in this case ?X). - - >>> whatwasthat = nltk.grammar.FeatureGrammar.fromstring(''' - ... S[] -> NP[num=?N] VP[num=?N, slash=?X] - ... NP[num=?X] -> "what" - ... NP[num=?X] -> "that" - ... VP[num=?P, slash=none] -> V[num=?P] NP[] - ... V[num=sg] -> "was" - ... ''') - >>> unittest(whatwasthat, "what was that", 1) - -This grammar tests that the same rule can be used in different places -in another rule, and that the variables are properly renamed. - - >>> thislovesthat = nltk.grammar.FeatureGrammar.fromstring(''' - ... S[] -> NP[case=nom] V[] NP[case=acc] - ... NP[case=?X] -> Pron[case=?X] - ... Pron[] -> "this" - ... Pron[] -> "that" - ... V[] -> "loves" - ... ''') - >>> unittest(thislovesthat, "this loves that", 1) - - -Tests for loading feature grammar files ---------------------------------------- - -Alternative 1: first load the grammar, then create the parser. - - >>> fcfg = nltk.data.load('grammars/book_grammars/feat0.fcfg') - >>> fcp1 = nltk.parse.FeatureChartParser(fcfg) - >>> print((type(fcp1))) - - -Alternative 2: directly load the parser. - - >>> fcp2 = nltk.parse.load_parser('grammars/book_grammars/feat0.fcfg') - >>> print((type(fcp2))) - diff --git a/pipeline/nltk/test/portuguese_en.doctest b/pipeline/nltk/test/portuguese_en.doctest deleted file mode 100644 index aacaf1d16d375c318ab38c961e8c1094f81a1284..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/portuguese_en.doctest +++ /dev/null @@ -1,568 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -================================== -Examples for Portuguese Processing -================================== - -This HOWTO contains a variety of examples relating to the Portuguese language. -It is intended to be read in conjunction with the NLTK book -(``https://www.nltk.org/book/``). For instructions on running the Python -interpreter, please see the section *Getting Started with Python*, in Chapter 1. - --------------------------------------------- -Python Programming, with Portuguese Examples --------------------------------------------- - -Chapter 1 of the NLTK book contains many elementary programming examples, all -with English texts. 
In this section, we'll see some corresponding examples -using Portuguese. Please refer to the chapter for full discussion. *Vamos!* - - >>> from nltk.test.portuguese_en_fixt import setup_module - >>> setup_module() - - >>> from nltk.examples.pt import * - *** Introductory Examples for the NLTK Book *** - Loading ptext1, ... and psent1, ... - Type the name of the text or sentence to view it. - Type: 'texts()' or 'sents()' to list the materials. - ptext1: Memórias Póstumas de Brás Cubas (1881) - ptext2: Dom Casmurro (1899) - ptext3: Gênesis - ptext4: Folha de Sao Paulo (1994) - - -Any time we want to find out about these texts, we just have -to enter their names at the Python prompt: - - >>> ptext2 - - -Searching Text --------------- - -A concordance permits us to see words in context. - - >>> ptext1.concordance('olhos') - Building index... - Displaying 25 of 138 matches: - De pé , à cabeceira da cama , com os olhos estúpidos , a boca entreaberta , a t - orelhas . Pela minha parte fechei os olhos e deixei - me ir à ventura . Já agor - xões de cérebro enfermo . Como ia de olhos fechados , não via o caminho ; lembr - gelos eternos . Com efeito , abri os olhos e vi que o meu animal galopava numa - me apareceu então , fitando - me uns olhos rutilantes como o sol . Tudo nessa f - mim mesmo . Então , encarei - a com olhos súplices , e pedi mais alguns anos . - ... - -For a given word, we can find words with a similar text distribution: - - >>> ptext1.similar('chegar') - Building word-context index... - acabada acudir aludir avistar bramanismo casamento cheguei com contar - contrário corpo dali deixei desferirem dizer fazer filhos já leitor lhe - >>> ptext3.similar('chegar') - Building word-context index... - achar alumiar arrombar destruir governar guardar ir lavrar passar que - toda tomar ver vir - -We can search for the statistically significant collocations in a text: - - >>> ptext1.collocations() - Building collocations list - Quincas Borba; Lobo Neves; alguma coisa; Brás Cubas; meu pai; dia - seguinte; não sei; Meu pai; alguns instantes; outra vez; outra coisa; - por exemplo; mim mesmo; coisa nenhuma; mesma coisa; não era; dias - depois; Passeio Público; olhar para; das coisas - -We can search for words in context, with the help of *regular expressions*, e.g.: - - >>> ptext1.findall(" (<.*>)") - estúpidos; e; fechados; rutilantes; súplices; a; do; babavam; - na; moles; se; da; umas; espraiavam; chamejantes; espetados; - ... - -We can automatically generate random text based on a given text, e.g.: - - >>> ptext3.generate() # doctest: +SKIP - No princípio , criou Deus os abençoou , dizendo : Onde { estão } e até - à ave dos céus , { que } será . Disse mais Abrão : Dá - me a mulher - que tomaste ; porque daquele poço Eseque , { tinha .} E disse : Não - poderemos descer ; mas , do campo ainda não estava na casa do teu - pescoço . E viveu Serugue , depois Simeão e Levi { são } estes ? E o - varão , porque habitava na terra de Node , da mão de Esaú : Jeús , - Jalão e Corá - -Texts as List of Words ----------------------- - -A few sentences have been defined for you. - - >>> psent1 - ['o', 'amor', 'da', 'gl\xf3ria', 'era', 'a', 'coisa', 'mais', - 'verdadeiramente', 'humana', 'que', 'h\xe1', 'no', 'homem', ',', - 'e', ',', 'conseq\xfcentemente', ',', 'a', 'sua', 'mais', - 'genu\xedna', 'fei\xe7\xe3o', '.'] - >>> - -Notice that the sentence has been *tokenized*. Each token is -represented as a string, represented using quotes, e.g. ``'coisa'``. -Some strings contain special characters, e.g. 
``\xf3``, -the internal representation for ó. -The tokens are combined in the form of a *list*. How long is this list? - - >>> len(psent1) - 25 - >>> - -What is the vocabulary of this sentence? - - >>> sorted(set(psent1)) - [',', '.', 'a', 'amor', 'coisa', 'conseqüentemente', 'da', 'e', 'era', - 'feição', 'genuína', 'glória', 'homem', 'humana', 'há', 'mais', 'no', - 'o', 'que', 'sua', 'verdadeiramente'] - >>> - -Let's iterate over each item in ``psent2``, and print information for each: - - >>> for w in psent2: - ... print(w, len(w), w[-1]) - ... - Não 3 o - consultes 9 s - dicionários 11 s - . 1 . - -Observe how we make a human-readable version of a string, using ``decode()``. -Also notice that we accessed the last character of a string ``w`` using ``w[-1]``. - -We just saw a ``for`` loop above. Another useful control structure is a -*list comprehension*. - - >>> [w.upper() for w in psent2] - ['N\xc3O', 'CONSULTES', 'DICION\xc1RIOS', '.'] - >>> [w for w in psent1 if w.endswith('a')] - ['da', 'gl\xf3ria', 'era', 'a', 'coisa', 'humana', 'a', 'sua', 'genu\xedna'] - >>> [w for w in ptext4 if len(w) > 15] - ['norte-irlandeses', 'pan-nacionalismo', 'predominatemente', 'primeiro-ministro', - 'primeiro-ministro', 'irlandesa-americana', 'responsabilidades', 'significativamente'] - -We can examine the relative frequency of words in a text, using ``FreqDist``: - - >>> fd1 = FreqDist(ptext1) - >>> fd1 - - >>> fd1['olhos'] - 137 - >>> fd1.max() - ',' - >>> fd1.samples()[:100] - [',', '.', 'a', 'que', 'de', 'e', '-', 'o', ';', 'me', 'um', 'n\xe3o', - '\x97', 'se', 'do', 'da', 'uma', 'com', 'os', '\xe9', 'era', 'as', 'eu', - 'lhe', 'ao', 'em', 'para', 'mas', '...', '!', '\xe0', 'na', 'mais', '?', - 'no', 'como', 'por', 'N\xe3o', 'dos', 'o', 'ele', ':', 'Virg\xedlia', - 'me', 'disse', 'minha', 'das', 'O', '/', 'A', 'CAP\xcdTULO', 'muito', - 'depois', 'coisa', 'foi', 'sem', 'olhos', 'ela', 'nos', 'tinha', 'nem', - 'E', 'outro', 'vida', 'nada', 'tempo', 'menos', 'outra', 'casa', 'homem', - 'porque', 'quando', 'mim', 'mesmo', 'ser', 'pouco', 'estava', 'dia', - 't\xe3o', 'tudo', 'Mas', 'at\xe9', 'D', 'ainda', 's\xf3', 'alguma', - 'la', 'vez', 'anos', 'h\xe1', 'Era', 'pai', 'esse', 'lo', 'dizer', 'assim', - 'ent\xe3o', 'dizia', 'aos', 'Borba'] - ---------------- -Reading Corpora ---------------- - -Accessing the Machado Text Corpus ---------------------------------- - -NLTK includes the complete works of Machado de Assis. - - >>> from nltk.corpus import machado - >>> machado.fileids() - ['contos/macn001.txt', 'contos/macn002.txt', 'contos/macn003.txt', ...] - -Each file corresponds to one of the works of Machado de Assis. To see a complete -list of works, you can look at the corpus README file: ``print machado.readme()``. -Let's access the text of the *Posthumous Memories of Brás Cubas*. - -We can access the text as a list of characters, and access 200 characters starting -from position 10,000. - - >>> raw_text = machado.raw('romance/marm05.txt') - >>> raw_text[10000:10200] - u', primou no\nEstado, e foi um dos amigos particulares do vice-rei Conde - da Cunha.\n\nComo este apelido de Cubas lhe\ncheirasse excessivamente a - tanoaria, alegava meu pai, bisneto de Dami\xe3o, que o\ndito ape' - -However, this is not a very useful way to work with a text. We generally think -of a text as a sequence of words and punctuation, not characters: - - >>> text1 = machado.words('romance/marm05.txt') - >>> text1 - ['Romance', ',', 'Mem\xf3rias', 'P\xf3stumas', 'de', ...] 
- >>> len(text1) - 77098 - >>> len(set(text1)) - 10848 - -Here's a program that finds the most common ngrams that contain a -particular target word. - - >>> from nltk import ngrams, FreqDist - >>> target_word = 'olhos' - >>> fd = FreqDist(ng - ... for ng in ngrams(text1, 5) - ... if target_word in ng) - >>> for hit in fd.samples(): - ... print(' '.join(hit)) - ... - , com os olhos no - com os olhos no ar - com os olhos no chão - e todos com os olhos - me estar com os olhos - os olhos estúpidos , a - os olhos na costura , - os olhos no ar , - , com os olhos espetados - , com os olhos estúpidos - , com os olhos fitos - , com os olhos naquele - , com os olhos para - - -Accessing the MacMorpho Tagged Corpus -------------------------------------- - -NLTK includes the MAC-MORPHO Brazilian Portuguese POS-tagged news text, -with over a million words of -journalistic texts extracted from ten sections of -the daily newspaper *Folha de Sao Paulo*, 1994. - -We can access this corpus as a sequence of words or tagged words as follows: - - >>> import nltk.corpus - >>> nltk.corpus.mac_morpho.words() - ['Jersei', 'atinge', 'm\xe9dia', 'de', 'Cr$', '1,4', ...] - >>> nltk.corpus.mac_morpho.sents() - [['Jersei', 'atinge', 'm\xe9dia', 'de', 'Cr$', '1,4', 'milh\xe3o', - 'em', 'a', 'venda', 'de', 'a', 'Pinhal', 'em', 'S\xe3o', 'Paulo'], - ['Programe', 'sua', 'viagem', 'a', 'a', 'Exposi\xe7\xe3o', 'Nacional', - 'do', 'Zeb', ',', 'que', 'come\xe7a', 'dia', '25'], ...] - >>> nltk.corpus.mac_morpho.tagged_words() - [('Jersei', 'N'), ('atinge', 'V'), ('m\xe9dia', 'N'), ...] - -We can also access it in sentence chunks. - - >>> nltk.corpus.mac_morpho.tagged_sents() - [[('Jersei', 'N'), ('atinge', 'V'), ('m\xe9dia', 'N'), ('de', 'PREP'), - ('Cr$', 'CUR'), ('1,4', 'NUM'), ('milh\xe3o', 'N'), ('em', 'PREP|+'), - ('a', 'ART'), ('venda', 'N'), ('de', 'PREP|+'), ('a', 'ART'), - ('Pinhal', 'NPROP'), ('em', 'PREP'), ('S\xe3o', 'NPROP'), - ('Paulo', 'NPROP')], - [('Programe', 'V'), ('sua', 'PROADJ'), ('viagem', 'N'), ('a', 'PREP|+'), - ('a', 'ART'), ('Exposi\xe7\xe3o', 'NPROP'), ('Nacional', 'NPROP'), - ('do', 'NPROP'), ('Zeb', 'NPROP'), (',', ','), ('que', 'PRO-KS-REL'), - ('come\xe7a', 'V'), ('dia', 'N'), ('25', 'N|AP')], ...] - -This data can be used to train taggers (examples below for the Floresta treebank). - -Accessing the Floresta Portuguese Treebank ------------------------------------------- - -The NLTK data distribution includes the -"Floresta Sinta(c)tica Corpus" version 7.4, available from -``https://www.linguateca.pt/Floresta/``. - -We can access this corpus as a sequence of words or tagged words as follows: - - >>> from nltk.corpus import floresta - >>> floresta.words() - ['Um', 'revivalismo', 'refrescante', 'O', '7_e_Meio', ...] - >>> floresta.tagged_words() - [('Um', '>N+art'), ('revivalismo', 'H+n'), ...] - -The tags consist of some syntactic information, followed by a plus sign, -followed by a conventional part-of-speech tag. Let's strip off the material before -the plus sign: - - >>> def simplify_tag(t): - ... if "+" in t: - ... return t[t.index("+")+1:] - ... else: - ... 
return t - >>> twords = floresta.tagged_words() - >>> twords = [(w.lower(), simplify_tag(t)) for (w,t) in twords] - >>> twords[:10] - [('um', 'art'), ('revivalismo', 'n'), ('refrescante', 'adj'), ('o', 'art'), ('7_e_meio', 'prop'), - ('\xe9', 'v-fin'), ('um', 'art'), ('ex-libris', 'n'), ('de', 'prp'), ('a', 'art')] - -Pretty printing the tagged words: - - >>> print(' '.join(word + '/' + tag for (word, tag) in twords[:10])) - um/art revivalismo/n refrescante/adj o/art 7_e_meio/prop é/v-fin um/art ex-libris/n de/prp a/art - -Count the word tokens and types, and determine the most common word: - - >>> words = floresta.words() - >>> len(words) - 211852 - >>> fd = nltk.FreqDist(words) - >>> len(fd) - 29421 - >>> fd.max() - 'de' - -List the 20 most frequent tags, in order of decreasing frequency: - - >>> tags = [simplify_tag(tag) for (word,tag) in floresta.tagged_words()] - >>> fd = nltk.FreqDist(tags) - >>> fd.keys()[:20] - ['n', 'prp', 'art', 'v-fin', ',', 'prop', 'adj', 'adv', '.', - 'conj-c', 'v-inf', 'pron-det', 'v-pcp', 'num', 'pron-indp', - 'pron-pers', '\xab', '\xbb', 'conj-s', '}'] - -We can also access the corpus grouped by sentence: - - >>> floresta.sents() - [['Um', 'revivalismo', 'refrescante'], - ['O', '7_e_Meio', '\xe9', 'um', 'ex-libris', 'de', 'a', 'noite', - 'algarvia', '.'], ...] - >>> floresta.tagged_sents() - [[('Um', '>N+art'), ('revivalismo', 'H+n'), ('refrescante', 'N<+adj')], - [('O', '>N+art'), ('7_e_Meio', 'H+prop'), ('\xe9', 'P+v-fin'), - ('um', '>N+art'), ('ex-libris', 'H+n'), ('de', 'H+prp'), - ('a', '>N+art'), ('noite', 'H+n'), ('algarvia', 'N<+adj'), ('.', '.')], - ...] - >>> floresta.parsed_sents() - [Tree('UTT+np', [Tree('>N+art', ['Um']), Tree('H+n', ['revivalismo']), - Tree('N<+adj', ['refrescante'])]), - Tree('STA+fcl', - [Tree('SUBJ+np', [Tree('>N+art', ['O']), - Tree('H+prop', ['7_e_Meio'])]), - Tree('P+v-fin', ['\xe9']), - Tree('SC+np', - [Tree('>N+art', ['um']), - Tree('H+n', ['ex-libris']), - Tree('N<+pp', [Tree('H+prp', ['de']), - Tree('P<+np', [Tree('>N+art', ['a']), - Tree('H+n', ['noite']), - Tree('N<+adj', ['algarvia'])])])]), - Tree('.', ['.'])]), ...] - -To view a parse tree, use the ``draw()`` method, e.g.: - - >>> psents = floresta.parsed_sents() - >>> psents[5].draw() # doctest: +SKIP - -Character Encodings -------------------- - -Python understands the common character encoding used for Portuguese, ISO 8859-1 (ISO Latin 1). - - >>> import os, nltk.test - >>> testdir = os.path.split(nltk.test.__file__)[0] - >>> text = open(os.path.join(testdir, 'floresta.txt'), 'rb').read().decode('ISO 8859-1') - >>> text[:60] - 'O 7 e Meio \xe9 um ex-libris da noite algarvia.\n\xc9 uma das mais ' - >>> print(text[:60]) - O 7 e Meio é um ex-libris da noite algarvia. - É uma das mais - -For more information about character encodings and Python, please see section 3.3 of the book. - ----------------- -Processing Tasks ----------------- - - -Simple Concordancing --------------------- - -Here's a function that takes a word and a specified amount of context (measured -in characters), and generates a concordance for that word. - - >>> def concordance(word, context=30): - ... for sent in floresta.sents(): - ... if word in sent: - ... pos = sent.index(word) - ... left = ' '.join(sent[:pos]) - ... right = ' '.join(sent[pos+1:]) - ... print('%*s %s %-*s' % - ... (context, left[-context:], word, context, right[:context])) - - >>> concordance("dar") # doctest: +SKIP - anduru , foi o suficiente para dar a volta a o resultado . - 1. 
O P?BLICO veio dar a a imprensa di?ria portuguesa - A fartura de pensamento pode dar maus resultados e n?s n?o quer - Come?a a dar resultados a pol?tica de a Uni - ial come?ar a incorporar- lo e dar forma a um ' site ' que tem se - r com Constantino para ele lhe dar tamb?m os pap?is assinados . - va a brincar , pois n?o lhe ia dar procura??o nenhuma enquanto n? - ?rica como o ant?doto capaz de dar sentido a o seu enorme poder . - . . . - >>> concordance("vender") # doctest: +SKIP - er recebido uma encomenda para vender 4000 blindados a o Iraque . - m?rico_Amorim caso conseguisse vender o lote de ac??es de o empres?r - mpre ter jovens simp?ticos a ? vender ? chega ! } - Disse que o governo vai vender ? desde autom?vel at? particip - ndiciou ontem duas pessoas por vender carro com ?gio . - A inten??o de Fleury ? vender as a??es para equilibrar as fi - -Part-of-Speech Tagging ----------------------- - -Let's begin by getting the tagged sentence data, and simplifying the tags -as described earlier. - - >>> from nltk.corpus import floresta - >>> tsents = floresta.tagged_sents() - >>> tsents = [[(w.lower(),simplify_tag(t)) for (w,t) in sent] for sent in tsents if sent] - >>> train = tsents[100:] - >>> test = tsents[:100] - -We already know that ``n`` is the most common tag, so we can set up a -default tagger that tags every word as a noun, and see how well it does: - - >>> tagger0 = nltk.DefaultTagger('n') - >>> nltk.tag.accuracy(tagger0, test) - 0.17697228144989338 - -Evidently, about one in every six words is a noun. Let's improve on this by -training a unigram tagger: - - >>> tagger1 = nltk.UnigramTagger(train, backoff=tagger0) - >>> nltk.tag.accuracy(tagger1, test) - 0.87029140014214645 - -Next a bigram tagger: - - >>> tagger2 = nltk.BigramTagger(train, backoff=tagger1) - >>> nltk.tag.accuracy(tagger2, test) - 0.89019189765458417 - - -Sentence Segmentation ---------------------- - -Punkt is a language-neutral sentence segmentation tool. We - - >>> sent_tokenizer=nltk.data.load('tokenizers/punkt/portuguese.pickle') - >>> raw_text = machado.raw('romance/marm05.txt') - >>> sentences = sent_tokenizer.tokenize(raw_text) - >>> for sent in sentences[1000:1005]: - ... print("<<", sent, ">>") - ... - << Em verdade, parecia ainda mais mulher do que era; - seria criança nos seus folgares de moça; mas assim quieta, impassível, tinha a - compostura da mulher casada. >> - << Talvez essa circunstância lhe diminuía um pouco da - graça virginal. >> - << Depressa nos familiarizamos; a mãe fazia-lhe grandes elogios, eu - escutava-os de boa sombra, e ela sorria com os olhos fúlgidos, como se lá dentro - do cérebro lhe estivesse a voar uma borboletinha de asas de ouro e olhos de - diamante... >> - << Digo lá dentro, porque cá fora o - que esvoaçou foi uma borboleta preta, que subitamente penetrou na varanda, e - começou a bater as asas em derredor de D. Eusébia. >> - << D. Eusébia deu um grito, - levantou-se, praguejou umas palavras soltas: - T'esconjuro!... >> - -The sentence tokenizer can be trained and evaluated on other text. -The source text (from the Floresta Portuguese Treebank) contains one sentence per line. -We read the text, split it into its lines, and then join these lines together using -spaces. Now the information about sentence breaks has been discarded. 
We split this -material into training and testing data: - - >>> import os, nltk.test - >>> testdir = os.path.split(nltk.test.__file__)[0] - >>> text = open(os.path.join(testdir, 'floresta.txt'), 'rb').read().decode('ISO-8859-1') - >>> lines = text.split('\n') - >>> train = ' '.join(lines[10:]) - >>> test = ' '.join(lines[:10]) - -Now we train the sentence segmenter (or sentence tokenizer) and use it on our test sentences: - - >>> stok = nltk.PunktSentenceTokenizer(train) - >>> print(stok.tokenize(test)) - ['O 7 e Meio \xe9 um ex-libris da noite algarvia.', - '\xc9 uma das mais antigas discotecas do Algarve, situada em Albufeira, - que continua a manter os tra\xe7os decorativos e as clientelas de sempre.', - '\xc9 um pouco a vers\xe3o de uma esp\xe9cie de \xaboutro lado\xbb da noite, - a meio caminho entre os devaneios de uma fauna perif\xe9rica, seja de Lisboa, - Londres, Dublin ou Faro e Portim\xe3o, e a postura circunspecta dos fi\xe9is da casa, - que dela esperam a m\xfasica \xabgeracionista\xbb dos 60 ou dos 70.', - 'N\xe3o deixa de ser, nos tempos que correm, um certo \xabvery typical\xbb algarvio, - cabe\xe7a de cartaz para os que querem fugir a algumas movimenta\xe7\xf5es nocturnas - j\xe1 a caminho da ritualiza\xe7\xe3o de massas, do g\xe9nero \xabvamos todos ao - Calypso e encontramo-nos na Locomia\xbb.', - 'E assim, aos 2,5 milh\xf5es que o Minist\xe9rio do Planeamento e Administra\xe7\xe3o - do Territ\xf3rio j\xe1 gasta no pagamento do pessoal afecto a estes organismos, - v\xeam juntar-se os montantes das obras propriamente ditas, que os munic\xedpios, - j\xe1 com projectos na m\xe3o, v\xeam reivindicar junto do Executivo, como salienta - aquele membro do Governo.', - 'E o dinheiro \xabn\xe3o falta s\xf3 \xe0s c\xe2maras\xbb, lembra o secret\xe1rio de Estado, - que considera que a solu\xe7\xe3o para as autarquias \xe9 \xabespecializarem-se em - fundos comunit\xe1rios\xbb.', - 'Mas como, se muitas n\xe3o disp\xf5em, nos seus quadros, dos t\xe9cnicos necess\xe1rios?', - '\xabEncomendem-nos a projectistas de fora\xbb porque, se as obras vierem a ser financiadas, - eles at\xe9 saem de gra\xe7a, j\xe1 que, nesse caso, \xabos fundos comunit\xe1rios pagam - os projectos, o mesmo n\xe3o acontecendo quando eles s\xe3o feitos pelos GAT\xbb, - dado serem organismos do Estado.', - 'Essa poder\xe1 vir a ser uma hip\xf3tese, at\xe9 porque, no terreno, a capacidade dos GAT - est\xe1 cada vez mais enfraquecida.', - 'Alguns at\xe9 j\xe1 desapareceram, como o de Castro Verde, e outros t\xeam vindo a perder quadros.'] - -NLTK's data collection includes a trained model for Portuguese sentence -segmentation, which can be loaded as follows. It is faster to load a trained model than -to retrain it. - - >>> stok = nltk.data.load('tokenizers/punkt/portuguese.pickle') - -Stemming --------- - -NLTK includes the RSLP Portuguese stemmer. Here we use it to stem some Portuguese text: - - >>> stemmer = nltk.stem.RSLPStemmer() - >>> stemmer.stem("copiar") - 'copi' - >>> stemmer.stem("paisagem") - 'pais' - - -Stopwords ---------- - -NLTK includes Portuguese stopwords: - - >>> stopwords = nltk.corpus.stopwords.words('portuguese') - >>> stopwords[:10] - ['a', 'ao', 'aos', 'aquela', 'aquelas', 'aquele', 'aqueles', 'aquilo', 'as', 'at\xe9'] - -Now we can use these to filter text. Let's find the most frequent words (other than stopwords) -and print them in descending order of frequency: - - >>> fd = nltk.FreqDist(w.lower() for w in floresta.words() if w not in stopwords) - >>> for word in list(fd.keys())[:20]: - ... 
print(word, fd[word]) - , 13444 - . 7725 - « 2369 - » 2310 - é 1305 - o 1086 - } 1047 - { 1044 - a 897 - ; 633 - em 516 - ser 466 - sobre 349 - os 313 - anos 301 - ontem 292 - ainda 279 - segundo 256 - ter 249 - dois 231 diff --git a/pipeline/nltk/test/portuguese_en_fixt.py b/pipeline/nltk/test/portuguese_en_fixt.py deleted file mode 100644 index 1e86682b0810ef1299cf353ae606db9f9e9114d7..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/portuguese_en_fixt.py +++ /dev/null @@ -1,4 +0,0 @@ -def setup_module(): - import pytest - - pytest.skip("portuguese_en.doctest imports nltk.examples.pt which doesn't exist!") diff --git a/pipeline/nltk/test/probability.doctest b/pipeline/nltk/test/probability.doctest deleted file mode 100644 index f8f385dec2bf207684558068525b3ab9c9719d6b..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/probability.doctest +++ /dev/null @@ -1,306 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -=========== -Probability -=========== - - >>> from nltk.test.probability_fixt import setup_module - >>> setup_module() - - >>> import nltk - >>> from nltk.probability import * - -FreqDist --------- - - >>> text1 = ['no', 'good', 'fish', 'goes', 'anywhere', 'without', 'a', 'porpoise', '!'] - >>> text2 = ['no', 'good', 'porpoise', 'likes', 'to', 'fish', 'fish', 'anywhere', '.'] - - >>> fd1 = nltk.FreqDist(text1) - >>> fd1 == nltk.FreqDist(text1) - True - -Note that items are sorted in order of decreasing frequency; two items of the same frequency appear in indeterminate order. - - >>> import itertools - >>> both = nltk.FreqDist(text1 + text2) - >>> both_most_common = both.most_common() - >>> list(itertools.chain(*(sorted(ys) for k, ys in itertools.groupby(both_most_common, key=lambda t: t[1])))) - [('fish', 3), ('anywhere', 2), ('good', 2), ('no', 2), ('porpoise', 2), ('!', 1), ('.', 1), ('a', 1), ('goes', 1), ('likes', 1), ('to', 1), ('without', 1)] - - >>> both == fd1 + nltk.FreqDist(text2) - True - >>> fd1 == nltk.FreqDist(text1) # But fd1 is unchanged - True - - >>> fd2 = nltk.FreqDist(text2) - >>> fd1.update(fd2) - >>> fd1 == both - True - - >>> fd1 = nltk.FreqDist(text1) - >>> fd1.update(text2) - >>> fd1 == both - True - - >>> fd1 = nltk.FreqDist(text1) - >>> fd2 = nltk.FreqDist(fd1) - >>> fd2 == fd1 - True - -``nltk.FreqDist`` can be pickled: - - >>> import pickle - >>> fd1 = nltk.FreqDist(text1) - >>> pickled = pickle.dumps(fd1) - >>> fd1 == pickle.loads(pickled) - True - -Mathematical operations: - - >>> FreqDist('abbb') + FreqDist('bcc') - FreqDist({'b': 4, 'c': 2, 'a': 1}) - >>> FreqDist('abbbc') - FreqDist('bccd') - FreqDist({'b': 2, 'a': 1}) - >>> FreqDist('abbb') | FreqDist('bcc') - FreqDist({'b': 3, 'c': 2, 'a': 1}) - >>> FreqDist('abbb') & FreqDist('bcc') - FreqDist({'b': 1}) - -ConditionalFreqDist -------------------- - - >>> cfd1 = ConditionalFreqDist() - >>> cfd1[1] = FreqDist('abbbb') - >>> cfd1[2] = FreqDist('xxxxyy') - >>> cfd1 - - - >>> cfd2 = ConditionalFreqDist() - >>> cfd2[1] = FreqDist('bbccc') - >>> cfd2[2] = FreqDist('xxxyyyzz') - >>> cfd2[3] = FreqDist('m') - >>> cfd2 - - - >>> r = cfd1 + cfd2 - >>> [(i,r[i]) for i in r.conditions()] - [(1, FreqDist({'b': 6, 'c': 3, 'a': 1})), (2, FreqDist({'x': 7, 'y': 5, 'z': 2})), (3, FreqDist({'m': 1}))] - - >>> r = cfd1 - cfd2 - >>> [(i,r[i]) for i in r.conditions()] - [(1, FreqDist({'b': 2, 'a': 1})), (2, FreqDist({'x': 1}))] - - >>> r = cfd1 | cfd2 - >>> [(i,r[i]) for i in r.conditions()] - [(1, FreqDist({'b': 4, 'c': 3, 'a': 1})), (2, 
FreqDist({'x': 4, 'y': 3, 'z': 2})), (3, FreqDist({'m': 1}))] - - >>> r = cfd1 & cfd2 - >>> [(i,r[i]) for i in r.conditions()] - [(1, FreqDist({'b': 2})), (2, FreqDist({'x': 3, 'y': 2}))] - -Testing some HMM estimators ---------------------------- - -We extract a small part (500 sentences) of the Brown corpus - - >>> corpus = nltk.corpus.brown.tagged_sents(categories='adventure')[:500] - >>> print(len(corpus)) - 500 - -We create a HMM trainer - note that we need the tags and symbols -from the whole corpus, not just the training corpus - - >>> from nltk.util import unique_list - >>> tag_set = unique_list(tag for sent in corpus for (word,tag) in sent) - >>> print(len(tag_set)) - 92 - >>> symbols = unique_list(word for sent in corpus for (word,tag) in sent) - >>> print(len(symbols)) - 1464 - >>> trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols) - -We divide the corpus into 90% training and 10% testing - - >>> train_corpus = [] - >>> test_corpus = [] - >>> for i in range(len(corpus)): - ... if i % 10: - ... train_corpus += [corpus[i]] - ... else: - ... test_corpus += [corpus[i]] - >>> print(len(train_corpus)) - 450 - >>> print(len(test_corpus)) - 50 - -And now we can test the estimators - - >>> def train_and_test(est): - ... hmm = trainer.train_supervised(train_corpus, estimator=est) - ... print('%.2f%%' % (100 * hmm.accuracy(test_corpus))) - -Maximum Likelihood Estimation ------------------------------ -- this resulted in an initialization error before r7209 - - >>> mle = lambda fd, bins: MLEProbDist(fd) - >>> train_and_test(mle) - 22.75% - -Laplace (= Lidstone with gamma==1) - - >>> train_and_test(LaplaceProbDist) - 66.04% - -Expected Likelihood Estimation (= Lidstone with gamma==0.5) - - >>> train_and_test(ELEProbDist) - 73.01% - -Lidstone Estimation, for gamma==0.1, 0.5 and 1 -(the later two should be exactly equal to MLE and ELE above) - - >>> def lidstone(gamma): - ... return lambda fd, bins: LidstoneProbDist(fd, gamma, bins) - >>> train_and_test(lidstone(0.1)) - 82.51% - >>> train_and_test(lidstone(0.5)) - 73.01% - >>> train_and_test(lidstone(1.0)) - 66.04% - -Witten Bell Estimation ----------------------- -- This resulted in ZeroDivisionError before r7209 - - >>> train_and_test(WittenBellProbDist) - 88.12% - -Good Turing Estimation - - >>> gt = lambda fd, bins: SimpleGoodTuringProbDist(fd, bins=1e5) - >>> train_and_test(gt) - 86.93% - -Kneser Ney Estimation ---------------------- -Since the Kneser-Ney distribution is best suited for trigrams, we must adjust -our testing accordingly. - - >>> corpus = [[((x[0],y[0],z[0]),(x[1],y[1],z[1])) - ... for x, y, z in nltk.trigrams(sent)] - ... for sent in corpus[:100]] - -We will then need to redefine the rest of the training/testing variables - - >>> tag_set = unique_list(tag for sent in corpus for (word,tag) in sent) - >>> len(tag_set) - 906 - - >>> symbols = unique_list(word for sent in corpus for (word,tag) in sent) - >>> len(symbols) - 1341 - - >>> trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols) - >>> train_corpus = [] - >>> test_corpus = [] - - >>> for i in range(len(corpus)): - ... if i % 10: - ... train_corpus += [corpus[i]] - ... else: - ... 
test_corpus += [corpus[i]] - - >>> len(train_corpus) - 90 - >>> len(test_corpus) - 10 - - >>> kn = lambda fd, bins: KneserNeyProbDist(fd) - >>> train_and_test(kn) - 0.86% - -Remains to be added: -- Tests for HeldoutProbDist, CrossValidationProbDist and MutableProbDist - -Squashed bugs -------------- - -Issue 511: override pop and popitem to invalidate the cache - - >>> fd = nltk.FreqDist('a') - >>> list(fd.keys()) - ['a'] - >>> fd.pop('a') - 1 - >>> list(fd.keys()) - [] - -Issue 533: access cumulative frequencies with no arguments - - >>> fd = nltk.FreqDist('aab') - >>> list(fd._cumulative_frequencies(['a'])) - [2.0] - >>> list(fd._cumulative_frequencies(['a', 'b'])) - [2.0, 3.0] - -Issue 579: override clear to reset some variables - - >>> fd = FreqDist('aab') - >>> fd.clear() - >>> fd.N() - 0 - -Issue 351: fix fileids method of CategorizedCorpusReader to inadvertently -add errant categories - - >>> from nltk.corpus import brown - >>> brown.fileids('blah') - Traceback (most recent call last): - ... - ValueError: Category blah not found - >>> brown.categories() - ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction'] - -Issue 175: add the unseen bin to SimpleGoodTuringProbDist by default -otherwise any unseen events get a probability of zero, i.e., -they don't get smoothed - - >>> from nltk import SimpleGoodTuringProbDist, FreqDist - >>> fd = FreqDist({'a':1, 'b':1, 'c': 2, 'd': 3, 'e': 4, 'f': 4, 'g': 4, 'h': 5, 'i': 5, 'j': 6, 'k': 6, 'l': 6, 'm': 7, 'n': 7, 'o': 8, 'p': 9, 'q': 10}) - >>> p = SimpleGoodTuringProbDist(fd) - >>> p.prob('a') - 0.017649766667026317... - >>> p.prob('o') - 0.08433050215340411... - >>> p.prob('z') - 0.022727272727272728... - >>> p.prob('foobar') - 0.022727272727272728... - -``MLEProbDist``, ``ConditionalProbDist'', ``DictionaryConditionalProbDist`` and -``ConditionalFreqDist`` can be pickled: - - >>> import pickle - >>> pd = MLEProbDist(fd) - >>> sorted(pd.samples()) == sorted(pickle.loads(pickle.dumps(pd)).samples()) - True - >>> dpd = DictionaryConditionalProbDist({'x': pd}) - >>> unpickled = pickle.loads(pickle.dumps(dpd)) - >>> dpd['x'].prob('a') - 0.011363636... - >>> dpd['x'].prob('a') == unpickled['x'].prob('a') - True - >>> cfd = nltk.probability.ConditionalFreqDist() - >>> cfd['foo']['hello'] += 1 - >>> cfd['foo']['hello'] += 1 - >>> cfd['bar']['hello'] += 1 - >>> cfd2 = pickle.loads(pickle.dumps(cfd)) - >>> cfd2 == cfd - True - >>> cpd = ConditionalProbDist(cfd, SimpleGoodTuringProbDist) - >>> cpd2 = pickle.loads(pickle.dumps(cpd)) - >>> cpd['foo'].prob('hello') == cpd2['foo'].prob('hello') - True diff --git a/pipeline/nltk/test/probability_fixt.py b/pipeline/nltk/test/probability_fixt.py deleted file mode 100644 index a67809384d3780fa9d1b3efcf4ca51e10cd4be00..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/probability_fixt.py +++ /dev/null @@ -1,8 +0,0 @@ -# probability.doctest uses HMM which requires numpy; -# skip probability.doctest if numpy is not available - - -def setup_module(): - import pytest - - pytest.importorskip("numpy") diff --git a/pipeline/nltk/test/propbank.doctest b/pipeline/nltk/test/propbank.doctest deleted file mode 100644 index d7f9a98a4be23e050676205d11a776b30f7eb499..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/propbank.doctest +++ /dev/null @@ -1,176 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. 
For license information, see LICENSE.TXT - -======== -PropBank -======== - -The PropBank Corpus provides predicate-argument annotation for the -entire Penn Treebank. Each verb in the treebank is annotated by a single -instance in PropBank, containing information about the location of -the verb, and the location and identity of its arguments: - - >>> from nltk.corpus import propbank - >>> pb_instances = propbank.instances() - >>> print(pb_instances) - [, - , ...] - -Each propbank instance defines the following member variables: - - - Location information: `fileid`, `sentnum`, `wordnum` - - Annotator information: `tagger` - - Inflection information: `inflection` - - Roleset identifier: `roleset` - - Verb (aka predicate) location: `predicate` - - Argument locations and types: `arguments` - -The following examples show the types of these arguments: - - >>> inst = pb_instances[103] - >>> (inst.fileid, inst.sentnum, inst.wordnum) - ('wsj_0004.mrg', 8, 16) - >>> inst.tagger - 'gold' - >>> inst.inflection - - >>> infl = inst.inflection - >>> infl.form, infl.tense, infl.aspect, infl.person, infl.voice - ('v', 'p', '-', '-', 'a') - >>> inst.roleset - 'rise.01' - >>> inst.predicate - PropbankTreePointer(16, 0) - >>> inst.arguments - ((PropbankTreePointer(0, 2), 'ARG1'), - (PropbankTreePointer(13, 1), 'ARGM-DIS'), - (PropbankTreePointer(17, 1), 'ARG4-to'), - (PropbankTreePointer(20, 1), 'ARG3-from')) - -The location of the predicate and of the arguments are encoded using -`PropbankTreePointer` objects, as well as `PropbankChainTreePointer` -objects and `PropbankSplitTreePointer` objects. A -`PropbankTreePointer` consists of a `wordnum` and a `height`: - - >>> print(inst.predicate.wordnum, inst.predicate.height) - 16 0 - -This identifies the tree constituent that is headed by the word that -is the `wordnum`\ 'th token in the sentence, and whose span is found -by going `height` nodes up in the tree. This type of pointer is only -useful if we also have the corresponding tree structure, since it -includes empty elements such as traces in the word number count. The -trees for 10% of the standard PropBank Corpus are contained in the -`treebank` corpus: - - >>> tree = inst.tree - - >>> from nltk.corpus import treebank - >>> assert tree == treebank.parsed_sents(inst.fileid)[inst.sentnum] - - >>> inst.predicate.select(tree) - Tree('VBD', ['rose']) - >>> for (argloc, argid) in inst.arguments: - ... print('%-10s %s' % (argid, argloc.select(tree).pformat(500)[:50])) - ARG1 (NP-SBJ (NP (DT The) (NN yield)) (PP (IN on) (NP ( - ARGM-DIS (PP (IN for) (NP (NN example))) - ARG4-to (PP-DIR (TO to) (NP (CD 8.04) (NN %))) - ARG3-from (PP-DIR (IN from) (NP (CD 7.90) (NN %))) - -Propbank tree pointers can be converted to standard tree locations, -which are usually easier to work with, using the `treepos()` method: - - >>> treepos = inst.predicate.treepos(tree) - >>> print (treepos, tree[treepos]) - (4, 0) (VBD rose) - -In some cases, argument locations will be encoded using -`PropbankChainTreePointer`\ s (for trace chains) or -`PropbankSplitTreePointer`\ s (for discontinuous constituents). Both -of these objects contain a single member variable, `pieces`, -containing a list of the constituent pieces. They also define the -method `select()`, which will return a tree containing all the -elements of the argument. (A new head node is created, labeled -"*CHAIN*" or "*SPLIT*", since the argument is not a single constituent -in the original tree). 
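
(Editorial sketch, not part of the original doctest.) Before the concrete Sentence #6 example that follows, note that the surface string of any argument, whether simple, chained or split, can be read off the leaves of the constituent returned by ``select()``; this is an illustration only and is not executed here:

    >>> for (argloc, argid) in inst.arguments:  # doctest: +SKIP
    ...     print(argid, ' '.join(argloc.select(tree).leaves()))
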
Sentence #6 contains an example of an argument -that is both discontinuous and contains a chain: - - >>> inst = pb_instances[6] - >>> inst.roleset - 'expose.01' - >>> argloc, argid = inst.arguments[2] - >>> argloc - - >>> argloc.pieces - [, PropbankTreePointer(27, 0)] - >>> argloc.pieces[0].pieces - ... - [PropbankTreePointer(22, 1), PropbankTreePointer(24, 0), - PropbankTreePointer(25, 1)] - >>> print(argloc.select(inst.tree)) - (*CHAIN* - (*SPLIT* (NP (DT a) (NN group)) (IN of) (NP (NNS workers))) - (-NONE- *)) - -The PropBank Corpus also provides access to the frameset files, which -define the argument labels used by the annotations, on a per-verb -basis. Each frameset file contains one or more predicates, such as -'turn' or 'turn_on', each of which is divided into coarse-grained word -senses called rolesets. For each roleset, the frameset file provides -descriptions of the argument roles, along with examples. - - >>> expose_01 = propbank.roleset('expose.01') - >>> turn_01 = propbank.roleset('turn.01') - >>> print(turn_01) - - >>> for role in turn_01.findall("roles/role"): - ... print(role.attrib['n'], role.attrib['descr']) - 0 turner - 1 thing turning - m direction, location - - >>> from xml.etree import ElementTree - >>> print(ElementTree.tostring(turn_01.find('example')).decode('utf8').strip()) - - - John turned the key in the lock. - - John - turned - the key - in the lock - - -Note that the standard corpus distribution only contains 10% of the -treebank, so the parse trees are not available for instances starting -at 9353: - - >>> inst = pb_instances[9352] - >>> inst.fileid - 'wsj_0199.mrg' - >>> print(inst.tree) - (S (NP-SBJ (NNP Trinity)) (VP (VBD said) (SBAR (-NONE- 0) ...)) - >>> print(inst.predicate.select(inst.tree)) - (VB begin) - - >>> inst = pb_instances[9353] - >>> inst.fileid - 'wsj_0200.mrg' - >>> print(inst.tree) - None - >>> print(inst.predicate.select(inst.tree)) - Traceback (most recent call last): - . . . - ValueError: Parse tree not available - -However, if you supply your own version of the treebank corpus (by -putting it before the nltk-provided version on `nltk.data.path`, or -by creating a `ptb` directory as described above and using the -`propbank_ptb` module), then you can access the trees for all -instances. - -A list of the verb lemmas contained in PropBank is returned by the -`propbank.verbs()` method: - - >>> propbank.verbs() - ['abandon', 'abate', 'abdicate', 'abet', 'abide', ...] diff --git a/pipeline/nltk/test/relextract.doctest b/pipeline/nltk/test/relextract.doctest deleted file mode 100644 index 83c08d4bb0dfaed9efe430ff6b114de999445fd0..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/relextract.doctest +++ /dev/null @@ -1,263 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -====================== -Information Extraction -====================== - -Information Extraction standardly consists of three subtasks: - -#. Named Entity Recognition - -#. Relation Extraction - -#. Template Filling - -Named Entities -~~~~~~~~~~~~~~ - -The IEER corpus is marked up for a variety of Named Entities. A Named -Entity (more strictly, a Named Entity mention) is a name of an -entity belonging to a specified class. For example, the Named Entity -classes in IEER include PERSON, LOCATION, ORGANIZATION, DATE and so -on. 
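
(Editorial aside, not in the original doctest.) The IEER documents used below come with gold annotation; for plain text, NLTK's bundled chunker can produce the same kind of chunk-tree representation that is described next. A minimal sketch, assuming the ``punkt``, ``averaged_perceptron_tagger``, ``maxent_ne_chunker`` and ``words`` data packages are installed:

    >>> import nltk  # doctest: +SKIP
    >>> sent = "Ms. Cohn is a partner in the McGlashan & Sarrail firm in San Mateo, Calif."
    >>> ne_tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)))  # doctest: +SKIP
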
Within NLTK, Named Entities are represented as subtrees within a -chunk structure: the class name is treated as node label, while the -entity mention itself appears as the leaves of the subtree. This is -illustrated below, where we have show an extract of the chunk -representation of document NYT_19980315.064: - - >>> from nltk.corpus import ieer - >>> docs = ieer.parsed_docs('NYT_19980315') - >>> tree = docs[1].text - >>> print(tree) - (DOCUMENT - ... - ``It's - a - chance - to - think - about - first-level - questions,'' - said - Ms. - (PERSON Cohn) - , - a - partner - in - the - (ORGANIZATION McGlashan & Sarrail) - firm - in - (LOCATION San Mateo) - , - (LOCATION Calif.) - ...) - -Thus, the Named Entity mentions in this example are *Cohn*, *McGlashan & -Sarrail*, *San Mateo* and *Calif.*. - -The CoNLL2002 Dutch and Spanish data is treated similarly, although in -this case, the strings are also POS tagged. - - >>> from nltk.corpus import conll2002 - >>> for doc in conll2002.chunked_sents('ned.train')[27]: - ... print(doc) - ('Het', 'Art') - (ORG Hof/N van/Prep Cassatie/N) - ('verbrak', 'V') - ('het', 'Art') - ('arrest', 'N') - ('zodat', 'Conj') - ('het', 'Pron') - ('moest', 'V') - ('worden', 'V') - ('overgedaan', 'V') - ('door', 'Prep') - ('het', 'Art') - ('hof', 'N') - ('van', 'Prep') - ('beroep', 'N') - ('van', 'Prep') - (LOC Antwerpen/N) - ('.', 'Punc') - -Relation Extraction -~~~~~~~~~~~~~~~~~~~ - -Relation Extraction standardly consists of identifying specified -relations between Named Entities. For example, assuming that we can -recognize ORGANIZATIONs and LOCATIONs in text, we might want to also -recognize pairs *(o, l)* of these kinds of entities such that *o* is -located in *l*. - -The `sem.relextract` module provides some tools to help carry out a -simple version of this task. The `tree2semi_rel()` function splits a chunk -document into a list of two-member lists, each of which consists of a -(possibly empty) string followed by a `Tree` (i.e., a Named Entity): - - >>> from nltk.sem import relextract - >>> pairs = relextract.tree2semi_rel(tree) - >>> for s, tree in pairs[18:22]: - ... print('("...%s", %s)' % (" ".join(s[-5:]),tree)) - ("...about first-level questions,'' said Ms.", (PERSON Cohn)) - ("..., a partner in the", (ORGANIZATION McGlashan & Sarrail)) - ("...firm in", (LOCATION San Mateo)) - ("...,", (LOCATION Calif.)) - -The function `semi_rel2reldict()` processes triples of these pairs, i.e., -pairs of the form ``((string1, Tree1), (string2, Tree2), (string3, -Tree3))`` and outputs a dictionary (a `reldict`) in which ``Tree1`` is -the subject of the relation, ``string2`` is the filler -and ``Tree3`` is the object of the relation. ``string1`` and ``string3`` are -stored as left and right context respectively. - - >>> reldicts = relextract.semi_rel2reldict(pairs) - >>> for k, v in sorted(reldicts[0].items()): - ... print(k, '=>', v) - filler => of messages to their own ``Cyberia'' ... - lcon => transactions.'' Each week, they post - objclass => ORGANIZATION - objsym => white_house - objtext => White House - rcon => for access to its planned - subjclass => CARDINAL - subjsym => hundreds - subjtext => hundreds - untagged_filler => of messages to their own ``Cyberia'' ... - -The next example shows some of the values for two `reldict`\ s -corresponding to the ``'NYT_19980315'`` text extract shown earlier. - - >>> for r in reldicts[18:20]: - ... print('=' * 20) - ... print(r['subjtext']) - ... print(r['filler']) - ... 
print(r['objtext']) - ==================== - Cohn - , a partner in the - McGlashan & Sarrail - ==================== - McGlashan & Sarrail - firm in - San Mateo - -The function `relextract()` allows us to filter the `reldict`\ s -according to the classes of the subject and object named entities. In -addition, we can specify that the filler text has to match a given -regular expression, as illustrated in the next example. Here, we are -looking for pairs of entities in the IN relation, where IN has -signature . - - >>> import re - >>> IN = re.compile(r'.*\bin\b(?!\b.+ing\b)') - >>> for fileid in ieer.fileids(): - ... for doc in ieer.parsed_docs(fileid): - ... for rel in relextract.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern = IN): - ... print(relextract.rtuple(rel)) - [ORG: 'Christian Democrats'] ', the leading political forces in' [LOC: 'Italy'] - [ORG: 'AP'] ') _ Lebanese guerrillas attacked Israeli forces in southern' [LOC: 'Lebanon'] - [ORG: 'Security Council'] 'adopted Resolution 425. Huge yellow banners hung across intersections in' [LOC: 'Beirut'] - [ORG: 'U.N.'] 'failures in' [LOC: 'Africa'] - [ORG: 'U.N.'] 'peacekeeping operation in' [LOC: 'Somalia'] - [ORG: 'U.N.'] 'partners on a more effective role in' [LOC: 'Africa'] - [ORG: 'AP'] ') _ A bomb exploded in a mosque in central' [LOC: 'San`a'] - [ORG: 'Krasnoye Sormovo'] 'shipyard in the Soviet city of' [LOC: 'Gorky'] - [ORG: 'Kelab Golf Darul Ridzuan'] 'in' [LOC: 'Perak'] - [ORG: 'U.N.'] 'peacekeeping operation in' [LOC: 'Somalia'] - [ORG: 'WHYY'] 'in' [LOC: 'Philadelphia'] - [ORG: 'McGlashan & Sarrail'] 'firm in' [LOC: 'San Mateo'] - [ORG: 'Freedom Forum'] 'in' [LOC: 'Arlington'] - [ORG: 'Brookings Institution'] ', the research group in' [LOC: 'Washington'] - [ORG: 'Idealab'] ', a self-described business incubator based in' [LOC: 'Los Angeles'] - [ORG: 'Open Text'] ', based in' [LOC: 'Waterloo'] - ... - -The next example illustrates a case where the pattern is a disjunction -of roles that a PERSON can occupy in an ORGANIZATION. - - >>> roles = r""" - ... (.*( - ... analyst| - ... chair(wo)?man| - ... commissioner| - ... counsel| - ... director| - ... economist| - ... editor| - ... executive| - ... foreman| - ... governor| - ... head| - ... lawyer| - ... leader| - ... librarian).*)| - ... manager| - ... partner| - ... president| - ... producer| - ... professor| - ... researcher| - ... spokes(wo)?man| - ... writer| - ... ,\sof\sthe?\s* # "X, of (the) Y" - ... """ - >>> ROLES = re.compile(roles, re.VERBOSE) - >>> for fileid in ieer.fileids(): - ... for doc in ieer.parsed_docs(fileid): - ... for rel in relextract.extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES): - ... print(relextract.rtuple(rel)) - [PER: 'Kivutha Kibwana'] ', of the' [ORG: 'National Convention Assembly'] - [PER: 'Boban Boskovic'] ', chief executive of the' [ORG: 'Plastika'] - [PER: 'Annan'] ', the first sub-Saharan African to head the' [ORG: 'United Nations'] - [PER: 'Kiriyenko'] 'became a foreman at the' [ORG: 'Krasnoye Sormovo'] - [PER: 'Annan'] ', the first sub-Saharan African to head the' [ORG: 'United Nations'] - [PER: 'Mike Godwin'] ', chief counsel for the' [ORG: 'Electronic Frontier Foundation'] - ... - -In the case of the CoNLL2002 data, we can include POS tags in the -query pattern. This example also illustrates how the output can be -presented as something that looks more like a clause in a logical language. - - >>> de = """ - ... .* - ... ( - ... de/SP| - ... del/SP - ... ) - ... 
""" - >>> DE = re.compile(de, re.VERBOSE) - >>> rels = [rel for doc in conll2002.chunked_sents('esp.train') - ... for rel in relextract.extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern = DE)] - >>> for r in rels[:10]: - ... print(relextract.clause(r, relsym='DE')) - DE('tribunal_supremo', 'victoria') - DE('museo_de_arte', 'alcorc\xf3n') - DE('museo_de_bellas_artes', 'a_coru\xf1a') - DE('siria', 'l\xedbano') - DE('uni\xf3n_europea', 'pek\xedn') - DE('ej\xe9rcito', 'rogberi') - DE('juzgado_de_instrucci\xf3n_n\xfamero_1', 'san_sebasti\xe1n') - DE('psoe', 'villanueva_de_la_serena') - DE('ej\xe9rcito', 'l\xedbano') - DE('juzgado_de_lo_penal_n\xfamero_2', 'ceuta') - >>> vnv = """ - ... ( - ... is/V| - ... was/V| - ... werd/V| - ... wordt/V - ... ) - ... .* - ... van/Prep - ... """ - >>> VAN = re.compile(vnv, re.VERBOSE) - >>> for doc in conll2002.chunked_sents('ned.train'): - ... for r in relextract.extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN): - ... print(relextract.clause(r, relsym="VAN")) - VAN("cornet_d'elzius", 'buitenlandse_handel') - VAN('johan_rottiers', 'kardinaal_van_roey_instituut') - VAN('annie_lennox', 'eurythmics') diff --git a/pipeline/nltk/test/resolution.doctest b/pipeline/nltk/test/resolution.doctest deleted file mode 100644 index f1cf70090d5bed3b868f5e427f5c3f5d045073d4..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/resolution.doctest +++ /dev/null @@ -1,222 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -========================= -Resolution Theorem Prover -========================= - - >>> from nltk.inference.resolution import * - >>> from nltk.sem import logic - >>> from nltk.sem.logic import * - >>> logic._counter._value = 0 - >>> read_expr = logic.Expression.fromstring - - >>> P = read_expr('P') - >>> Q = read_expr('Q') - >>> R = read_expr('R') - >>> A = read_expr('A') - >>> B = read_expr('B') - >>> x = read_expr('x') - >>> y = read_expr('y') - >>> z = read_expr('z') - -------------------------------- -Test most_general_unification() -------------------------------- - >>> print(most_general_unification(x, x)) - {} - >>> print(most_general_unification(A, A)) - {} - >>> print(most_general_unification(A, x)) - {x: A} - >>> print(most_general_unification(x, A)) - {x: A} - >>> print(most_general_unification(x, y)) - {x: y} - >>> print(most_general_unification(P(x), P(A))) - {x: A} - >>> print(most_general_unification(P(x,B), P(A,y))) - {x: A, y: B} - >>> print(most_general_unification(P(x,B), P(B,x))) - {x: B} - >>> print(most_general_unification(P(x,y), P(A,x))) - {x: A, y: x} - >>> print(most_general_unification(P(Q(x)), P(y))) - {y: Q(x)} - ------------- -Test unify() ------------- - >>> print(Clause([]).unify(Clause([]))) - [] - >>> print(Clause([P(x)]).unify(Clause([-P(A)]))) - [{}] - >>> print(Clause([P(A), Q(x)]).unify(Clause([-P(x), R(x)]))) - [{R(A), Q(A)}] - >>> print(Clause([P(A), Q(x), R(x,y)]).unify(Clause([-P(x), Q(y)]))) - [{Q(y), Q(A), R(A,y)}] - >>> print(Clause([P(A), -Q(y)]).unify(Clause([-P(x), Q(B)]))) - [{}] - >>> print(Clause([P(x), Q(x)]).unify(Clause([-P(A), -Q(B)]))) - [{-Q(B), Q(A)}, {-P(A), P(B)}] - >>> print(Clause([P(x,x), Q(x), R(x)]).unify(Clause([-P(A,z), -Q(B)]))) - [{-Q(B), Q(A), R(A)}, {-P(A,z), R(B), P(B,B)}] - - >>> a = clausify(read_expr('P(A)')) - >>> b = clausify(read_expr('A=B')) - >>> print(a[0].unify(b[0])) - [{P(B)}] - -------------------------- -Test is_tautology() -------------------------- - >>> print(Clause([P(A), 
-P(A)]).is_tautology()) - True - >>> print(Clause([-P(A), P(A)]).is_tautology()) - True - >>> print(Clause([P(x), -P(A)]).is_tautology()) - False - >>> print(Clause([Q(B), -P(A), P(A)]).is_tautology()) - True - >>> print(Clause([-Q(A), P(R(A)), -P(R(A)), Q(x), -R(y)]).is_tautology()) - True - >>> print(Clause([P(x), -Q(A)]).is_tautology()) - False - -------------------------- -Test subsumes() -------------------------- - >>> print(Clause([P(A), Q(B)]).subsumes(Clause([P(A), Q(B)]))) - True - >>> print(Clause([-P(A)]).subsumes(Clause([P(A)]))) - False - >>> print(Clause([P(A), Q(B)]).subsumes(Clause([Q(B), P(A)]))) - True - >>> print(Clause([P(A), Q(B)]).subsumes(Clause([Q(B), R(A), P(A)]))) - True - >>> print(Clause([P(A), R(A), Q(B)]).subsumes(Clause([Q(B), P(A)]))) - False - >>> print(Clause([P(x)]).subsumes(Clause([P(A)]))) - True - >>> print(Clause([P(A)]).subsumes(Clause([P(x)]))) - True - ------------- -Test prove() ------------- - >>> print(ResolutionProverCommand(read_expr('man(x)')).prove()) - False - >>> print(ResolutionProverCommand(read_expr('(man(x) -> man(x))')).prove()) - True - >>> print(ResolutionProverCommand(read_expr('(man(x) -> --man(x))')).prove()) - True - >>> print(ResolutionProverCommand(read_expr('-(man(x) & -man(x))')).prove()) - True - >>> print(ResolutionProverCommand(read_expr('(man(x) | -man(x))')).prove()) - True - >>> print(ResolutionProverCommand(read_expr('(man(x) -> man(x))')).prove()) - True - >>> print(ResolutionProverCommand(read_expr('-(man(x) & -man(x))')).prove()) - True - >>> print(ResolutionProverCommand(read_expr('(man(x) | -man(x))')).prove()) - True - >>> print(ResolutionProverCommand(read_expr('(man(x) -> man(x))')).prove()) - True - >>> print(ResolutionProverCommand(read_expr('(man(x) <-> man(x))')).prove()) - True - >>> print(ResolutionProverCommand(read_expr('-(man(x) <-> -man(x))')).prove()) - True - >>> print(ResolutionProverCommand(read_expr('all x.man(x)')).prove()) - False - >>> print(ResolutionProverCommand(read_expr('-all x.some y.F(x,y) & some x.all y.(-F(x,y))')).prove()) - False - >>> print(ResolutionProverCommand(read_expr('some x.all y.sees(x,y)')).prove()) - False - - >>> p1 = read_expr('all x.(man(x) -> mortal(x))') - >>> p2 = read_expr('man(Socrates)') - >>> c = read_expr('mortal(Socrates)') - >>> ResolutionProverCommand(c, [p1,p2]).prove() - True - - >>> p1 = read_expr('all x.(man(x) -> walks(x))') - >>> p2 = read_expr('man(John)') - >>> c = read_expr('some y.walks(y)') - >>> ResolutionProverCommand(c, [p1,p2]).prove() - True - - >>> p = read_expr('some e1.some e2.(believe(e1,john,e2) & walk(e2,mary))') - >>> c = read_expr('some e0.walk(e0,mary)') - >>> ResolutionProverCommand(c, [p]).prove() - True - ------------- -Test proof() ------------- - >>> p1 = read_expr('all x.(man(x) -> mortal(x))') - >>> p2 = read_expr('man(Socrates)') - >>> c = read_expr('mortal(Socrates)') - >>> logic._counter._value = 0 - >>> tp = ResolutionProverCommand(c, [p1,p2]) - >>> tp.prove() - True - >>> print(tp.proof()) - [1] {-mortal(Socrates)} A - [2] {-man(z2), mortal(z2)} A - [3] {man(Socrates)} A - [4] {-man(Socrates)} (1, 2) - [5] {mortal(Socrates)} (2, 3) - [6] {} (1, 5) - - ------------------- -Question Answering ------------------- -One answer - - >>> p1 = read_expr('father_of(art,john)') - >>> p2 = read_expr('father_of(bob,kim)') - >>> p3 = read_expr('all x.all y.(father_of(x,y) -> parent_of(x,y))') - >>> c = read_expr('all x.(parent_of(x,john) -> ANSWER(x))') - >>> logic._counter._value = 0 - >>> tp = ResolutionProverCommand(None, 
[p1,p2,p3,c]) - >>> sorted(tp.find_answers()) - [] - >>> print(tp.proof()) # doctest: +SKIP - [1] {father_of(art,john)} A - [2] {father_of(bob,kim)} A - [3] {-father_of(z3,z4), parent_of(z3,z4)} A - [4] {-parent_of(z6,john), ANSWER(z6)} A - [5] {parent_of(art,john)} (1, 3) - [6] {parent_of(bob,kim)} (2, 3) - [7] {ANSWER(z6), -father_of(z6,john)} (3, 4) - [8] {ANSWER(art)} (1, 7) - [9] {ANSWER(art)} (4, 5) - - -Multiple answers - - >>> p1 = read_expr('father_of(art,john)') - >>> p2 = read_expr('mother_of(ann,john)') - >>> p3 = read_expr('all x.all y.(father_of(x,y) -> parent_of(x,y))') - >>> p4 = read_expr('all x.all y.(mother_of(x,y) -> parent_of(x,y))') - >>> c = read_expr('all x.(parent_of(x,john) -> ANSWER(x))') - >>> logic._counter._value = 0 - >>> tp = ResolutionProverCommand(None, [p1,p2,p3,p4,c]) - >>> sorted(tp.find_answers()) - [, ] - >>> print(tp.proof()) # doctest: +SKIP - [ 1] {father_of(art,john)} A - [ 2] {mother_of(ann,john)} A - [ 3] {-father_of(z3,z4), parent_of(z3,z4)} A - [ 4] {-mother_of(z7,z8), parent_of(z7,z8)} A - [ 5] {-parent_of(z10,john), ANSWER(z10)} A - [ 6] {parent_of(art,john)} (1, 3) - [ 7] {parent_of(ann,john)} (2, 4) - [ 8] {ANSWER(z10), -father_of(z10,john)} (3, 5) - [ 9] {ANSWER(art)} (1, 8) - [10] {ANSWER(z10), -mother_of(z10,john)} (4, 5) - [11] {ANSWER(ann)} (2, 10) - [12] {ANSWER(art)} (5, 6) - [13] {ANSWER(ann)} (5, 7) - diff --git a/pipeline/nltk/test/semantics.doctest b/pipeline/nltk/test/semantics.doctest deleted file mode 100644 index c142338892a2405ee3b13e774cf1873f56ce50c7..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/semantics.doctest +++ /dev/null @@ -1,667 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -========= -Semantics -========= - - >>> # Setup tests by setting the counter to 0 - >>> from nltk.sem import logic - >>> logic._counter._value = 0 - - >>> import nltk - >>> from nltk.sem import Valuation, Model - >>> v = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'), - ... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])), - ... ('dog', set(['d1'])), - ... ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')]))] - >>> val = Valuation(v) - >>> dom = val.domain - >>> m = Model(dom, val) - -Evaluation ----------- - -The top-level method of a ``Model`` instance is ``evaluate()``, which -assigns a semantic value to expressions of the ``logic`` module, under -an assignment ``g``: - - >>> dom = val.domain - >>> g = nltk.sem.Assignment(dom) - >>> m.evaluate('all x.(boy(x) -> - girl(x))', g) - True - - -``evaluate()`` calls a recursive function ``satisfy()``, which in turn -calls a function ``i()`` to interpret non-logical constants and -individual variables. ``i()`` delegates the interpretation of these to -the the model's ``Valuation`` and the variable assignment ``g`` -respectively. Any atomic expression which cannot be assigned a value -by ``i`` raises an ``Undefined`` exception; this is caught by -``evaluate``, which returns the string ``'Undefined'``. - - >>> m.evaluate('walk(adam)', g, trace=2) - - 'walk(adam)' is undefined under M, g - 'Undefined' - -Batch Processing ----------------- - -The utility functions ``interpret_sents()`` and ``evaluate_sents()`` are intended to -help with processing multiple sentences. Here's an example of the first of these: - - >>> sents = ['Mary walks'] - >>> results = nltk.sem.util.interpret_sents(sents, 'grammars/sample_grammars/sem2.fcfg') - >>> for result in results: - ... for (synrep, semrep) in result: - ... 
print(synrep) - (S[SEM=] - (NP[-LOC, NUM='sg', SEM=<\P.P(mary)>] - (PropN[-LOC, NUM='sg', SEM=<\P.P(mary)>] Mary)) - (VP[NUM='sg', SEM=<\x.walk(x)>] - (IV[NUM='sg', SEM=<\x.walk(x)>, TNS='pres'] walks))) - -In order to provide backwards compatibility with 'legacy' grammars where the semantics value -is specified with a lowercase -``sem`` feature, the relevant feature name can be passed to the function using the -``semkey`` parameter, as shown here: - - >>> sents = ['raining'] - >>> g = nltk.grammar.FeatureGrammar.fromstring(""" - ... % start S - ... S[sem=] -> 'raining' - ... """) - >>> results = nltk.sem.util.interpret_sents(sents, g, semkey='sem') - >>> for result in results: - ... for (synrep, semrep) in result: - ... print(semrep) - raining - -The function ``evaluate_sents()`` works in a similar manner, but also needs to be -passed a ``Model`` against which the semantic representations are evaluated. - -Unit Tests -========== - - -Unit tests for relations and valuations ---------------------------------------- - - >>> from nltk.sem import * - -Relations are sets of tuples, all of the same length. - - >>> s1 = set([('d1', 'd2'), ('d1', 'd1'), ('d2', 'd1')]) - >>> is_rel(s1) - True - >>> s2 = set([('d1', 'd2'), ('d1', 'd2'), ('d1',)]) - >>> is_rel(s2) - Traceback (most recent call last): - . . . - ValueError: Set set([('d1', 'd2'), ('d1',)]) contains sequences of different lengths - >>> s3 = set(['d1', 'd2']) - >>> is_rel(s3) - Traceback (most recent call last): - . . . - ValueError: Set set(['d2', 'd1']) contains sequences of different lengths - >>> s4 = set2rel(s3) - >>> is_rel(s4) - True - >>> is_rel(set()) - True - >>> null_binary_rel = set([(None, None)]) - >>> is_rel(null_binary_rel) - True - -Sets of entities are converted into sets of singleton tuples -(containing strings). - - >>> sorted(set2rel(s3)) - [('d1',), ('d2',)] - >>> sorted(set2rel(set([1,3,5,]))) - ['1', '3', '5'] - >>> set2rel(set()) == set() - True - >>> set2rel(set2rel(s3)) == set2rel(s3) - True - -Predication is evaluated by set membership. - - >>> ('d1', 'd2') in s1 - True - >>> ('d2', 'd2') in s1 - False - >>> ('d1',) in s1 - False - >>> 'd2' in s1 - False - >>> ('d1',) in s4 - True - >>> ('d1',) in set() - False - >>> 'd1' in null_binary_rel - False - - - >>> val = Valuation([('Fido', 'd1'), ('dog', set(['d1', 'd2'])), ('walk', set())]) - >>> sorted(val['dog']) - [('d1',), ('d2',)] - >>> val.domain == set(['d1', 'd2']) - True - >>> print(val.symbols) - ['Fido', 'dog', 'walk'] - - -Parse a valuation from a string. - - >>> v = """ - ... john => b1 - ... mary => g1 - ... suzie => g2 - ... fido => d1 - ... tess => d2 - ... noosa => n - ... girl => {g1, g2} - ... boy => {b1, b2} - ... dog => {d1, d2} - ... bark => {d1, d2} - ... walk => {b1, g2, d1} - ... chase => {(b1, g1), (b2, g1), (g1, d1), (g2, d2)} - ... see => {(b1, g1), (b2, d2), (g1, b1),(d2, b1), (g2, n)} - ... in => {(b1, n), (b2, n), (d2, n)} - ... with => {(b1, g1), (g1, b1), (d1, b1), (b1, d1)} - ... 
""" - >>> val = Valuation.fromstring(v) - - >>> print(val) # doctest: +SKIP - {'bark': set([('d1',), ('d2',)]), - 'boy': set([('b1',), ('b2',)]), - 'chase': set([('b1', 'g1'), ('g2', 'd2'), ('g1', 'd1'), ('b2', 'g1')]), - 'dog': set([('d1',), ('d2',)]), - 'fido': 'd1', - 'girl': set([('g2',), ('g1',)]), - 'in': set([('d2', 'n'), ('b1', 'n'), ('b2', 'n')]), - 'john': 'b1', - 'mary': 'g1', - 'noosa': 'n', - 'see': set([('b1', 'g1'), ('b2', 'd2'), ('d2', 'b1'), ('g2', 'n'), ('g1', 'b1')]), - 'suzie': 'g2', - 'tess': 'd2', - 'walk': set([('d1',), ('b1',), ('g2',)]), - 'with': set([('b1', 'g1'), ('d1', 'b1'), ('b1', 'd1'), ('g1', 'b1')])} - - -Unit tests for function argument application in a Model -------------------------------------------------------- - - >>> v = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'),\ - ... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])), ('dog', set(['d1'])), - ... ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')])), - ... ('kiss', null_binary_rel)] - >>> val = Valuation(v) - >>> dom = val.domain - >>> m = Model(dom, val) - >>> g = Assignment(dom) - >>> sorted(val['boy']) - [('b1',), ('b2',)] - >>> ('b1',) in val['boy'] - True - >>> ('g1',) in val['boy'] - False - >>> ('foo',) in val['boy'] - False - >>> ('b1', 'g1') in val['love'] - True - >>> ('b1', 'b1') in val['kiss'] - False - >>> sorted(val.domain) - ['b1', 'b2', 'd1', 'g1', 'g2'] - - -Model Tests -=========== - -Extension of Lambda expressions - - >>> v0 = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'),\ - ... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])), - ... ('dog', set(['d1'])), - ... ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')]))] - - >>> val0 = Valuation(v0) - >>> dom0 = val0.domain - >>> m0 = Model(dom0, val0) - >>> g0 = Assignment(dom0) - - >>> print(m0.evaluate(r'\x. \y. love(x, y)', g0) == {'g2': {'g2': False, 'b2': False, 'b1': True, 'g1': False, 'd1': False}, 'b2': {'g2': True, 'b2': False, 'b1': False, 'g1': False, 'd1': False}, 'b1': {'g2': False, 'b2': False, 'b1': False, 'g1': True, 'd1': False}, 'g1': {'g2': False, 'b2': False, 'b1': True, 'g1': False, 'd1': False}, 'd1': {'g2': False, 'b2': False, 'b1': False, 'g1': False, 'd1': False}}) - True - >>> print(m0.evaluate(r'\x. dog(x) (adam)', g0)) - False - >>> print(m0.evaluate(r'\x. (dog(x) | boy(x)) (adam)', g0)) - True - >>> print(m0.evaluate(r'\x. \y. love(x, y)(fido)', g0) == {'g2': False, 'b2': False, 'b1': False, 'g1': False, 'd1': False}) - True - >>> print(m0.evaluate(r'\x. \y. love(x, y)(adam)', g0) == {'g2': False, 'b2': False, 'b1': False, 'g1': True, 'd1': False}) - True - >>> print(m0.evaluate(r'\x. \y. love(x, y)(betty)', g0) == {'g2': False, 'b2': False, 'b1': True, 'g1': False, 'd1': False}) - True - >>> print(m0.evaluate(r'\x. \y. love(x, y)(betty)(adam)', g0)) - True - >>> print(m0.evaluate(r'\x. \y. love(x, y)(betty, adam)', g0)) - True - >>> print(m0.evaluate(r'\y. \x. love(x, y)(fido)(adam)', g0)) - False - >>> print(m0.evaluate(r'\y. \x. love(x, y)(betty, adam)', g0)) - True - >>> print(m0.evaluate(r'\x. exists y. love(x, y)', g0) == {'g2': True, 'b2': True, 'b1': True, 'g1': True, 'd1': False}) - True - >>> print(m0.evaluate(r'\z. adam', g0) == {'g2': 'b1', 'b2': 'b1', 'b1': 'b1', 'g1': 'b1', 'd1': 'b1'}) - True - >>> print(m0.evaluate(r'\z. love(x, y)', g0) == {'g2': False, 'b2': False, 'b1': False, 'g1': False, 'd1': False}) - True - - -Propositional Model Test ------------------------- - - >>> tests = [ - ... ('P & Q', True), - ... 
('P & R', False), - ... ('- P', False), - ... ('- R', True), - ... ('- - P', True), - ... ('- (P & R)', True), - ... ('P | R', True), - ... ('R | P', True), - ... ('R | R', False), - ... ('- P | R', False), - ... ('P | - P', True), - ... ('P -> Q', True), - ... ('P -> R', False), - ... ('R -> P', True), - ... ('P <-> P', True), - ... ('R <-> R', True), - ... ('P <-> R', False), - ... ] - >>> val1 = Valuation([('P', True), ('Q', True), ('R', False)]) - >>> dom = set([]) - >>> m = Model(dom, val1) - >>> g = Assignment(dom) - >>> for (sent, testvalue) in tests: - ... semvalue = m.evaluate(sent, g) - ... if semvalue == testvalue: - ... print('*', end=' ') - * * * * * * * * * * * * * * * * * - - -Test of i Function ------------------- - - >>> from nltk.sem import Expression - >>> v = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'), - ... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])), ('dog', set(['d1'])), - ... ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')]))] - >>> val = Valuation(v) - >>> dom = val.domain - >>> m = Model(dom, val) - >>> g = Assignment(dom, [('x', 'b1'), ('y', 'g2')]) - >>> exprs = ['adam', 'girl', 'love', 'walks', 'x', 'y', 'z'] - >>> parsed_exprs = [Expression.fromstring(e) for e in exprs] - >>> sorted_set = lambda x: sorted(x) if isinstance(x, set) else x - >>> for parsed in parsed_exprs: - ... try: - ... print("'%s' gets value %s" % (parsed, sorted_set(m.i(parsed, g)))) - ... except Undefined: - ... print("'%s' is Undefined" % parsed) - 'adam' gets value b1 - 'girl' gets value [('g1',), ('g2',)] - 'love' gets value [('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')] - 'walks' is Undefined - 'x' gets value b1 - 'y' gets value g2 - 'z' is Undefined - -Test for formulas in Model --------------------------- - - >>> tests = [ - ... ('love(adam, betty)', True), - ... ('love(adam, sue)', 'Undefined'), - ... ('dog(fido)', True), - ... ('- dog(fido)', False), - ... ('- - dog(fido)', True), - ... ('- dog(sue)', 'Undefined'), - ... ('dog(fido) & boy(adam)', True), - ... ('- (dog(fido) & boy(adam))', False), - ... ('- dog(fido) & boy(adam)', False), - ... ('dog(fido) | boy(adam)', True), - ... ('- (dog(fido) | boy(adam))', False), - ... ('- dog(fido) | boy(adam)', True), - ... ('- dog(fido) | - boy(adam)', False), - ... ('dog(fido) -> boy(adam)', True), - ... ('- (dog(fido) -> boy(adam))', False), - ... ('- dog(fido) -> boy(adam)', True), - ... ('exists x . love(adam, x)', True), - ... ('all x . love(adam, x)', False), - ... ('fido = fido', True), - ... ('exists x . all y. love(x, y)', False), - ... ('exists x . (x = fido)', True), - ... ('all x . (dog(x) | - dog(x))', True), - ... ('adam = mia', 'Undefined'), - ... ('\\x. (boy(x) | girl(x))', {'g2': True, 'b2': True, 'b1': True, 'g1': True, 'd1': False}), - ... ('\\x. exists y. (boy(x) & love(x, y))', {'g2': False, 'b2': True, 'b1': True, 'g1': False, 'd1': False}), - ... ('exists z1. boy(z1)', True), - ... ('exists x. (boy(x) & - (x = adam))', True), - ... ('exists x. (boy(x) & all y. love(y, x))', False), - ... ('all x. (boy(x) | girl(x))', False), - ... ('all x. (girl(x) -> exists y. boy(y) & love(x, y))', False), - ... ('exists x. (boy(x) & all y. (girl(y) -> love(y, x)))', True), - ... ('exists x. (boy(x) & all y. (girl(y) -> love(x, y)))', False), - ... ('all x. (dog(x) -> - girl(x))', True), - ... ('exists x. exists y. (love(x, y) & love(x, y))', True), - ... ] - >>> for (sent, testvalue) in tests: - ... semvalue = m.evaluate(sent, g) - ... if semvalue == testvalue: - ... 
print('*', end=' ') - ... else: - ... print(sent, semvalue) - * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * - - - -Satisfier Tests ---------------- - - >>> formulas = [ - ... 'boy(x)', - ... '(x = x)', - ... '(boy(x) | girl(x))', - ... '(boy(x) & girl(x))', - ... 'love(adam, x)', - ... 'love(x, adam)', - ... '- (x = adam)', - ... 'exists z22. love(x, z22)', - ... 'exists y. love(y, x)', - ... 'all y. (girl(y) -> love(x, y))', - ... 'all y. (girl(y) -> love(y, x))', - ... 'all y. (girl(y) -> (boy(x) & love(y, x)))', - ... 'boy(x) & all y. (girl(y) -> love(x, y))', - ... 'boy(x) & all y. (girl(y) -> love(y, x))', - ... 'boy(x) & exists y. (girl(y) & love(y, x))', - ... 'girl(x) -> dog(x)', - ... 'all y. (dog(y) -> (x = y))', - ... '- exists y. love(y, x)', - ... 'exists y. (love(adam, y) & love(y, x))' - ... ] - >>> g.purge() - >>> g.add('x', 'b1') - {'x': 'b1'} - >>> for f in formulas: - ... try: - ... print("'%s' gets value: %s" % (f, m.evaluate(f, g))) - ... except Undefined: - ... print("'%s' is Undefined" % f) - 'boy(x)' gets value: True - '(x = x)' gets value: True - '(boy(x) | girl(x))' gets value: True - '(boy(x) & girl(x))' gets value: False - 'love(adam, x)' gets value: False - 'love(x, adam)' gets value: False - '- (x = adam)' gets value: False - 'exists z22. love(x, z22)' gets value: True - 'exists y. love(y, x)' gets value: True - 'all y. (girl(y) -> love(x, y))' gets value: False - 'all y. (girl(y) -> love(y, x))' gets value: True - 'all y. (girl(y) -> (boy(x) & love(y, x)))' gets value: True - 'boy(x) & all y. (girl(y) -> love(x, y))' gets value: False - 'boy(x) & all y. (girl(y) -> love(y, x))' gets value: True - 'boy(x) & exists y. (girl(y) & love(y, x))' gets value: True - 'girl(x) -> dog(x)' gets value: True - 'all y. (dog(y) -> (x = y))' gets value: False - '- exists y. love(y, x)' gets value: False - 'exists y. (love(adam, y) & love(y, x))' gets value: True - - >>> from nltk.sem import Expression - >>> for fmla in formulas: - ... p = Expression.fromstring(fmla) - ... g.purge() - ... print("Satisfiers of '%s':\n\t%s" % (p, sorted(m.satisfiers(p, 'x', g)))) - Satisfiers of 'boy(x)': - ['b1', 'b2'] - Satisfiers of '(x = x)': - ['b1', 'b2', 'd1', 'g1', 'g2'] - Satisfiers of '(boy(x) | girl(x))': - ['b1', 'b2', 'g1', 'g2'] - Satisfiers of '(boy(x) & girl(x))': - [] - Satisfiers of 'love(adam,x)': - ['g1'] - Satisfiers of 'love(x,adam)': - ['g1', 'g2'] - Satisfiers of '-(x = adam)': - ['b2', 'd1', 'g1', 'g2'] - Satisfiers of 'exists z22.love(x,z22)': - ['b1', 'b2', 'g1', 'g2'] - Satisfiers of 'exists y.love(y,x)': - ['b1', 'g1', 'g2'] - Satisfiers of 'all y.(girl(y) -> love(x,y))': - [] - Satisfiers of 'all y.(girl(y) -> love(y,x))': - ['b1'] - Satisfiers of 'all y.(girl(y) -> (boy(x) & love(y,x)))': - ['b1'] - Satisfiers of '(boy(x) & all y.(girl(y) -> love(x,y)))': - [] - Satisfiers of '(boy(x) & all y.(girl(y) -> love(y,x)))': - ['b1'] - Satisfiers of '(boy(x) & exists y.(girl(y) & love(y,x)))': - ['b1'] - Satisfiers of '(girl(x) -> dog(x))': - ['b1', 'b2', 'd1'] - Satisfiers of 'all y.(dog(y) -> (x = y))': - ['d1'] - Satisfiers of '-exists y.love(y,x)': - ['b2', 'd1'] - Satisfiers of 'exists y.(love(adam,y) & love(y,x))': - ['b1'] - - -Tests based on the Blackburn & Bos testsuite --------------------------------------------- - - >>> v1 = [('jules', 'd1'), ('vincent', 'd2'), ('pumpkin', 'd3'), - ... ('honey_bunny', 'd4'), ('yolanda', 'd5'), - ... ('customer', set(['d1', 'd2'])), - ... ('robber', set(['d3', 'd4'])), - ... 
('love', set([('d3', 'd4')]))] - >>> val1 = Valuation(v1) - >>> dom1 = val1.domain - >>> m1 = Model(dom1, val1) - >>> g1 = Assignment(dom1) - - >>> v2 = [('jules', 'd1'), ('vincent', 'd2'), ('pumpkin', 'd3'), - ... ('honey_bunny', 'd4'), ('yolanda', 'd4'), - ... ('customer', set(['d1', 'd2', 'd5', 'd6'])), - ... ('robber', set(['d3', 'd4'])), - ... ('love', set([(None, None)]))] - >>> val2 = Valuation(v2) - >>> dom2 = set(['d1', 'd2', 'd3', 'd4', 'd5', 'd6']) - >>> m2 = Model(dom2, val2) - >>> g2 = Assignment(dom2) - >>> g21 = Assignment(dom2) - >>> g21.add('y', 'd3') - {'y': 'd3'} - - >>> v3 = [('mia', 'd1'), ('jody', 'd2'), ('jules', 'd3'), - ... ('vincent', 'd4'), - ... ('woman', set(['d1', 'd2'])), ('man', set(['d3', 'd4'])), - ... ('joke', set(['d5', 'd6'])), ('episode', set(['d7', 'd8'])), - ... ('in', set([('d5', 'd7'), ('d5', 'd8')])), - ... ('tell', set([('d1', 'd5'), ('d2', 'd6')]))] - >>> val3 = Valuation(v3) - >>> dom3 = set(['d1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8']) - >>> m3 = Model(dom3, val3) - >>> g3 = Assignment(dom3) - - >>> tests = [ - ... ('exists x. robber(x)', m1, g1, True), - ... ('exists x. exists y. love(y, x)', m1, g1, True), - ... ('exists x0. exists x1. love(x1, x0)', m2, g2, False), - ... ('all x. all y. love(y, x)', m2, g2, False), - ... ('- (all x. all y. love(y, x))', m2, g2, True), - ... ('all x. all y. - love(y, x)', m2, g2, True), - ... ('yolanda = honey_bunny', m2, g2, True), - ... ('mia = honey_bunny', m2, g2, 'Undefined'), - ... ('- (yolanda = honey_bunny)', m2, g2, False), - ... ('- (mia = honey_bunny)', m2, g2, 'Undefined'), - ... ('all x. (robber(x) | customer(x))', m2, g2, True), - ... ('- (all x. (robber(x) | customer(x)))', m2, g2, False), - ... ('(robber(x) | customer(x))', m2, g2, 'Undefined'), - ... ('(robber(y) | customer(y))', m2, g21, True), - ... ('exists x. (man(x) & exists x. woman(x))', m3, g3, True), - ... ('exists x. (man(x) & exists x. woman(x))', m3, g3, True), - ... ('- exists x. woman(x)', m3, g3, False), - ... ('exists x. (tasty(x) & burger(x))', m3, g3, 'Undefined'), - ... ('- exists x. (tasty(x) & burger(x))', m3, g3, 'Undefined'), - ... ('exists x. (man(x) & - exists y. woman(y))', m3, g3, False), - ... ('exists x. (man(x) & - exists x. woman(x))', m3, g3, False), - ... ('exists x. (woman(x) & - exists x. customer(x))', m2, g2, 'Undefined'), - ... ] - - >>> for item in tests: - ... sentence, model, g, testvalue = item - ... semvalue = model.evaluate(sentence, g) - ... if semvalue == testvalue: - ... print('*', end=' ') - ... g.purge() - * * * * * * * * * * * * * * * * * * * * * * - - -Tests for mapping from syntax to semantics ------------------------------------------- - -Load a valuation from a file. - - >>> import nltk.data - >>> from nltk.sem.util import parse_sents - >>> val = nltk.data.load('grammars/sample_grammars/valuation1.val') - >>> dom = val.domain - >>> m = Model(dom, val) - >>> g = Assignment(dom) - >>> gramfile = 'grammars/sample_grammars/sem2.fcfg' - >>> inputs = ['John sees a girl', 'every dog barks'] - >>> parses = parse_sents(inputs, gramfile) - >>> for sent, trees in zip(inputs, parses): - ... print() - ... print("Sentence: %s" % sent) - ... for tree in trees: - ... print("Parse:\n %s" %tree) - ... 
print("Semantics: %s" % root_semrep(tree)) - - Sentence: John sees a girl - Parse: - (S[SEM=] - (NP[-LOC, NUM='sg', SEM=<\P.P(john)>] - (PropN[-LOC, NUM='sg', SEM=<\P.P(john)>] John)) - (VP[NUM='sg', SEM=<\y.exists x.(girl(x) & see(y,x))>] - (TV[NUM='sg', SEM=<\X y.X(\x.see(y,x))>, TNS='pres'] sees) - (NP[NUM='sg', SEM=<\Q.exists x.(girl(x) & Q(x))>] - (Det[NUM='sg', SEM=<\P Q.exists x.(P(x) & Q(x))>] a) - (Nom[NUM='sg', SEM=<\x.girl(x)>] - (N[NUM='sg', SEM=<\x.girl(x)>] girl))))) - Semantics: exists x.(girl(x) & see(john,x)) - - Sentence: every dog barks - Parse: - (S[SEM= bark(x))>] - (NP[NUM='sg', SEM=<\Q.all x.(dog(x) -> Q(x))>] - (Det[NUM='sg', SEM=<\P Q.all x.(P(x) -> Q(x))>] every) - (Nom[NUM='sg', SEM=<\x.dog(x)>] - (N[NUM='sg', SEM=<\x.dog(x)>] dog))) - (VP[NUM='sg', SEM=<\x.bark(x)>] - (IV[NUM='sg', SEM=<\x.bark(x)>, TNS='pres'] barks))) - Semantics: all x.(dog(x) -> bark(x)) - - >>> sent = "every dog barks" - >>> result = nltk.sem.util.interpret_sents([sent], gramfile)[0] - >>> for (syntree, semrep) in result: - ... print(syntree) - ... print() - ... print(semrep) - (S[SEM= bark(x))>] - (NP[NUM='sg', SEM=<\Q.all x.(dog(x) -> Q(x))>] - (Det[NUM='sg', SEM=<\P Q.all x.(P(x) -> Q(x))>] every) - (Nom[NUM='sg', SEM=<\x.dog(x)>] - (N[NUM='sg', SEM=<\x.dog(x)>] dog))) - (VP[NUM='sg', SEM=<\x.bark(x)>] - (IV[NUM='sg', SEM=<\x.bark(x)>, TNS='pres'] barks))) - - all x.(dog(x) -> bark(x)) - - >>> result = nltk.sem.util.evaluate_sents([sent], gramfile, m, g)[0] - >>> for (syntree, semrel, value) in result: - ... print(syntree) - ... print() - ... print(semrep) - ... print() - ... print(value) - (S[SEM= bark(x))>] - (NP[NUM='sg', SEM=<\Q.all x.(dog(x) -> Q(x))>] - (Det[NUM='sg', SEM=<\P Q.all x.(P(x) -> Q(x))>] every) - (Nom[NUM='sg', SEM=<\x.dog(x)>] - (N[NUM='sg', SEM=<\x.dog(x)>] dog))) - (VP[NUM='sg', SEM=<\x.bark(x)>] - (IV[NUM='sg', SEM=<\x.bark(x)>, TNS='pres'] barks))) - - all x.(dog(x) -> bark(x)) - - True - - >>> sents = ['Mary walks', 'John sees a dog'] - >>> results = nltk.sem.util.interpret_sents(sents, 'grammars/sample_grammars/sem2.fcfg') - >>> for result in results: - ... for (synrep, semrep) in result: - ... print(synrep) - (S[SEM=] - (NP[-LOC, NUM='sg', SEM=<\P.P(mary)>] - (PropN[-LOC, NUM='sg', SEM=<\P.P(mary)>] Mary)) - (VP[NUM='sg', SEM=<\x.walk(x)>] - (IV[NUM='sg', SEM=<\x.walk(x)>, TNS='pres'] walks))) - (S[SEM=] - (NP[-LOC, NUM='sg', SEM=<\P.P(john)>] - (PropN[-LOC, NUM='sg', SEM=<\P.P(john)>] John)) - (VP[NUM='sg', SEM=<\y.exists x.(dog(x) & see(y,x))>] - (TV[NUM='sg', SEM=<\X y.X(\x.see(y,x))>, TNS='pres'] sees) - (NP[NUM='sg', SEM=<\Q.exists x.(dog(x) & Q(x))>] - (Det[NUM='sg', SEM=<\P Q.exists x.(P(x) & Q(x))>] a) - (Nom[NUM='sg', SEM=<\x.dog(x)>] - (N[NUM='sg', SEM=<\x.dog(x)>] dog))))) - -Cooper Storage --------------- - - >>> from nltk.sem import cooper_storage as cs - >>> sentence = 'every girl chases a dog' - >>> trees = cs.parse_with_bindops(sentence, grammar='grammars/book_grammars/storage.fcfg') - >>> semrep = trees[0].label()['SEM'] - >>> cs_semrep = cs.CooperStore(semrep) - >>> print(cs_semrep.core) - chase(z2,z4) - >>> for bo in cs_semrep.store: - ... 
print(bo) - bo(\P.all x.(girl(x) -> P(x)),z2) - bo(\P.exists x.(dog(x) & P(x)),z4) - >>> cs_semrep.s_retrieve(trace=True) - Permutation 1 - (\P.all x.(girl(x) -> P(x)))(\z2.chase(z2,z4)) - (\P.exists x.(dog(x) & P(x)))(\z4.all x.(girl(x) -> chase(x,z4))) - Permutation 2 - (\P.exists x.(dog(x) & P(x)))(\z4.chase(z2,z4)) - (\P.all x.(girl(x) -> P(x)))(\z2.exists x.(dog(x) & chase(z2,x))) - - >>> for reading in cs_semrep.readings: - ... print(reading) - exists x.(dog(x) & all z3.(girl(z3) -> chase(z3,x))) - all x.(girl(x) -> exists z4.(dog(z4) & chase(x,z4))) diff --git a/pipeline/nltk/test/sentiment.doctest b/pipeline/nltk/test/sentiment.doctest deleted file mode 100644 index d7899edb8dd95b5a69e9fa5ecfb47a1e8ff0a2ef..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/sentiment.doctest +++ /dev/null @@ -1,236 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -=================== -Sentiment Analysis -=================== - - >>> from nltk.classify import NaiveBayesClassifier - >>> from nltk.corpus import subjectivity - >>> from nltk.sentiment import SentimentAnalyzer - >>> from nltk.sentiment.util import * - - >>> n_instances = 100 - >>> subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]] - >>> obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]] - >>> len(subj_docs), len(obj_docs) - (100, 100) - -Each document is represented by a tuple (sentence, label). The sentence is tokenized, -so it is represented by a list of strings: - - >>> subj_docs[0] - (['smart', 'and', 'alert', ',', 'thirteen', 'conversations', 'about', 'one', - 'thing', 'is', 'a', 'small', 'gem', '.'], 'subj') - -We separately split subjective and objective instances to keep a balanced uniform -class distribution in both train and test sets. - - >>> train_subj_docs = subj_docs[:80] - >>> test_subj_docs = subj_docs[80:100] - >>> train_obj_docs = obj_docs[:80] - >>> test_obj_docs = obj_docs[80:100] - >>> training_docs = train_subj_docs+train_obj_docs - >>> testing_docs = test_subj_docs+test_obj_docs - - >>> sentim_analyzer = SentimentAnalyzer() - >>> all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs]) - -We use simple unigram word features, handling negation: - - >>> unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4) - >>> len(unigram_feats) - 83 - >>> sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats) - -We apply features to obtain a feature-value representation of our datasets: - - >>> training_set = sentim_analyzer.apply_features(training_docs) - >>> test_set = sentim_analyzer.apply_features(testing_docs) - -We can now train our classifier on the training set, and subsequently output the -evaluation results: - - >>> trainer = NaiveBayesClassifier.train - >>> classifier = sentim_analyzer.train(trainer, training_set) - Training classifier - >>> for key,value in sorted(sentim_analyzer.evaluate(test_set).items()): - ... print('{0}: {1}'.format(key, value)) - Evaluating NaiveBayesClassifier results... - Accuracy: 0.8 - F-measure [obj]: 0.8 - F-measure [subj]: 0.8 - Precision [obj]: 0.8 - Precision [subj]: 0.8 - Recall [obj]: 0.8 - Recall [subj]: 0.8 - - -Vader ------- - - >>> from nltk.sentiment.vader import SentimentIntensityAnalyzer - >>> sentences = ["VADER is smart, handsome, and funny.", # positive sentence example - ... 
"VADER is smart, handsome, and funny!", # punctuation emphasis handled correctly (sentiment intensity adjusted) - ... "VADER is very smart, handsome, and funny.", # booster words handled correctly (sentiment intensity adjusted) - ... "VADER is VERY SMART, handsome, and FUNNY.", # emphasis for ALLCAPS handled - ... "VADER is VERY SMART, handsome, and FUNNY!!!",# combination of signals - VADER appropriately adjusts intensity - ... "VADER is VERY SMART, really handsome, and INCREDIBLY FUNNY!!!",# booster words & punctuation make this close to ceiling for score - ... "The book was good.", # positive sentence - ... "The book was kind of good.", # qualified positive sentence is handled correctly (intensity adjusted) - ... "The plot was good, but the characters are uncompelling and the dialog is not great.", # mixed negation sentence - ... "A really bad, horrible book.", # negative sentence with booster words - ... "At least it isn't a horrible book.", # negated negative sentence with contraction - ... ":) and :D", # emoticons handled - ... "", # an empty string is correctly handled - ... "Today sux", # negative slang handled - ... "Today sux!", # negative slang with punctuation emphasis handled - ... "Today SUX!", # negative slang with capitalization emphasis - ... "Today kinda sux! But I'll get by, lol" # mixed sentiment example with slang and constrastive conjunction "but" - ... ] - >>> paragraph = "It was one of the worst movies I've seen, despite good reviews. \ - ... Unbelievably bad acting!! Poor direction. VERY poor production. \ - ... The movie was bad. Very bad movie. VERY bad movie. VERY BAD movie. VERY BAD movie!" - - >>> from nltk import tokenize - >>> lines_list = tokenize.sent_tokenize(paragraph) - >>> sentences.extend(lines_list) - - >>> tricky_sentences = [ - ... "Most automated sentiment analysis tools are shit.", - ... "VADER sentiment analysis is the shit.", - ... "Sentiment analysis has never been good.", - ... "Sentiment analysis with VADER has never been this good.", - ... "Warren Beatty has never been so entertaining.", - ... "I won't say that the movie is astounding and I wouldn't claim that \ - ... the movie is too banal either.", - ... "I like to hate Michael Bay films, but I couldn't fault this one", - ... "I like to hate Michael Bay films, BUT I couldn't help but fault this one", - ... "It's one thing to watch an Uwe Boll film, but another thing entirely \ - ... to pay for it", - ... "The movie was too good", - ... "This movie was actually neither that funny, nor super witty.", - ... "This movie doesn't care about cleverness, wit or any other kind of \ - ... intelligent humor.", - ... "Those who find ugly meanings in beautiful things are corrupt without \ - ... being charming.", - ... "There are slow and repetitive parts, BUT it has just enough spice to \ - ... keep it interesting.", - ... "The script is not fantastic, but the acting is decent and the cinematography \ - ... is EXCELLENT!", - ... "Roger Dodger is one of the most compelling variations on this theme.", - ... "Roger Dodger is one of the least compelling variations on this theme.", - ... "Roger Dodger is at least compelling as a variation on the theme.", - ... "they fall in love with the product", - ... "but then it breaks", - ... "usually around the time the 90 day warranty expires", - ... "the twin towers collapsed today", - ... "However, Mr. Carter solemnly argues, his client carried out the kidnapping \ - ... under orders and in the ''least offensive way possible.''" - ... 
] - >>> sentences.extend(tricky_sentences) - >>> for sentence in sentences: - ... sid = SentimentIntensityAnalyzer() - ... print(sentence) - ... ss = sid.polarity_scores(sentence) - ... for k in sorted(ss): - ... print('{0}: {1}, '.format(k, ss[k]), end='') - ... print() - VADER is smart, handsome, and funny. - compound: 0.8316, neg: 0.0, neu: 0.254, pos: 0.746, - VADER is smart, handsome, and funny! - compound: 0.8439, neg: 0.0, neu: 0.248, pos: 0.752, - VADER is very smart, handsome, and funny. - compound: 0.8545, neg: 0.0, neu: 0.299, pos: 0.701, - VADER is VERY SMART, handsome, and FUNNY. - compound: 0.9227, neg: 0.0, neu: 0.246, pos: 0.754, - VADER is VERY SMART, handsome, and FUNNY!!! - compound: 0.9342, neg: 0.0, neu: 0.233, pos: 0.767, - VADER is VERY SMART, really handsome, and INCREDIBLY FUNNY!!! - compound: 0.9469, neg: 0.0, neu: 0.294, pos: 0.706, - The book was good. - compound: 0.4404, neg: 0.0, neu: 0.508, pos: 0.492, - The book was kind of good. - compound: 0.3832, neg: 0.0, neu: 0.657, pos: 0.343, - The plot was good, but the characters are uncompelling and the dialog is not great. - compound: -0.7042, neg: 0.327, neu: 0.579, pos: 0.094, - A really bad, horrible book. - compound: -0.8211, neg: 0.791, neu: 0.209, pos: 0.0, - At least it isn't a horrible book. - compound: 0.431, neg: 0.0, neu: 0.637, pos: 0.363, - :) and :D - compound: 0.7925, neg: 0.0, neu: 0.124, pos: 0.876, - - compound: 0.0, neg: 0.0, neu: 0.0, pos: 0.0, - Today sux - compound: -0.3612, neg: 0.714, neu: 0.286, pos: 0.0, - Today sux! - compound: -0.4199, neg: 0.736, neu: 0.264, pos: 0.0, - Today SUX! - compound: -0.5461, neg: 0.779, neu: 0.221, pos: 0.0, - Today kinda sux! But I'll get by, lol - compound: 0.5249, neg: 0.138, neu: 0.517, pos: 0.344, - It was one of the worst movies I've seen, despite good reviews. - compound: -0.7584, neg: 0.394, neu: 0.606, pos: 0.0, - Unbelievably bad acting!! - compound: -0.6572, neg: 0.686, neu: 0.314, pos: 0.0, - Poor direction. - compound: -0.4767, neg: 0.756, neu: 0.244, pos: 0.0, - VERY poor production. - compound: -0.6281, neg: 0.674, neu: 0.326, pos: 0.0, - The movie was bad. - compound: -0.5423, neg: 0.538, neu: 0.462, pos: 0.0, - Very bad movie. - compound: -0.5849, neg: 0.655, neu: 0.345, pos: 0.0, - VERY bad movie. - compound: -0.6732, neg: 0.694, neu: 0.306, pos: 0.0, - VERY BAD movie. - compound: -0.7398, neg: 0.724, neu: 0.276, pos: 0.0, - VERY BAD movie! - compound: -0.7616, neg: 0.735, neu: 0.265, pos: 0.0, - Most automated sentiment analysis tools are shit. - compound: -0.5574, neg: 0.375, neu: 0.625, pos: 0.0, - VADER sentiment analysis is the shit. - compound: 0.6124, neg: 0.0, neu: 0.556, pos: 0.444, - Sentiment analysis has never been good. - compound: -0.3412, neg: 0.325, neu: 0.675, pos: 0.0, - Sentiment analysis with VADER has never been this good. - compound: 0.5228, neg: 0.0, neu: 0.703, pos: 0.297, - Warren Beatty has never been so entertaining. - compound: 0.5777, neg: 0.0, neu: 0.616, pos: 0.384, - I won't say that the movie is astounding and I wouldn't claim that the movie is too banal either. 
- compound: 0.4215, neg: 0.0, neu: 0.851, pos: 0.149, - I like to hate Michael Bay films, but I couldn't fault this one - compound: 0.3153, neg: 0.157, neu: 0.534, pos: 0.309, - I like to hate Michael Bay films, BUT I couldn't help but fault this one - compound: -0.1531, neg: 0.277, neu: 0.477, pos: 0.246, - It's one thing to watch an Uwe Boll film, but another thing entirely to pay for it - compound: -0.2541, neg: 0.112, neu: 0.888, pos: 0.0, - The movie was too good - compound: 0.4404, neg: 0.0, neu: 0.58, pos: 0.42, - This movie was actually neither that funny, nor super witty. - compound: -0.6759, neg: 0.41, neu: 0.59, pos: 0.0, - This movie doesn't care about cleverness, wit or any other kind of intelligent humor. - compound: -0.1338, neg: 0.265, neu: 0.497, pos: 0.239, - Those who find ugly meanings in beautiful things are corrupt without being charming. - compound: -0.3553, neg: 0.314, neu: 0.493, pos: 0.192, - There are slow and repetitive parts, BUT it has just enough spice to keep it interesting. - compound: 0.4678, neg: 0.079, neu: 0.735, pos: 0.186, - The script is not fantastic, but the acting is decent and the cinematography is EXCELLENT! - compound: 0.7565, neg: 0.092, neu: 0.607, pos: 0.301, - Roger Dodger is one of the most compelling variations on this theme. - compound: 0.2944, neg: 0.0, neu: 0.834, pos: 0.166, - Roger Dodger is one of the least compelling variations on this theme. - compound: -0.1695, neg: 0.132, neu: 0.868, pos: 0.0, - Roger Dodger is at least compelling as a variation on the theme. - compound: 0.2263, neg: 0.0, neu: 0.84, pos: 0.16, - they fall in love with the product - compound: 0.6369, neg: 0.0, neu: 0.588, pos: 0.412, - but then it breaks - compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, - usually around the time the 90 day warranty expires - compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, - the twin towers collapsed today - compound: -0.2732, neg: 0.344, neu: 0.656, pos: 0.0, - However, Mr. Carter solemnly argues, his client carried out the kidnapping under orders and in the ''least offensive way possible.'' - compound: -0.5859, neg: 0.23, neu: 0.697, pos: 0.074, diff --git a/pipeline/nltk/test/sentiwordnet.doctest b/pipeline/nltk/test/sentiwordnet.doctest deleted file mode 100644 index 8cab0d9590c71b37b1319e5f21b388168777c476..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/sentiwordnet.doctest +++ /dev/null @@ -1,41 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. 
For license information, see LICENSE.TXT - -====================== -SentiWordNet Interface -====================== - -SentiWordNet can be imported like this: - - >>> from nltk.corpus import sentiwordnet as swn - ------------- -SentiSynsets ------------- - - >>> breakdown = swn.senti_synset('breakdown.n.03') - >>> print(breakdown) - <breakdown.n.03: PosScore=0.0 NegScore=0.25> - >>> breakdown.pos_score() - 0.0 - >>> breakdown.neg_score() - 0.25 - >>> breakdown.obj_score() - 0.75 - - ------- -Lookup ------- - - >>> list(swn.senti_synsets('slow')) - [SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'), - SentiSynset('slow.v.03'), SentiSynset('slow.a.01'), - SentiSynset('slow.a.02'), SentiSynset('dense.s.04'), - SentiSynset('slow.a.04'), SentiSynset('boring.s.01'), - SentiSynset('dull.s.08'), SentiSynset('slowly.r.01'), - SentiSynset('behind.r.03')] - - >>> happy = swn.senti_synsets('happy', 'a') - - >>> all = swn.all_senti_synsets() diff --git a/pipeline/nltk/test/setup_fixt.py b/pipeline/nltk/test/setup_fixt.py deleted file mode 100644 index e7f3a27464b1875107354eb01e1fe9467c653539..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/setup_fixt.py +++ /dev/null @@ -1,26 +0,0 @@ -from nltk.internals import find_binary, find_jar - - -def check_binary(binary: str, **args): - """Skip a test via `pytest.skip` if the `binary` executable is not found. - Keyword arguments are passed to `nltk.internals.find_binary`.""" - import pytest - - try: - find_binary(binary, **args) - except LookupError: - pytest.skip(f"Skipping test because the {binary} binary was not found.") - - -def check_jar(name_pattern: str, **args): - """Skip a test via `pytest.skip` if the `name_pattern` jar is not found. - Keyword arguments are passed to `nltk.internals.find_jar`. - - TODO: Investigate why the CoreNLP tests that rely on this check_jar failed - on the CI. https://github.com/nltk/nltk/pull/3060#issuecomment-1268355108 - """ - import pytest - - pytest.skip( - "Skipping test because the doctests requiring jars are inconsistent on the CI." - ) diff --git a/pipeline/nltk/test/simple.doctest b/pipeline/nltk/test/simple.doctest deleted file mode 100644 index 5cff34f2b3aab1dcfed64ffa93a63e3ce3c40c35..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/simple.doctest +++ /dev/null @@ -1,83 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -================= -EasyInstall Tests -================= - -This file contains some simple tests that will be run by EasyInstall in -order to test the installation when NLTK-Data is absent. - - ------------- -Tokenization ------------- - - >>> from nltk.tokenize import wordpunct_tokenize - >>> s = ("Good muffins cost $3.88\nin New York. Please buy me\n" - ... "two of them.\n\nThanks.") - >>> wordpunct_tokenize(s) - ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.', - 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] - -------- -Metrics -------- - - >>> from nltk.metrics import precision, recall, f_measure - >>> reference = 'DET NN VB DET JJ NN NN IN DET NN'.split() - >>> test = 'DET VB VB DET NN NN NN IN DET NN'.split() - >>> reference_set = set(reference) - >>> test_set = set(test) - >>> precision(reference_set, test_set) - 1.0 - >>> print(recall(reference_set, test_set)) - 0.8 - >>> print(f_measure(reference_set, test_set)) - 0.88888888888...
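The set-based metrics just above are easy to check by hand: precision is the share of predicted tags that appear in the reference, recall is the share of reference tags that were predicted, and the F-measure is their harmonic mean. A minimal plain-Python sketch (not part of the deleted doctest) that reproduces the 1.0, 0.8 and 0.888... values:

    # Same tag sequences as in the doctest above.
    reference_set = set('DET NN VB DET JJ NN NN IN DET NN'.split())  # {'DET', 'NN', 'VB', 'JJ', 'IN'}
    test_set = set('DET VB VB DET NN NN NN IN DET NN'.split())       # {'DET', 'NN', 'VB', 'IN'}

    overlap = reference_set & test_set                          # 4 tags occur in both sets
    precision = len(overlap) / len(test_set)                    # 4 / 4 = 1.0
    recall = len(overlap) / len(reference_set)                  # 4 / 5 = 0.8
    f_measure = 2 * precision * recall / (precision + recall)   # = 0.888...
    print(precision, recall, f_measure)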
- ------------------- -Feature Structures ------------------- - - >>> from nltk import FeatStruct - >>> fs1 = FeatStruct(PER=3, NUM='pl', GND='fem') - >>> fs2 = FeatStruct(POS='N', AGR=fs1) - >>> print(fs2) - [ [ GND = 'fem' ] ] - [ AGR = [ NUM = 'pl' ] ] - [ [ PER = 3 ] ] - [ ] - [ POS = 'N' ] - >>> print(fs2['AGR']) - [ GND = 'fem' ] - [ NUM = 'pl' ] - [ PER = 3 ] - >>> print(fs2['AGR']['PER']) - 3 - -------- -Parsing -------- - - >>> from nltk.parse.recursivedescent import RecursiveDescentParser - >>> from nltk.grammar import CFG - >>> grammar = CFG.fromstring(""" - ... S -> NP VP - ... PP -> P NP - ... NP -> 'the' N | N PP | 'the' N PP - ... VP -> V NP | V PP | V NP PP - ... N -> 'cat' | 'dog' | 'rug' - ... V -> 'chased' - ... P -> 'on' - ... """) - >>> rd = RecursiveDescentParser(grammar) - >>> sent = 'the cat chased the dog on the rug'.split() - >>> for t in rd.parse(sent): - ... print(t) - (S - (NP the (N cat)) - (VP (V chased) (NP the (N dog) (PP (P on) (NP the (N rug)))))) - (S - (NP the (N cat)) - (VP (V chased) (NP the (N dog)) (PP (P on) (NP the (N rug))))) diff --git a/pipeline/nltk/test/stem.doctest b/pipeline/nltk/test/stem.doctest deleted file mode 100644 index c2c40a66d4202e13b46eb81424b7902637c7f942..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/stem.doctest +++ /dev/null @@ -1,105 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -========== - Stemmers -========== - -Overview -~~~~~~~~ - -Stemmers remove morphological affixes from words, leaving only the -word stem. - - >>> from nltk.stem import * - -Unit tests for the Porter stemmer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - >>> from nltk.stem.porter import * - -Create a new Porter stemmer. - - >>> stemmer = PorterStemmer() - -Test the stemmer on various pluralised words. - - >>> plurals = ['caresses', 'flies', 'dies', 'mules', 'denied', - ... 'died', 'agreed', 'owned', 'humbled', 'sized', - ... 'meeting', 'stating', 'siezing', 'itemization', - ... 'sensational', 'traditional', 'reference', 'colonizer', - ... 'plotted'] - - >>> singles = [stemmer.stem(plural) for plural in plurals] - - >>> print(' '.join(singles)) - caress fli die mule deni die agre own humbl size meet - state siez item sensat tradit refer colon plot - - -Unit tests for Snowball stemmer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - >>> from nltk.stem.snowball import SnowballStemmer - -See which languages are supported. - - >>> print(" ".join(SnowballStemmer.languages)) - arabic danish dutch english finnish french german hungarian italian - norwegian porter portuguese romanian russian spanish swedish - -Create a new instance of a language specific subclass. - - >>> stemmer = SnowballStemmer("english") - -Stem a word. - - >>> print(stemmer.stem("running")) - run - -Decide not to stem stopwords. - - >>> stemmer2 = SnowballStemmer("english", ignore_stopwords=True) - >>> print(stemmer.stem("having")) - have - >>> print(stemmer2.stem("having")) - having - -The 'english' stemmer is better than the original 'porter' stemmer. - - >>> print(SnowballStemmer("english").stem("generously")) - generous - >>> print(SnowballStemmer("porter").stem("generously")) - gener - -.. note:: - - Extra stemmer tests can be found in `nltk.test.unit.test_stem`. - -Unit tests for ARLSTem Stemmer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - >>> from nltk.stem.arlstem import ARLSTem - -Create a Stemmer instance. - - >>> stemmer = ARLSTem() - -Stem a word. 
- - >>> stemmer.stem('يعمل') - 'عمل' - -Unit tests for ARLSTem2 Stemmer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - >>> from nltk.stem.arlstem2 import ARLSTem2 - -Create a Stemmer instance. - - >>> stemmer = ARLSTem2() - -Stem a word. - - >>> stemmer.stem('يعمل') - 'عمل' diff --git a/pipeline/nltk/test/tag.doctest b/pipeline/nltk/test/tag.doctest deleted file mode 100644 index 505f622dfbc5eb0798ce775774df9c08a135c01f..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/tag.doctest +++ /dev/null @@ -1,475 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -Evaluation of Taggers -===================== - -Evaluating the standard NLTK PerceptronTagger using Accuracy, -Precision, Recall and F-measure for each of the tags. - - >>> from nltk.tag import PerceptronTagger - >>> from nltk.corpus import treebank - >>> tagger = PerceptronTagger() - >>> gold_data = treebank.tagged_sents()[10:20] - >>> print(tagger.accuracy(gold_data)) # doctest: +ELLIPSIS - 0.885931... - - >>> print(tagger.evaluate_per_tag(gold_data)) - Tag | Prec. | Recall | F-measure - -------+--------+--------+----------- - '' | 1.0000 | 1.0000 | 1.0000 - , | 1.0000 | 1.0000 | 1.0000 - -NONE- | 0.0000 | 0.0000 | 0.0000 - . | 1.0000 | 1.0000 | 1.0000 - : | 1.0000 | 1.0000 | 1.0000 - CC | 1.0000 | 1.0000 | 1.0000 - CD | 0.7647 | 1.0000 | 0.8667 - DT | 1.0000 | 1.0000 | 1.0000 - IN | 1.0000 | 1.0000 | 1.0000 - JJ | 0.5882 | 0.8333 | 0.6897 - JJR | 1.0000 | 1.0000 | 1.0000 - JJS | 1.0000 | 1.0000 | 1.0000 - NN | 0.7647 | 0.9630 | 0.8525 - NNP | 0.8929 | 1.0000 | 0.9434 - NNS | 1.0000 | 1.0000 | 1.0000 - POS | 1.0000 | 1.0000 | 1.0000 - PRP | 1.0000 | 1.0000 | 1.0000 - RB | 0.8000 | 1.0000 | 0.8889 - RBR | 0.0000 | 0.0000 | 0.0000 - TO | 1.0000 | 1.0000 | 1.0000 - VB | 1.0000 | 1.0000 | 1.0000 - VBD | 0.8571 | 0.9231 | 0.8889 - VBG | 1.0000 | 1.0000 | 1.0000 - VBN | 0.8333 | 0.5556 | 0.6667 - VBP | 0.5714 | 0.8000 | 0.6667 - VBZ | 1.0000 | 1.0000 | 1.0000 - WP | 1.0000 | 1.0000 | 1.0000 - `` | 1.0000 | 1.0000 | 1.0000 - - -List only the 10 most common tags: - - >>> print(tagger.evaluate_per_tag(gold_data, truncate=10, sort_by_count=True)) - Tag | Prec. | Recall | F-measure - -------+--------+--------+----------- - IN | 1.0000 | 1.0000 | 1.0000 - DT | 1.0000 | 1.0000 | 1.0000 - NN | 0.7647 | 0.9630 | 0.8525 - NNP | 0.8929 | 1.0000 | 0.9434 - NNS | 1.0000 | 1.0000 | 1.0000 - -NONE- | 0.0000 | 0.0000 | 0.0000 - CD | 0.7647 | 1.0000 | 0.8667 - VBD | 0.8571 | 0.9231 | 0.8889 - JJ | 0.5882 | 0.8333 | 0.6897 - , | 1.0000 | 1.0000 | 1.0000 - - -Similarly, we can display the confusion matrix for this tagger. - - >>> print(tagger.confusion(gold_data)) - | - | - | N | - | O | - | N J J N N P P R V V V V V | - | ' E C C D I J J J N N N O R R B T V B B B B B W ` | - | ' , - . : C D T N J R S N P S S P B R O B D G N P Z P ` | - -------+-------------------------------------------------------------------------------------+ - '' | <3> . . . . . . . . . . . . . . . . . . . . . . . . . . . | - , | .<11> . . . . . . . . . . . . . . . . . . . . . . . . . . | - -NONE- | . . <.> . . . 4 . . 4 . . 7 2 . . . 1 . . . . . . 3 . . . | - . | . . .<10> . . . . . . . . . . . . . . . . . . . . . . . . | - : | . . . . <1> . . . . . . . . . . . . . . . . . . . . . . . | - CC | . . . . . <5> . . . . . . . . . . . . . . . . . . . . . . | - CD | . . . . . .<13> . . . . . . . . . . . . . . . . . . . . . | - DT | . . . . . . .<28> . . . . . . . . . . . . . . . . . . . . | - IN | . . . . . . . .<34> . . . . . . . . . . . . . . 
. . . . . | - JJ | . . . . . . . . .<10> . . . 1 . . . . 1 . . . . . . . . . | - JJR | . . . . . . . . . . <1> . . . . . . . . . . . . . . . . . | - JJS | . . . . . . . . . . . <1> . . . . . . . . . . . . . . . . | - NN | . . . . . . . . . 1 . .<26> . . . . . . . . . . . . . . . | - NNP | . . . . . . . . . . . . .<25> . . . . . . . . . . . . . . | - NNS | . . . . . . . . . . . . . .<22> . . . . . . . . . . . . . | - POS | . . . . . . . . . . . . . . . <1> . . . . . . . . . . . . | - PRP | . . . . . . . . . . . . . . . . <3> . . . . . . . . . . . | - RB | . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . | - RBR | . . . . . . . . . . . . . . . . . . <.> . . . . . . . . . | - TO | . . . . . . . . . . . . . . . . . . . <2> . . . . . . . . | - VB | . . . . . . . . . . . . . . . . . . . . <1> . . . . . . . | - VBD | . . . . . . . . . . . . . . . . . . . . .<12> . 1 . . . . | - VBG | . . . . . . . . . . . . . . . . . . . . . . <3> . . . . . | - VBN | . . . . . . . . . 2 . . . . . . . . . . . 2 . <5> . . . . | - VBP | . . . . . . . . . . . . 1 . . . . . . . . . . . <4> . . . | - VBZ | . . . . . . . . . . . . . . . . . . . . . . . . . <2> . . | - WP | . . . . . . . . . . . . . . . . . . . . . . . . . . <3> . | - `` | . . . . . . . . . . . . . . . . . . . . . . . . . . . <3>| - -------+-------------------------------------------------------------------------------------+ - (row = reference; col = test) - - -Brill Trainer with evaluation -============================= - - >>> # Perform the relevant imports. - >>> from nltk.tbl.template import Template - >>> from nltk.tag.brill import Pos, Word - >>> from nltk.tag import untag, RegexpTagger, BrillTaggerTrainer, UnigramTagger - - >>> # Load some data - >>> from nltk.corpus import treebank - >>> training_data = treebank.tagged_sents()[:100] - >>> baseline_data = treebank.tagged_sents()[100:200] - >>> gold_data = treebank.tagged_sents()[200:300] - >>> testing_data = [untag(s) for s in gold_data] - - >>> backoff = RegexpTagger([ - ... (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers - ... (r'(The|the|A|a|An|an)$', 'AT'), # articles - ... (r'.*able$', 'JJ'), # adjectives - ... (r'.*ness$', 'NN'), # nouns formed from adjectives - ... (r'.*ly$', 'RB'), # adverbs - ... (r'.*s$', 'NNS'), # plural nouns - ... (r'.*ing$', 'VBG'), # gerunds - ... (r'.*ed$', 'VBD'), # past tense verbs - ... (r'.*', 'NN') # nouns (default) - ... ]) - -We've now created a simple ``RegexpTagger``, which tags according to the regular expression -rules it has been supplied. This tagger in and of itself does not have a great accuracy. - - >>> backoff.accuracy(gold_data) #doctest: +ELLIPSIS - 0.245014... - -Neither does a simple ``UnigramTagger``. This tagger is trained on some data, -and will then first try to match unigrams (i.e. tokens) of the sentence it has -to tag to the learned data. - - >>> unigram_tagger = UnigramTagger(baseline_data) - >>> unigram_tagger.accuracy(gold_data) #doctest: +ELLIPSIS - 0.581196... - -The lackluster accuracy here can be explained with the following example: - - >>> unigram_tagger.tag(["I", "would", "like", "this", "sentence", "to", "be", "tagged"]) - [('I', 'NNP'), ('would', 'MD'), ('like', None), ('this', 'DT'), ('sentence', None), - ('to', 'TO'), ('be', 'VB'), ('tagged', None)] - -As you can see, many tokens are tagged as ``None``, as these tokens are OOV (out of vocabulary). -The ``UnigramTagger`` has never seen them, and as a result they are not in its database of known terms. 
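To make that fallback behaviour concrete before the baseline below, here is a minimal sketch (not part of the deleted tag.doctest; it assumes the treebank corpus is installed, e.g. via nltk.download('treebank'), and uses DefaultTagger only as a stand-in for the RegexpTagger backoff used next): an OOV token is handed to the backoff instead of being left as None.

    from nltk.corpus import treebank
    from nltk.tag import DefaultTagger, UnigramTagger

    train = treebank.tagged_sents()[:100]
    fallback = DefaultTagger('NN')                   # crude last resort: call everything a noun
    tagger = UnigramTagger(train, backoff=fallback)

    # 'uncompelling' is almost certainly OOV for this small training slice,
    # so the unigram model defers to the backoff and returns 'NN', not None.
    print(tagger.tag(['this', 'film', 'is', 'uncompelling']))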
- -In practice, a ``UnigramTagger`` is exclusively used in conjunction with a *backoff*. Our real -baseline which will use such a backoff. We'll create a ``UnigramTagger`` like before, but now -the ``RegexpTagger`` will be used as a backoff for the situations where the ``UnigramTagger`` -encounters an OOV token. - - >>> baseline = UnigramTagger(baseline_data, backoff=backoff) - >>> baseline.accuracy(gold_data) #doctest: +ELLIPSIS - 0.7537647... - -That is already much better. We can investigate the performance further by running -``evaluate_per_tag``. This method will output the *Precision*, *Recall* and *F-measure* -of each tag. - - >>> print(baseline.evaluate_per_tag(gold_data, sort_by_count=True)) - Tag | Prec. | Recall | F-measure - -------+--------+--------+----------- - NNP | 0.9674 | 0.2738 | 0.4269 - NN | 0.4111 | 0.9136 | 0.5670 - IN | 0.9383 | 0.9580 | 0.9480 - DT | 0.9819 | 0.8859 | 0.9314 - JJ | 0.8167 | 0.2970 | 0.4356 - NNS | 0.7393 | 0.9630 | 0.8365 - -NONE- | 1.0000 | 0.8345 | 0.9098 - , | 1.0000 | 1.0000 | 1.0000 - . | 1.0000 | 1.0000 | 1.0000 - VBD | 0.6429 | 0.8804 | 0.7431 - CD | 1.0000 | 0.9872 | 0.9935 - CC | 1.0000 | 0.9355 | 0.9667 - VB | 0.7778 | 0.3684 | 0.5000 - VBN | 0.9375 | 0.3000 | 0.4545 - RB | 0.7778 | 0.7447 | 0.7609 - TO | 1.0000 | 1.0000 | 1.0000 - VBZ | 0.9643 | 0.6429 | 0.7714 - VBG | 0.6415 | 0.9444 | 0.7640 - PRP$ | 1.0000 | 1.0000 | 1.0000 - PRP | 1.0000 | 0.5556 | 0.7143 - MD | 1.0000 | 1.0000 | 1.0000 - VBP | 0.6471 | 0.5789 | 0.6111 - POS | 1.0000 | 1.0000 | 1.0000 - $ | 1.0000 | 0.8182 | 0.9000 - '' | 1.0000 | 1.0000 | 1.0000 - : | 1.0000 | 1.0000 | 1.0000 - WDT | 0.4000 | 0.2000 | 0.2667 - `` | 1.0000 | 1.0000 | 1.0000 - JJR | 1.0000 | 0.5000 | 0.6667 - NNPS | 0.0000 | 0.0000 | 0.0000 - RBR | 1.0000 | 1.0000 | 1.0000 - -LRB- | 0.0000 | 0.0000 | 0.0000 - -RRB- | 0.0000 | 0.0000 | 0.0000 - RP | 0.6667 | 0.6667 | 0.6667 - EX | 0.5000 | 0.5000 | 0.5000 - JJS | 0.0000 | 0.0000 | 0.0000 - WP | 1.0000 | 1.0000 | 1.0000 - PDT | 0.0000 | 0.0000 | 0.0000 - AT | 0.0000 | 0.0000 | 0.0000 - - -It's clear that although the precision of tagging `"NNP"` is high, the recall is very low. -With other words, we're missing a lot of cases where the true label is `"NNP"`. We can see -a similar effect with `"JJ"`. - -We can also see a very expected result: The precision of `"NN"` is low, while the recall -is high. If a term is OOV (i.e. ``UnigramTagger`` defers it to ``RegexpTagger``) and -``RegexpTagger`` doesn't have a good rule for it, then it will be tagged as `"NN"`. So, -we catch almost all tokens that are truly labeled as `"NN"`, but we also tag as `"NN"` -for many tokens that shouldn't be `"NN"`. - -This method gives us some insight in what parts of the tagger needs more attention, and why. -However, it doesn't tell us what the terms with true label `"NNP"` or `"JJ"` are actually -tagged as. -To help that, we can create a confusion matrix. - - >>> print(baseline.confusion(gold_data)) - | - | - | - N - | - | L O R N P | - | R N R J J N N N P P P R R V V V V V W | - | ' B E B A C C D E I J J J M N N P N D O R P R B R T V B B B B B D W ` | - | $ ' , - - - . : T C D T X N J R S D N P S S T S P $ B R P O B D G N P Z T P ` | - -------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+ - $ | <9> . . . . . . . . . . . . . . . . . 2 . . . . . . . . . . . . . . . . . . . . | - '' | . <10> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
| - , | . .<115> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | - -LRB- | . . . <.> . . . . . . . . . . . . . . 3 . . . . . . . . . . . . . . . . . . . . | - -NONE- | . . . .<121> . . . . . . . . . . . . . 24 . . . . . . . . . . . . . . . . . . . . | - -RRB- | . . . . . <.> . . . . . . . . . . . . 3 . . . . . . . . . . . . . . . . . . . . | - . | . . . . . .<100> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | - : | . . . . . . . <10> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | - AT | . . . . . . . . <.> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | - CC | . . . . . . . . . <58> . . . . . . . . 4 . . . . . . . . . . . . . . . . . . . . | - CD | . . . . . . . . . . <77> . . . . . . . 1 . . . . . . . . . . . . . . . . . . . . | - DT | . . . . . . . . 1 . .<163> . 4 . . . . 13 . . . . . . . . . . . . . . . . . 3 . . | - EX | . . . . . . . . . . . . <1> . . . . . 1 . . . . . . . . . . . . . . . . . . . . | - IN | . . . . . . . . . . . . .<228> . . . . 8 . . . . . . . . . . . . . 2 . . . . . . | - JJ | . . . . . . . . . . . . . . <49> . . . 86 2 . 4 . . . . 6 . . . . 12 3 . 3 . . . . | - JJR | . . . . . . . . . . . . . . . <3> . . 3 . . . . . . . . . . . . . . . . . . . . | - JJS | . . . . . . . . . . . . . . . . <.> . 2 . . . . . . . . . . . . . . . . . . . . | - MD | . . . . . . . . . . . . . . . . . <19> . . . . . . . . . . . . . . . . . . . . . | - NN | . . . . . . . . . . . . . . 9 . . .<296> . . 5 . . . . . . . . 5 . 9 . . . . . . | - NNP | . . . . . . . . . . . 2 . . . . . . 199 <89> . 26 . . . . 2 . . . . 2 5 . . . . . . | - NNPS | . . . . . . . . . . . . . . . . . . . 1 <.> 3 . . . . . . . . . . . . . . . . . | - NNS | . . . . . . . . . . . . . . . . . . 5 . .<156> . . . . . . . . . . . . . 1 . . . | - PDT | . . . . . . . . . . . 1 . . . . . . . . . . <.> . . . . . . . . . . . . . . . . | - POS | . . . . . . . . . . . . . . . . . . . . . . . <14> . . . . . . . . . . . . . . . | - PRP | . . . . . . . . . . . . . . . . . . 10 . . 2 . . <15> . . . . . . . . . . . . . . | - PRP$ | . . . . . . . . . . . . . . . . . . . . . . . . . <28> . . . . . . . . . . . . . | - RB | . . . . . . . . . . . . 1 4 . . . . 6 . . . . . . . <35> . 1 . . . . . . . . . . | - RBR | . . . . . . . . . . . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . . | - RP | . . . . . . . . . . . . . . . . . . . . . . . . . . 1 . <2> . . . . . . . . . . | - TO | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <47> . . . . . . . . . | - VB | . . . . . . . . . . . . . . 2 . . . 30 . . . . . . . 1 . . . <21> . . . 3 . . . . | - VBD | . . . . . . . . . . . . . . . . . . 10 . . . . . . . . . . . . <81> . 1 . . . . . | - VBG | . . . . . . . . . . . . . . . . . . 2 . . . . . . . . . . . . . <34> . . . . . . | - VBN | . . . . . . . . . . . . . . . . . . 4 . . . . . . . . . . . . 31 . <15> . . . . . | - VBP | . . . . . . . . . . . . . . . . . . 7 . . . . . . . . . . . 1 . . . <11> . . . . | - VBZ | . . . . . . . . . . . . . . . . . . . . . 15 . . . . . . . . . . . . . <27> . . . | - WDT | . . . . . . . . . . . . . 7 . . . . 1 . . . . . . . . . . . . . . . . . <2> . . | - WP | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <2> . | - `` | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
<10>| - -------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+ - (row = reference; col = test) - - -Once again we can see that `"NN"` is the default if the tagger isn't sure. Beyond that, -we can see why the recall for `"NNP"` is so low: these tokens are often tagged as `"NN"`. -This effect can also be seen for `"JJ"`, where the majority of tokens that ought to be -tagged as `"JJ"` are actually tagged as `"NN"` by our tagger. - -This tagger will only serve as a baseline for the ``BrillTaggerTrainer``, which uses -templates to attempt to improve the performance of the tagger. - - >>> # Set up templates - >>> Template._cleartemplates() #clear any templates created in earlier tests - >>> templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))] - - >>> # Construct a BrillTaggerTrainer - >>> tt = BrillTaggerTrainer(baseline, templates, trace=3) - >>> tagger1 = tt.train(training_data, max_rules=10) - TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None) - Finding initial useful rules... - Found 618 useful rules. - - B | - S F r O | Score = Fixed - Broken - c i o t | R Fixed = num tags changed incorrect -> correct - o x k h | u Broken = num tags changed correct -> incorrect - r e e e | l Other = num tags changed incorrect -> incorrect - e d n r | e - ------------------+------------------------------------------------------- - 13 14 1 4 | NN->VB if Pos:TO@[-1] - 8 8 0 0 | NN->VB if Pos:MD@[-1] - 7 10 3 22 | NN->IN if Pos:NNS@[-1] - 5 5 0 0 | NN->VBP if Pos:PRP@[-1] - 5 5 0 0 | VBD->VBN if Pos:VBZ@[-1] - 5 5 0 0 | NNS->NN if Pos:IN@[-1] & Word:asbestos@[0] - 4 4 0 0 | NN->-NONE- if Pos:WP@[-1] - 4 4 0 3 | NN->NNP if Pos:-NONE-@[-1] - 4 6 2 2 | NN->NNP if Pos:NNP@[-1] - 4 4 0 0 | NNS->VBZ if Pos:PRP@[-1] - - >>> tagger1.rules()[1:3] - (Rule('000', 'NN', 'VB', [(Pos([-1]),'MD')]), Rule('000', 'NN', 'IN', [(Pos([-1]),'NNS')])) - - >>> tagger1.print_template_statistics(printunused=False) - TEMPLATE STATISTICS (TRAIN) 2 templates, 10 rules) - TRAIN ( 2417 tokens) initial 555 0.7704 final: 496 0.7948 - #ID | Score (train) | #Rules | Template - -------------------------------------------- - 000 | 54 0.915 | 9 0.900 | Template(Pos([-1])) - 001 | 5 0.085 | 1 0.100 | Template(Pos([-1]),Word([0])) - - - - >>> tagger1.accuracy(gold_data) # doctest: +ELLIPSIS - 0.769230... - - >>> print(tagger1.evaluate_per_tag(gold_data, sort_by_count=True)) - Tag | Prec. | Recall | F-measure - -------+--------+--------+----------- - NNP | 0.8298 | 0.3600 | 0.5021 - NN | 0.4435 | 0.8364 | 0.5797 - IN | 0.8476 | 0.9580 | 0.8994 - DT | 0.9819 | 0.8859 | 0.9314 - JJ | 0.8167 | 0.2970 | 0.4356 - NNS | 0.7464 | 0.9630 | 0.8410 - -NONE- | 1.0000 | 0.8414 | 0.9139 - , | 1.0000 | 1.0000 | 1.0000 - . 
| 1.0000 | 1.0000 | 1.0000 - VBD | 0.6723 | 0.8696 | 0.7583 - CD | 1.0000 | 0.9872 | 0.9935 - CC | 1.0000 | 0.9355 | 0.9667 - VB | 0.8103 | 0.8246 | 0.8174 - VBN | 0.9130 | 0.4200 | 0.5753 - RB | 0.7778 | 0.7447 | 0.7609 - TO | 1.0000 | 1.0000 | 1.0000 - VBZ | 0.9667 | 0.6905 | 0.8056 - VBG | 0.6415 | 0.9444 | 0.7640 - PRP$ | 1.0000 | 1.0000 | 1.0000 - PRP | 1.0000 | 0.5556 | 0.7143 - MD | 1.0000 | 1.0000 | 1.0000 - VBP | 0.6316 | 0.6316 | 0.6316 - POS | 1.0000 | 1.0000 | 1.0000 - $ | 1.0000 | 0.8182 | 0.9000 - '' | 1.0000 | 1.0000 | 1.0000 - : | 1.0000 | 1.0000 | 1.0000 - WDT | 0.4000 | 0.2000 | 0.2667 - `` | 1.0000 | 1.0000 | 1.0000 - JJR | 1.0000 | 0.5000 | 0.6667 - NNPS | 0.0000 | 0.0000 | 0.0000 - RBR | 1.0000 | 1.0000 | 1.0000 - -LRB- | 0.0000 | 0.0000 | 0.0000 - -RRB- | 0.0000 | 0.0000 | 0.0000 - RP | 0.6667 | 0.6667 | 0.6667 - EX | 0.5000 | 0.5000 | 0.5000 - JJS | 0.0000 | 0.0000 | 0.0000 - WP | 1.0000 | 1.0000 | 1.0000 - PDT | 0.0000 | 0.0000 | 0.0000 - AT | 0.0000 | 0.0000 | 0.0000 - - - >>> print(tagger1.confusion(gold_data)) - | - | - | - N - | - | L O R N P | - | R N R J J N N N P P P R R V V V V V W | - | ' B E B A C C D E I J J J M N N P N D O R P R B R T V B B B B B D W ` | - | $ ' , - - - . : T C D T X N J R S D N P S S T S P $ B R P O B D G N P Z T P ` | - -------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+ - $ | <9> . . . . . . . . . . . . . . . . . 1 . . . . . . . . . . . 1 . . . . . . . . | - '' | . <10> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | - , | . .<115> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | - -LRB- | . . . <.> . . . . . . . . . 1 . . . . 2 . . . . . . . . . . . . . . . . . . . . | - -NONE- | . . . .<122> . . . . . . . . 1 . . . . 22 . . . . . . . . . . . . . . . . . . . . | - -RRB- | . . . . . <.> . . . . . . . . . . . . 2 1 . . . . . . . . . . . . . . . . . . . | - . | . . . . . .<100> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | - : | . . . . . . . <10> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | - AT | . . . . . . . . <.> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . | - CC | . . . . . . . . . <58> . . . . . . . . 2 1 . . . . . . . . . . . . . . 1 . . . . | - CD | . . . . . . . . . . <77> . . . . . . . 1 . . . . . . . . . . . . . . . . . . . . | - DT | . . . . . . . . 1 . .<163> . 5 . . . . 12 . . . . . . . . . . . . . . . . . 3 . . | - EX | . . . . . . . . . . . . <1> . . . . . 1 . . . . . . . . . . . . . . . . . . . . | - IN | . . . . . . . . . . . . .<228> . . . . 8 . . . . . . . . . . . . . 2 . . . . . . | - JJ | . . . . . . . . . . . . . 4 <49> . . . 79 4 . 4 . . . . 6 . . . 1 12 3 . 3 . . . . | - JJR | . . . . . . . . . . . . . 2 . <3> . . 1 . . . . . . . . . . . . . . . . . . . . | - JJS | . . . . . . . . . . . . . . . . <.> . 2 . . . . . . . . . . . . . . . . . . . . | - MD | . . . . . . . . . . . . . . . . . <19> . . . . . . . . . . . . . . . . . . . . . | - NN | . . . . . . . . . . . . . 7 9 . . .<271> 16 . 5 . . . . . . . . 7 . 9 . . . . . . | - NNP | . . . . . . . . . . . 2 . 7 . . . . 163<117> . 26 . . . . 2 . . . 1 2 5 . . . . . . | - NNPS | . . . . . . . . . . . . . . . . . . . 1 <.> 3 . . . . . . . . . . . . . . . . . | - NNS | . . . . . . . . . . . . . . . . . . 5 . .<156> . . . . . . . . . . . . . 1 . . . | - PDT | . . . . . . . . . . . 1 . . . . . . . . . . <.> . . . . . . . . . . . . . . . . | - POS | . 
. . . . . . . . . . . . . . . . . . . . . . <14> . . . . . . . . . . . . . . . | - PRP | . . . . . . . . . . . . . . . . . . 10 . . 2 . . <15> . . . . . . . . . . . . . . | - PRP$ | . . . . . . . . . . . . . . . . . . . . . . . . . <28> . . . . . . . . . . . . . | - RB | . . . . . . . . . . . . 1 4 . . . . 6 . . . . . . . <35> . 1 . . . . . . . . . . | - RBR | . . . . . . . . . . . . . . . . . . . . . . . . . . . <4> . . . . . . . . . . . | - RP | . . . . . . . . . . . . . . . . . . . . . . . . . . 1 . <2> . . . . . . . . . . | - TO | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <47> . . . . . . . . . | - VB | . . . . . . . . . . . . . . 2 . . . 4 . . . . . . . 1 . . . <47> . . . 3 . . . . | - VBD | . . . . . . . . . . . . . 1 . . . . 8 1 . . . . . . . . . . . <80> . 2 . . . . . | - VBG | . . . . . . . . . . . . . . . . . . 2 . . . . . . . . . . . . . <34> . . . . . . | - VBN | . . . . . . . . . . . . . . . . . . 4 . . . . . . . . . . . . 25 . <21> . . . . . | - VBP | . . . . . . . . . . . . . 2 . . . . 4 . . . . . . . . . . . 1 . . . <12> . . . . | - VBZ | . . . . . . . . . . . . . . . . . . . . . 13 . . . . . . . . . . . . . <29> . . . | - WDT | . . . . . . . . . . . . . 7 . . . . 1 . . . . . . . . . . . . . . . . . <2> . . | - WP | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <2> . | - `` | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . <10>| - -------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+ - (row = reference; col = test) - - - >>> tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data) - >>> tagged[33][12:] - [('foreign', 'NN'), ('debt', 'NN'), ('of', 'IN'), ('$', '$'), ('64', 'CD'), - ('billion', 'CD'), ('*U*', '-NONE-'), ('--', ':'), ('the', 'DT'), ('third-highest', 'NN'), - ('in', 'IN'), ('the', 'DT'), ('developing', 'VBG'), ('world', 'NN'), ('.', '.')] - -Regression Tests -~~~~~~~~~~~~~~~~ - -Sequential Taggers ------------------- - -Add tests for: - - make sure backoff is being done correctly. - - make sure ngram taggers don't use previous sentences for context. - - make sure ngram taggers see 'beginning of the sentence' as a - unique context - - make sure regexp tagger's regexps are tried in order - - train on some simple examples, & make sure that the size & the - generated models are correct. - - make sure cutoff works as intended - - make sure that ngram models only exclude contexts covered by the - backoff tagger if the backoff tagger gets that context correct at - *all* locations. - - -Regression Testing for issue #1025 -================================== - -We want to ensure that a RegexpTagger can be created with more than 100 patterns -and does not fail with: "AssertionError: sorry, but this version only supports 100 named groups" - - >>> from nltk.tag import RegexpTagger - >>> patterns = [(str(i), 'NNP',) for i in range(200)] - >>> tagger = RegexpTagger(patterns) - -Regression Testing for issue #2483 -================================== - -Ensure that tagging with pos_tag (PerceptronTagger) does not throw an IndexError -when attempting tagging an empty string. What it must return instead is not -strictly defined. - - >>> from nltk.tag import pos_tag - >>> pos_tag(['', 'is', 'a', 'beautiful', 'day']) - [...] 
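One of the TODO items above ("make sure regexp tagger's regexps are tried in order") is worth spelling out, since the ordering is what makes the catch-all backoff pattern safe. A short sketch (using the same public RegexpTagger API as the baseline earlier; not taken from the deleted file): each token gets the tag of the first pattern that matches, so specific rules must precede r'.*'.

    from nltk.tag import RegexpTagger

    tagger = RegexpTagger([
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),   # cardinal numbers, tried first
        (r'.*ing$', 'VBG'),                 # gerunds
        (r'.*s$', 'NNS'),                   # plural nouns
        (r'.*', 'NN'),                      # catch-all default, tried last
    ])
    print(tagger.tag('42 barking dogs'.split()))
    # [('42', 'CD'), ('barking', 'VBG'), ('dogs', 'NNS')]
    # Moving r'.*' to the front would tag every token as 'NN'.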
diff --git a/pipeline/nltk/test/tokenize.doctest b/pipeline/nltk/test/tokenize.doctest deleted file mode 100644 index c3f40c8b64820315eb3c809e31ac53517d4dfca8..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/tokenize.doctest +++ /dev/null @@ -1,397 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - - >>> from nltk.tokenize import * - -Regression Tests: NLTKWordTokenizer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Tokenizing some test strings. - - >>> s1 = "On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88." - >>> word_tokenize(s1) - ['On', 'a', '$', '50,000', 'mortgage', 'of', '30', 'years', 'at', '8', 'percent', ',', 'the', 'monthly', 'payment', 'would', 'be', '$', '366.88', '.'] - >>> s2 = "\"We beat some pretty good teams to get here,\" Slocum said." - >>> word_tokenize(s2) - ['``', 'We', 'beat', 'some', 'pretty', 'good', 'teams', 'to', 'get', 'here', ',', "''", 'Slocum', 'said', '.'] - >>> s3 = "Well, we couldn't have this predictable, cliche-ridden, \"Touched by an Angel\" (a show creator John Masius worked on) wanna-be if she didn't." - >>> word_tokenize(s3) - ['Well', ',', 'we', 'could', "n't", 'have', 'this', 'predictable', ',', 'cliche-ridden', ',', '``', 'Touched', 'by', 'an', 'Angel', "''", '(', 'a', 'show', 'creator', 'John', 'Masius', 'worked', 'on', ')', 'wanna-be', 'if', 'she', 'did', "n't", '.'] - >>> s4 = "I cannot cannot work under these conditions!" - >>> word_tokenize(s4) - ['I', 'can', 'not', 'can', 'not', 'work', 'under', 'these', 'conditions', '!'] - >>> s5 = "The company spent $30,000,000 last year." - >>> word_tokenize(s5) - ['The', 'company', 'spent', '$', '30,000,000', 'last', 'year', '.'] - >>> s6 = "The company spent 40.75% of its income last year." - >>> word_tokenize(s6) - ['The', 'company', 'spent', '40.75', '%', 'of', 'its', 'income', 'last', 'year', '.'] - >>> s7 = "He arrived at 3:00 pm." - >>> word_tokenize(s7) - ['He', 'arrived', 'at', '3:00', 'pm', '.'] - >>> s8 = "I bought these items: books, pencils, and pens." - >>> word_tokenize(s8) - ['I', 'bought', 'these', 'items', ':', 'books', ',', 'pencils', ',', 'and', 'pens', '.'] - >>> s9 = "Though there were 150, 100 of them were old." - >>> word_tokenize(s9) - ['Though', 'there', 'were', '150', ',', '100', 'of', 'them', 'were', 'old', '.'] - >>> s10 = "There were 300,000, but that wasn't enough." - >>> word_tokenize(s10) - ['There', 'were', '300,000', ',', 'but', 'that', 'was', "n't", 'enough', '.'] - >>> s11 = "It's more'n enough." - >>> word_tokenize(s11) - ['It', "'s", 'more', "'n", 'enough', '.'] - -Gathering the spans of the tokenized strings. - - >>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).''' - >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23), - ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38), - ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59), - ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)] - >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected - True - >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in', - ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')', - ... 
'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.'] - >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected - True - - >>> s = '''I said, "I'd like to buy some ''good muffins" which cost $3.88\n each in New (York)."''' - >>> expected = [(0, 1), (2, 6), (6, 7), (8, 9), (9, 10), (10, 12), - ... (13, 17), (18, 20), (21, 24), (25, 29), (30, 32), (32, 36), - ... (37, 44), (44, 45), (46, 51), (52, 56), (57, 58), (58, 62), - ... (64, 68), (69, 71), (72, 75), (76, 77), (77, 81), (81, 82), - ... (82, 83), (83, 84)] - >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected - True - >>> expected = ['I', 'said', ',', '"', 'I', "'d", 'like', 'to', - ... 'buy', 'some', "''", "good", 'muffins', '"', 'which', 'cost', - ... '$', '3.88', 'each', 'in', 'New', '(', 'York', ')', '.', '"'] - >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected - True - -Testing improvement made to the TreebankWordTokenizer - - >>> sx1 = '\xabNow that I can do.\xbb' - >>> expected = ['\xab', 'Now', 'that', 'I', 'can', 'do', '.', '\xbb'] - >>> word_tokenize(sx1) == expected - True - >>> sx2 = 'The unicode 201C and 201D \u201cLEFT(RIGHT) DOUBLE QUOTATION MARK\u201d is also OPEN_PUNCT and CLOSE_PUNCT.' - >>> expected = ['The', 'unicode', '201C', 'and', '201D', '\u201c', 'LEFT', '(', 'RIGHT', ')', 'DOUBLE', 'QUOTATION', 'MARK', '\u201d', 'is', 'also', 'OPEN_PUNCT', 'and', 'CLOSE_PUNCT', '.'] - >>> word_tokenize(sx2) == expected - True - - -Testing treebank's detokenizer - - >>> from nltk.tokenize.treebank import TreebankWordDetokenizer - >>> detokenizer = TreebankWordDetokenizer() - >>> s = "On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88." - >>> detokenizer.detokenize(word_tokenize(s)) - 'On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88.' - >>> s = "\"We beat some pretty good teams to get here,\" Slocum said." - >>> detokenizer.detokenize(word_tokenize(s)) - '"We beat some pretty good teams to get here," Slocum said.' - >>> s = "Well, we couldn't have this predictable, cliche-ridden, \"Touched by an Angel\" (a show creator John Masius worked on) wanna-be if she didn't." - >>> detokenizer.detokenize(word_tokenize(s)) - 'Well, we couldn\'t have this predictable, cliche-ridden, "Touched by an Angel" (a show creator John Masius worked on) wanna-be if she didn\'t.' - >>> s = "I cannot cannot work under these conditions!" - >>> detokenizer.detokenize(word_tokenize(s)) - 'I cannot cannot work under these conditions!' - >>> s = "The company spent $30,000,000 last year." - >>> detokenizer.detokenize(word_tokenize(s)) - 'The company spent $30,000,000 last year.' - >>> s = "The company spent 40.75% of its income last year." - >>> detokenizer.detokenize(word_tokenize(s)) - 'The company spent 40.75% of its income last year.' - >>> s = "He arrived at 3:00 pm." - >>> detokenizer.detokenize(word_tokenize(s)) - 'He arrived at 3:00 pm.' - >>> s = "I bought these items: books, pencils, and pens." - >>> detokenizer.detokenize(word_tokenize(s)) - 'I bought these items: books, pencils, and pens.' - >>> s = "Though there were 150, 100 of them were old." - >>> detokenizer.detokenize(word_tokenize(s)) - 'Though there were 150, 100 of them were old.' - >>> s = "There were 300,000, but that wasn't enough." - >>> detokenizer.detokenize(word_tokenize(s)) - "There were 300,000, but that wasn't enough." - >>> s = 'How "are" you?' - >>> detokenizer.detokenize(word_tokenize(s)) - 'How "are" you?' 
- >>> s = "Hello (world)" - >>> detokenizer.detokenize(word_tokenize(s)) - 'Hello (world)' - >>> s = ' with (many) [kinds] of {parentheses}. "Sometimes it\'s inside (quotes)". ("Sometimes the otherway around").' - >>> detokenizer.detokenize(word_tokenize(s)) - ' with (many) [kinds] of {parentheses}. "Sometimes it\'s inside (quotes)". ("Sometimes the otherway around").' - >>> s = "Sentence ending with (parentheses)" - >>> detokenizer.detokenize(word_tokenize(s)) - 'Sentence ending with (parentheses)' - >>> s = "(Sentence) starting with parentheses." - >>> detokenizer.detokenize(word_tokenize(s)) - '(Sentence) starting with parentheses.' - >>> s = "I've" - >>> detokenizer.detokenize(word_tokenize(s)) - "I've" - >>> s = "Don't" - >>> detokenizer.detokenize(word_tokenize(s)) - "Don't" - >>> s = "I'd" - >>> detokenizer.detokenize(word_tokenize(s)) - "I'd" - - -Sentence tokenization in word_tokenize: - - >>> s11 = "I called Dr. Jones. I called Dr. Jones." - >>> word_tokenize(s11) - ['I', 'called', 'Dr.', 'Jones', '.', 'I', 'called', 'Dr.', 'Jones', '.'] - >>> s12 = ("Ich muss unbedingt daran denken, Mehl, usw. fur einen " - ... "Kuchen einzukaufen. Ich muss.") - >>> word_tokenize(s12) - ['Ich', 'muss', 'unbedingt', 'daran', 'denken', ',', 'Mehl', ',', 'usw', - '.', 'fur', 'einen', 'Kuchen', 'einzukaufen', '.', 'Ich', 'muss', '.'] - >>> word_tokenize(s12, 'german') - ['Ich', 'muss', 'unbedingt', 'daran', 'denken', ',', 'Mehl', ',', 'usw.', - 'fur', 'einen', 'Kuchen', 'einzukaufen', '.', 'Ich', 'muss', '.'] - - -Regression Tests: Regexp Tokenizer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Some additional test strings. - - >>> s = ("Good muffins cost $3.88\nin New York. Please buy me\n" - ... "two of them.\n\nThanks.") - >>> s2 = ("Alas, it has not rained today. When, do you think, " - ... "will it rain again?") - >>> s3 = ("

<p>Although this is <b>not</b> the case here, we must " - ... "not relax our vigilance!</p>") - - >>> regexp_tokenize(s2, r'[,\.\?!"]\s*', gaps=False) - [', ', '. ', ', ', ', ', '?'] - >>> regexp_tokenize(s2, r'[,\.\?!"]\s*', gaps=True) - ['Alas', 'it has not rained today', 'When', 'do you think', - 'will it rain again'] - -Take care to avoid using capturing groups: - - >>> regexp_tokenize(s3, r'</?[bp]>', gaps=False) - ['<p>', '<b>', '</b>', '</p>'] - >>> regexp_tokenize(s3, r'</?(?:b|p)>', gaps=False) - ['<p>', '<b>', '</b>', '</p>

    '] - >>> regexp_tokenize(s3, r'', gaps=True) - ['Although this is ', 'not', - ' the case here, we must not relax our vigilance!'] - -Named groups are capturing groups, and confuse the tokenizer: - - >>> regexp_tokenize(s3, r'b|p)>', gaps=False) - ['p', 'b', 'b', 'p'] - >>> regexp_tokenize(s3, r'b|p)>', gaps=True) - ['p', 'Although this is ', 'b', 'not', 'b', - ' the case here, we must not relax our vigilance!', 'p'] - -Make sure that nested groups don't confuse the tokenizer: - - >>> regexp_tokenize(s2, r'(?:h|r|l)a(?:s|(?:i|n0))', gaps=False) - ['las', 'has', 'rai', 'rai'] - >>> regexp_tokenize(s2, r'(?:h|r|l)a(?:s|(?:i|n0))', gaps=True) - ['A', ', it ', ' not ', 'ned today. When, do you think, will it ', - 'n again?'] - -Back-references require capturing groups, and these are not supported: - - >>> regexp_tokenize("aabbbcccc", r'(.)\1') - ['a', 'b', 'c', 'c'] - -A simple sentence tokenizer '\.(\s+|$)' - - >>> regexp_tokenize(s, pattern=r'\.(?:\s+|$)', gaps=True) - ['Good muffins cost $3.88\nin New York', - 'Please buy me\ntwo of them', 'Thanks'] - - -Regression Tests: TweetTokenizer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -TweetTokenizer is a tokenizer specifically designed for micro-blogging tokenization tasks. - - >>> from nltk.tokenize import TweetTokenizer - >>> tknzr = TweetTokenizer() - >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--" - >>> tknzr.tokenize(s0) - ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--'] - >>> s1 = "@Joyster2012 @CathStaincliffe Good for you, girl!! Best wishes :-)" - >>> tknzr.tokenize(s1) - ['@Joyster2012', '@CathStaincliffe', 'Good', 'for', 'you', ',', 'girl', '!', '!', 'Best', 'wishes', ':-)'] - >>> s2 = "3Points for #DreamTeam Gooo BAILEY! :) #PBB737Gold @PBBabscbn" - >>> tknzr.tokenize(s2) - ['3Points', 'for', '#DreamTeam', 'Gooo', 'BAILEY', '!', ':)', '#PBB737Gold', '@PBBabscbn'] - >>> s3 = "@Insanomania They do... Their mentality doesn't :(" - >>> tknzr.tokenize(s3) - ['@Insanomania', 'They', 'do', '...', 'Their', 'mentality', "doesn't", ':('] - >>> s4 = "RT @facugambande: Ya por arrancar a grabar !!! #TirenTirenTiren vamoo !!" - >>> tknzr.tokenize(s4) - ['RT', '@facugambande', ':', 'Ya', 'por', 'arrancar', 'a', 'grabar', '!', '!', '!', '#TirenTirenTiren', 'vamoo', '!', '!'] - >>> tknzr = TweetTokenizer(reduce_len=True) - >>> s5 = "@crushinghes the summer holidays are great but I'm so bored already :(" - >>> tknzr.tokenize(s5) - ['@crushinghes', 'the', 'summer', 'holidays', 'are', 'great', 'but', "I'm", 'so', 'bored', 'already', ':('] - -It is possible to specify `strip_handles` and `reduce_len` parameters for a TweetTokenizer instance. Setting `strip_handles` to True, the tokenizer will remove Twitter handles (e.g. usernames). Setting `reduce_len` to True, repeated character sequences of length 3 or greater will be replaced with sequences of length 3. - - >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True) - >>> s6 = '@remy: This is waaaaayyyy too much for you!!!!!!' - >>> tknzr.tokenize(s6) - [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!'] - >>> s7 = '@_willy65: No place for @chuck tonight. Sorry.' - >>> tknzr.tokenize(s7) - [':', 'No', 'place', 'for', 'tonight', '.', 'Sorry', '.'] - >>> s8 = '@mar_tin is a great developer. Contact him at mar_tin@email.com.' 
- >>> tknzr.tokenize(s8) - ['is', 'a', 'great', 'developer', '.', 'Contact', 'him', 'at', 'mar_tin@email.com', '.'] - -The `preserve_case` parameter (default: True) allows to convert uppercase tokens to lowercase tokens. Emoticons are not affected: - - >>> tknzr = TweetTokenizer(preserve_case=False) - >>> s9 = "@jrmy: I'm REALLY HAPPYYY about that! NICEEEE :D :P" - >>> tknzr.tokenize(s9) - ['@jrmy', ':', "i'm", 'really', 'happyyy', 'about', 'that', '!', 'niceeee', ':D', ':P'] - -It should not hang on long sequences of the same punctuation character. - - >>> tknzr = TweetTokenizer() - >>> s10 = "Photo: Aujourd'hui sur http://t.co/0gebOFDUzn Projet... http://t.co/bKfIUbydz2.............................. http://fb.me/3b6uXpz0L" - >>> tknzr.tokenize(s10) - ['Photo', ':', "Aujourd'hui", 'sur', 'http://t.co/0gebOFDUzn', 'Projet', '...', 'http://t.co/bKfIUbydz2', '...', 'http://fb.me/3b6uXpz0L'] - -Tokenizing multiple sentences at once: - - >>> tknzr = TweetTokenizer() - >>> sentences = [ - ... "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--", - ... "@jrmy: I'm REALLY HAPPYYY about that! NICEEEE :D :P", - ... "@_willy65: No place for @chuck tonight. Sorry." - ... ] - >>> tknzr.tokenize_sents(sentences) # doctest: +NORMALIZE_WHITESPACE - [['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--'], - ['@jrmy', ':', "I'm", 'REALLY', 'HAPPYYY', 'about', 'that', '!', 'NICEEEE', ':D', ':P'], - ['@_willy65', ':', 'No', 'place', 'for', '@chuck', 'tonight', '.', 'Sorry', '.']] - - -Regression Tests: PunktSentenceTokenizer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The sentence splitter should remove whitespace following the sentence boundary. - - >>> pst = PunktSentenceTokenizer() - >>> pst.tokenize('See Section 3). Or Section 2). ') - ['See Section 3).', 'Or Section 2).'] - >>> pst.tokenize('See Section 3.) Or Section 2.) ') - ['See Section 3.)', 'Or Section 2.)'] - >>> pst.tokenize('See Section 3.) Or Section 2.) ', realign_boundaries=False) - ['See Section 3.', ') Or Section 2.', ')'] - - -Two instances of PunktSentenceTokenizer should not share PunktParameters. - - >>> pst = PunktSentenceTokenizer() - >>> pst2 = PunktSentenceTokenizer() - >>> pst._params is pst2._params - False - -Testing mutable default arguments for https://github.com/nltk/nltk/pull/2067 - - >>> from nltk.tokenize.punkt import PunktBaseClass, PunktTrainer, PunktSentenceTokenizer - >>> from nltk.tokenize.punkt import PunktLanguageVars, PunktParameters - >>> pbc = PunktBaseClass(lang_vars=None, params=None) - >>> type(pbc._params) - - >>> type(pbc._lang_vars) - - >>> pt = PunktTrainer(lang_vars=None) - >>> type(pt._lang_vars) - - >>> pst = PunktSentenceTokenizer(lang_vars=None) - >>> type(pst._lang_vars) - - -Testing that inputs can start with dots. - - >>> pst = PunktSentenceTokenizer(lang_vars=None) - >>> pst.tokenize(". This input starts with a dot. 
This used to cause issues.") - ['.', 'This input starts with a dot.', 'This used to cause issues.'] - -Regression Tests: align_tokens -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Post-hoc alignment of tokens with a source string - - >>> from nltk.tokenize.util import align_tokens - >>> list(align_tokens([''], "")) - [(0, 0)] - >>> list(align_tokens([''], " ")) - [(0, 0)] - >>> list(align_tokens([], "")) - [] - >>> list(align_tokens([], " ")) - [] - >>> list(align_tokens(['a'], "a")) - [(0, 1)] - >>> list(align_tokens(['abc', 'def'], "abcdef")) - [(0, 3), (3, 6)] - >>> list(align_tokens(['abc', 'def'], "abc def")) - [(0, 3), (4, 7)] - >>> list(align_tokens(['ab', 'cd'], "ab cd ef")) - [(0, 2), (3, 5)] - >>> list(align_tokens(['ab', 'cd', 'ef'], "ab cd ef")) - [(0, 2), (3, 5), (6, 8)] - >>> list(align_tokens(['ab', 'cd', 'efg'], "ab cd ef")) - Traceback (most recent call last): - .... - ValueError: substring "efg" not found in "ab cd ef" - >>> list(align_tokens(['ab', 'cd', 'ef', 'gh'], "ab cd ef")) - Traceback (most recent call last): - .... - ValueError: substring "gh" not found in "ab cd ef" - >>> list(align_tokens(['The', 'plane', ',', 'bound', 'for', 'St', 'Petersburg', ',', 'crashed', 'in', 'Egypt', "'s", 'Sinai', 'desert', 'just', '23', 'minutes', 'after', 'take-off', 'from', 'Sharm', 'el-Sheikh', 'on', 'Saturday', '.'], "The plane, bound for St Petersburg, crashed in Egypt's Sinai desert just 23 minutes after take-off from Sharm el-Sheikh on Saturday.")) - [(0, 3), (4, 9), (9, 10), (11, 16), (17, 20), (21, 23), (24, 34), (34, 35), (36, 43), (44, 46), (47, 52), (52, 54), (55, 60), (61, 67), (68, 72), (73, 75), (76, 83), (84, 89), (90, 98), (99, 103), (104, 109), (110, 119), (120, 122), (123, 131), (131, 132)] - - -Regression Tests: MWETokenizer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Pickle an MWETokenizer - - >>> from nltk.tokenize import MWETokenizer - >>> import pickle - - >>> tokenizer = MWETokenizer([('hors', "d'oeuvre")], separator='+') - >>> p = pickle.dumps(tokenizer) - >>> unpickeled = pickle.loads(p) - >>> unpickeled.tokenize("An hors d'oeuvre tonight, sir?".split()) - ['An', "hors+d'oeuvre", 'tonight,', 'sir?'] - - -Regression Tests: TextTilingTokenizer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -TextTilingTokenizer tokenizes text into coherent subtopic chunks based upon Hearst's TextTiling algorithm. 
- - >>> from nltk.tokenize import TextTilingTokenizer - >>> from nltk.corpus import brown - >>> tt = TextTilingTokenizer() - >>> tt.tokenize(brown.raw()[0:1000]) - ["\n\n\tThe/at Fulton/np-tl County/nn-tl Grand/jj-tl Jury/nn-tl said/vbd Friday/nr an/at investigation/nn of/in Atlanta's/np$ recent/jj primary/nn election/nn produced/vbd ``/`` no/at evidence/nn ''/'' that/cs any/dti irregularities/nns took/vbd place/nn ./.\n\n\n\tThe/at jury/nn further/rbr said/vbd in/in term-end/nn presentments/nns that/cs the/at City/nn-tl Executive/jj-tl Committee/nn-tl ,/, which/wdt had/hvd over-all/jj charge/nn of/in the/at election/nn ,/, ``/`` deserves/vbz the/at praise/nn and/cc thanks/nns of/in the/at City/nn-tl of/in-tl Atlanta/np-tl ''/'' for/in the/at manner/nn in/in which/wdt the/at election/nn was/bedz conducted/vbn ./.\n\n\n\tThe/at September-October/np term/nn jury/nn had/hvd been/ben charged/vbn by/in Fulton/np-tl Superior/jj-tl Court/nn-tl Judge/nn-tl Durwood/np Pye/np to/to investigate/vb reports/nns of/in possible/jj ``/`` irregularities/nns ''/'' in/in the/at hard-fought/jj primary/nn which/wdt was/bedz won/vbn by/in Mayor-nominate/nn-tl Ivan/np Allen/np Jr./"] - -Test that `ValueError` exceptions are raised when illegal arguments are used. - - >>> TextTilingTokenizer(similarity_method='foo').tokenize(brown.raw()[0:1000]) - Traceback (most recent call last): - ... - ValueError: Similarity method foo not recognized - >>> TextTilingTokenizer(smoothing_method='bar').tokenize(brown.raw()[0:1000]) - Traceback (most recent call last): - ... - ValueError: Smoothing method bar not recognized diff --git a/pipeline/nltk/test/toolbox.doctest b/pipeline/nltk/test/toolbox.doctest deleted file mode 100644 index 0dcf8495ad83460e081d47007ee5439aa54e097e..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/toolbox.doctest +++ /dev/null @@ -1,306 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -=============================== -Unit test cases for ``toolbox`` -=============================== - - >>> from nltk import toolbox - --------------------------- -``toolbox.StandardFormat`` --------------------------- - - >>> f = toolbox.StandardFormat() - -``toolbox.StandardFormat.open()`` ---------------------------------- - >>> import os, tempfile - >>> (fd, fname) = tempfile.mkstemp() - >>> tf = os.fdopen(fd, "w") - >>> _ = tf.write('\\lx a value\n\\lx another value\n') - >>> tf.close() - >>> f = toolbox.StandardFormat() - >>> f.open(fname) - >>> list(f.fields()) - [('lx', 'a value'), ('lx', 'another value')] - >>> f.close() - >>> os.unlink(fname) - -``toolbox.StandardFormat.open_string()`` ----------------------------------------- - >>> f = toolbox.StandardFormat() - >>> f.open_string('\\lx a value\n\\lx another value\n') - >>> list(f.fields()) - [('lx', 'a value'), ('lx', 'another value')] - >>> f.close() - -``toolbox.StandardFormat.close()`` ----------------------------------- - >>> f = toolbox.StandardFormat() - >>> f.open_string('\\lx a value\n\\lx another value\n') - >>> list(f.fields()) - [('lx', 'a value'), ('lx', 'another value')] - >>> f.close() - -``toolbox.StandardFormat.line_num`` ---------------------------------------- - -``StandardFormat.line_num`` contains the line number of the last line returned: - - >>> f = toolbox.StandardFormat() - >>> f.open_string('\\lx a value\n\\lx another value\n\\lx a third value\n') - >>> line_nums = [] - >>> for l in f.raw_fields(): - ... 
line_nums.append(f.line_num) - >>> line_nums - [1, 2, 3] - -``StandardFormat.line_num`` contains the line number of the last line returned: - - >>> f = toolbox.StandardFormat() - >>> f.open_string('\\lx two\nlines\n\\lx three\nlines\n\n\\lx two\nlines\n') - >>> line_nums = [] - >>> for l in f.raw_fields(): - ... line_nums.append(f.line_num) - >>> line_nums - [2, 5, 7] - -``StandardFormat.line_num`` doesn't exist before opening or after closing -a file or string: - - >>> f = toolbox.StandardFormat() - >>> f.line_num - Traceback (most recent call last): - ... - AttributeError: 'StandardFormat' object has no attribute 'line_num' - >>> f.open_string('\\lx two\nlines\n\\lx three\nlines\n\n\\lx two\nlines\n') - >>> line_nums = [] - >>> for l in f.raw_fields(): - ... line_nums.append(f.line_num) - >>> line_nums - [2, 5, 7] - >>> f.close() - >>> f.line_num - Traceback (most recent call last): - ... - AttributeError: 'StandardFormat' object has no attribute 'line_num' - -``toolbox.StandardFormat.raw_fields()`` ---------------------------------------- -``raw_fields()`` returns an iterator over tuples of two strings representing the -marker and its value. The marker is given without the backslash and the value -without its trailing newline: - - >>> f = toolbox.StandardFormat() - >>> f.open_string('\\lx a value\n\\lx another value\n') - >>> list(f.raw_fields()) - [('lx', 'a value'), ('lx', 'another value')] - -an empty file returns nothing: - - >>> f = toolbox.StandardFormat() - >>> f.open_string('') - >>> list(f.raw_fields()) - [] - -file with only a newline returns WHAT SHOULD IT RETURN???: - - >>> f = toolbox.StandardFormat() - >>> f.open_string('\n') - >>> list(f.raw_fields()) - [(None, '')] - -file with only one field should be parsed ok: - - >>> f = toolbox.StandardFormat() - >>> f.open_string('\\lx one value\n') - >>> list(f.raw_fields()) - [('lx', 'one value')] - -file without a trailing newline should be parsed ok: - - >>> f = toolbox.StandardFormat() - >>> f.open_string('\\lx a value\n\\lx another value') - >>> list(f.raw_fields()) - [('lx', 'a value'), ('lx', 'another value')] - -trailing white space is preserved except for the final newline: - - >>> f = toolbox.StandardFormat() - >>> f.open_string('\\lx trailing space \n\\lx trailing tab\t\n\\lx extra newline\n\n') - >>> list(f.raw_fields()) - [('lx', 'trailing space '), ('lx', 'trailing tab\t'), ('lx', 'extra newline\n')] - -line wrapping is preserved: - - >>> f = toolbox.StandardFormat() - >>> f.open_string('\\lx a value\nmore of the value\nand still more\n\\lc another val\n') - >>> list(f.raw_fields()) - [('lx', 'a value\nmore of the value\nand still more'), ('lc', 'another val')] - -file beginning with a multiline record should be parsed ok: - - >>> f = toolbox.StandardFormat() - >>> f.open_string('\\lx a value\nmore of the value\nand still more\n\\lc another val\n') - >>> list(f.raw_fields()) - [('lx', 'a value\nmore of the value\nand still more'), ('lc', 'another val')] - -file ending with a multiline record should be parsed ok: - - >>> f = toolbox.StandardFormat() - >>> f.open_string('\\lc a value\n\\lx another value\nmore of the value\nand still more\n') - >>> list(f.raw_fields()) - [('lc', 'a value'), ('lx', 'another value\nmore of the value\nand still more')] - -file beginning with a BOM should be parsed ok: - - >>> f = toolbox.StandardFormat() - >>> f.open_string('\xef\xbb\xbf\\lx a value\n\\lx another value\n') - >>> list(f.raw_fields()) - [('lx', 'a value'), ('lx', 'another value')] - -file beginning with two BOMs should 
ignore only the first one: - - >>> f = toolbox.StandardFormat() - >>> f.open_string('\xef\xbb\xbf\xef\xbb\xbf\\lx a value\n\\lx another value\n') - >>> list(f.raw_fields()) - [(None, '\xef\xbb\xbf\\lx a value'), ('lx', 'another value')] - -should not ignore a BOM not at the beginning of the file: - - >>> f = toolbox.StandardFormat() - >>> f.open_string('\\lx a value\n\xef\xbb\xbf\\lx another value\n') - >>> list(f.raw_fields()) - [('lx', 'a value\n\xef\xbb\xbf\\lx another value')] - -``toolbox.StandardFormat.fields()`` ------------------------------------ -trailing white space is not preserved: - - >>> f = toolbox.StandardFormat() - >>> f.open_string('\\lx trailing space \n\\lx trailing tab\t\n\\lx extra newline\n\n') - >>> list(f.fields()) - [('lx', 'trailing space'), ('lx', 'trailing tab'), ('lx', 'extra newline')] - -multiline fields are unwrapped: - - >>> f = toolbox.StandardFormat() - >>> f.open_string('\\lx a value\nmore of the value\nand still more\n\\lc another val\n') - >>> list(f.fields()) - [('lx', 'a value more of the value and still more'), ('lc', 'another val')] - -markers -------- -A backslash in the first position on a new line indicates the start of a -marker. The backslash is not part of the marker: - - >>> f = toolbox.StandardFormat() - >>> f.open_string('\\mk a value\n') - >>> list(f.fields()) - [('mk', 'a value')] - -If the backslash occurs later in the line it does not indicate the start -of a marker: - - >>> f = toolbox.StandardFormat() - >>> f.open_string('\\mk a value\n \\mk another one\n') - >>> list(f.raw_fields()) - [('mk', 'a value\n \\mk another one')] - -There is no specific limit to the length of a marker: - - >>> f = toolbox.StandardFormat() - >>> f.open_string('\\this_is_an_extremely_long_marker value\n') - >>> list(f.fields()) - [('this_is_an_extremely_long_marker', 'value')] - -A marker can contain any non white space character: - - >>> f = toolbox.StandardFormat() - >>> f.open_string('\\`~!@#$%^&*()_-=+[{]}\\|,<.>/?;:"0123456789 value\n') - >>> list(f.fields()) - [('`~!@#$%^&*()_-=+[{]}\\|,<.>/?;:"0123456789', 'value')] - -A marker is terminated by any white space character: - - >>> f = toolbox.StandardFormat() - >>> f.open_string('\\mk a value\n\\mk\tanother one\n\\mk\rthird one\n\\mk\ffourth one') - >>> list(f.fields()) - [('mk', 'a value'), ('mk', 'another one'), ('mk', 'third one'), ('mk', 'fourth one')] - -Consecutive whitespace characters (except newline) are treated the same as one: - - >>> f = toolbox.StandardFormat() - >>> f.open_string('\\mk \t\r\fa value\n') - >>> list(f.fields()) - [('mk', 'a value')] - ------------------------ -``toolbox.ToolboxData`` ------------------------ - - >>> db = toolbox.ToolboxData() - -``toolbox.ToolboxData.parse()`` -------------------------------- -check that normal parsing works: - - >>> from xml.etree import ElementTree - >>> td = toolbox.ToolboxData() - >>> s = """\\_sh v3.0 400 Rotokas Dictionary - ... \\_DateStampHasFourDigitYear - ... - ... \\lx kaa - ... \\ps V.A - ... \\ge gag - ... \\gp nek i pas - ... - ... \\lx kaa - ... \\ps V.B - ... \\ge strangle - ... \\gp pasim nek - ... """ - >>> td.open_string(s) - >>> tree = td.parse(key='lx') - >>> tree.tag - 'toolbox_data' - >>> ElementTree.tostring(list(tree)[0]).decode('utf8') - '
    <header><_sh>v3.0 400 Rotokas Dictionary</_sh><_DateStampHasFourDigitYear /></header>
    ' - >>> ElementTree.tostring(list(tree)[1]).decode('utf8') - 'kaaV.Agagnek i pas' - >>> ElementTree.tostring(list(tree)[2]).decode('utf8') - 'kaaV.Bstranglepasim nek' - -check that guessing the key marker works: - - >>> from xml.etree import ElementTree - >>> td = toolbox.ToolboxData() - >>> s = """\\_sh v3.0 400 Rotokas Dictionary - ... \\_DateStampHasFourDigitYear - ... - ... \\lx kaa - ... \\ps V.A - ... \\ge gag - ... \\gp nek i pas - ... - ... \\lx kaa - ... \\ps V.B - ... \\ge strangle - ... \\gp pasim nek - ... """ - >>> td.open_string(s) - >>> tree = td.parse() - >>> ElementTree.tostring(list(tree)[0]).decode('utf8') - '
    <header><_sh>v3.0 400 Rotokas Dictionary</_sh><_DateStampHasFourDigitYear /></header>
    ' - >>> ElementTree.tostring(list(tree)[1]).decode('utf8') - 'kaaV.Agagnek i pas' - >>> ElementTree.tostring(list(tree)[2]).decode('utf8') - 'kaaV.Bstranglepasim nek' - ------------------------ -``toolbox`` functions ------------------------ - -``toolbox.to_sfm_string()`` -------------------------------- diff --git a/pipeline/nltk/test/translate.doctest b/pipeline/nltk/test/translate.doctest deleted file mode 100644 index fd8eb4c1b50ac9f24fcc18cd12cb614f2b2feda9..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/translate.doctest +++ /dev/null @@ -1,240 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -.. -*- coding: utf-8 -*- - -========= -Alignment -========= - -Corpus Reader -------------- - - >>> from nltk.corpus import comtrans - >>> words = comtrans.words('alignment-en-fr.txt') - >>> for word in words[:6]: - ... print(word) - Resumption - of - the - session - I - declare - >>> als = comtrans.aligned_sents('alignment-en-fr.txt')[0] - >>> als - AlignedSent(['Resumption', 'of', 'the', 'session'], - ['Reprise', 'de', 'la', 'session'], - Alignment([(0, 0), (1, 1), (2, 2), (3, 3)])) - - -Alignment Objects ------------------ - -Aligned sentences are simply a mapping between words in a sentence: - - >>> print(" ".join(als.words)) - Resumption of the session - >>> print(" ".join(als.mots)) - Reprise de la session - >>> als.alignment - Alignment([(0, 0), (1, 1), (2, 2), (3, 3)]) - - -Usually we look at them from the perspective of a source to a target language, -but they are easily inverted: - - >>> als.invert() - AlignedSent(['Reprise', 'de', 'la', 'session'], - ['Resumption', 'of', 'the', 'session'], - Alignment([(0, 0), (1, 1), (2, 2), (3, 3)])) - - -We can create new alignments, but these need to be in the correct range of -the corresponding sentences: - - >>> from nltk.translate import Alignment, AlignedSent - >>> als = AlignedSent(['Reprise', 'de', 'la', 'session'], - ... ['Resumption', 'of', 'the', 'session'], - ... Alignment([(0, 0), (1, 4), (2, 1), (3, 3)])) - Traceback (most recent call last): - ... - IndexError: Alignment is outside boundary of mots - - -You can set alignments with any sequence of tuples, so long as the first two -indexes of the tuple are the alignment indices: - - >>> als.alignment = Alignment([(0, 0), (1, 1), (2, 2, "boat"), (3, 3, False, (1,2))]) - - >>> Alignment([(0, 0), (1, 1), (2, 2, "boat"), (3, 3, False, (1,2))]) - Alignment([(0, 0), (1, 1), (2, 2, 'boat'), (3, 3, False, (1, 2))]) - - -Alignment Algorithms --------------------- - -EM for IBM Model 1 -~~~~~~~~~~~~~~~~~~ - -Here is an example from Koehn, 2010: - - >>> from nltk.translate import IBMModel1 - >>> corpus = [AlignedSent(['the', 'house'], ['das', 'Haus']), - ... AlignedSent(['the', 'book'], ['das', 'Buch']), - ... 
AlignedSent(['a', 'book'], ['ein', 'Buch'])] - >>> em_ibm1 = IBMModel1(corpus, 20) - >>> print(round(em_ibm1.translation_table['the']['das'], 1)) - 1.0 - >>> print(round(em_ibm1.translation_table['book']['das'], 1)) - 0.0 - >>> print(round(em_ibm1.translation_table['house']['das'], 1)) - 0.0 - >>> print(round(em_ibm1.translation_table['the']['Buch'], 1)) - 0.0 - >>> print(round(em_ibm1.translation_table['book']['Buch'], 1)) - 1.0 - >>> print(round(em_ibm1.translation_table['a']['Buch'], 1)) - 0.0 - >>> print(round(em_ibm1.translation_table['book']['ein'], 1)) - 0.0 - >>> print(round(em_ibm1.translation_table['a']['ein'], 1)) - 1.0 - >>> print(round(em_ibm1.translation_table['the']['Haus'], 1)) - 0.0 - >>> print(round(em_ibm1.translation_table['house']['Haus'], 1)) - 1.0 - >>> print(round(em_ibm1.translation_table['book'][None], 1)) - 0.5 - -And using an NLTK corpus. We train on only 10 sentences, since it is so slow: - - >>> from nltk.corpus import comtrans - >>> com_ibm1 = IBMModel1(comtrans.aligned_sents()[:10], 20) - >>> print(round(com_ibm1.translation_table['bitte']['Please'], 1)) - 0.2 - >>> print(round(com_ibm1.translation_table['Sitzungsperiode']['session'], 1)) - 1.0 - - -Evaluation ----------- -The evaluation metrics for alignments are usually not interested in the -contents of alignments but more often the comparison to a "gold standard" -alignment that has been been constructed by human experts. For this reason we -often want to work just with raw set operations against the alignment points. -This then gives us a very clean form for defining our evaluation metrics. - -.. Note:: - The AlignedSent class has no distinction of "possible" or "sure" - alignments. Thus all alignments are treated as "sure". - -Consider the following aligned sentence for evaluation: - - >>> my_als = AlignedSent(['Resumption', 'of', 'the', 'session'], - ... ['Reprise', 'de', 'la', 'session'], - ... Alignment([(0, 0), (3, 3), (1, 2), (1, 1), (1, 3)])) - -Precision -~~~~~~~~~ -``precision = |A∩P| / |A|`` - -**Precision** is probably the most well known evaluation metric and it is implemented -in `nltk.metrics.scores.precision`_. Since precision is simply interested in the -proportion of correct alignments, we calculate the ratio of the number of our -test alignments (*A*) that match a possible alignment (*P*), over the number of -test alignments provided. There is no penalty for missing a possible alignment -in our test alignments. An easy way to game this metric is to provide just one -test alignment that is in *P* [OCH2000]_. - -Here are some examples: - - >>> from nltk.metrics import precision - >>> als.alignment = Alignment([(0,0), (1,1), (2,2), (3,3)]) - >>> precision(Alignment([]), als.alignment) - 0.0 - >>> precision(Alignment([(0,0), (1,1), (2,2), (3,3)]), als.alignment) - 1.0 - >>> precision(Alignment([(0,0), (3,3)]), als.alignment) - 0.5 - >>> precision(Alignment.fromstring('0-0 3-3'), als.alignment) - 0.5 - >>> precision(Alignment([(0,0), (1,1), (2,2), (3,3), (1,2), (2,1)]), als.alignment) - 1.0 - >>> precision(als.alignment, my_als.alignment) - 0.6 - - -.. _nltk.metrics.scores.precision: - https://www.nltk.org/api/nltk.metrics.html#nltk.metrics.scores.precision - - -Recall -~~~~~~ -``recall = |A∩S| / |S|`` - -**Recall** is another well known evaluation metric that has a set based -implementation in NLTK as `nltk.metrics.scores.recall`_. 
Since recall is -simply interested in the proportion of found alignments, we calculate the -ratio of the number of our test alignments (*A*) that match a sure alignment -(*S*) over the number of sure alignments. There is no penalty for producing -a lot of test alignments. An easy way to game this metric is to include every -possible alignment in our test alignments, regardless if they are correct or -not [OCH2000]_. - -Here are some examples: - - >>> from nltk.metrics import recall - >>> print(recall(Alignment([]), als.alignment)) - None - >>> recall(Alignment([(0,0), (1,1), (2,2), (3,3)]), als.alignment) - 1.0 - >>> recall(Alignment.fromstring('0-0 3-3'), als.alignment) - 1.0 - >>> recall(Alignment([(0,0), (3,3)]), als.alignment) - 1.0 - >>> recall(Alignment([(0,0), (1,1), (2,2), (3,3), (1,2), (2,1)]), als.alignment) - 0.66666... - >>> recall(als.alignment, my_als.alignment) - 0.75 - - -.. _nltk.metrics.scores.recall: - https://www.nltk.org/api/nltk.metrics.html#nltk.metrics.scores.recall - - -Alignment Error Rate (AER) -~~~~~~~~~~~~~~~~~~~~~~~~~~ -``AER = 1 - (|A∩S| + |A∩P|) / (|A| + |S|)`` - -**Alignment Error Rate** is commonly used metric for assessing sentence -alignments. It combines precision and recall metrics together such that a -perfect alignment must have all of the sure alignments and may have some -possible alignments [MIHALCEA2003]_ [KOEHN2010]_. - -.. Note:: - [KOEHN2010]_ defines the AER as ``AER = (|A∩S| + |A∩P|) / (|A| + |S|)`` - in his book, but corrects it to the above in his online errata. This is - in line with [MIHALCEA2003]_. - -Here are some examples: - - >>> from nltk.translate import alignment_error_rate - >>> alignment_error_rate(Alignment([]), als.alignment) - 1.0 - >>> alignment_error_rate(Alignment([(0,0), (1,1), (2,2), (3,3)]), als.alignment) - 0.0 - >>> alignment_error_rate(als.alignment, my_als.alignment) - 0.333333... - >>> alignment_error_rate(als.alignment, my_als.alignment, - ... als.alignment | Alignment([(1,2), (2,1)])) - 0.222222... - - -.. [OCH2000] Och, F. and Ney, H. (2000) - *Statistical Machine Translation*, EAMT Workshop - -.. [MIHALCEA2003] Mihalcea, R. and Pedersen, T. (2003) - *An evaluation exercise for word alignment*, HLT-NAACL 2003 - -.. [KOEHN2010] Koehn, P. (2010) - *Statistical Machine Translation*, Cambridge University Press diff --git a/pipeline/nltk/test/tree.doctest b/pipeline/nltk/test/tree.doctest deleted file mode 100644 index 7b6748bd4abdde316b92b38789c777b2209c3da0..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/tree.doctest +++ /dev/null @@ -1,1223 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -=============================== - Unit tests for nltk.tree.Tree -=============================== - - >>> from nltk.tree import * - -Some trees to run tests on: - - >>> dp1 = Tree('dp', [Tree('d', ['the']), Tree('np', ['dog'])]) - >>> dp2 = Tree('dp', [Tree('d', ['the']), Tree('np', ['cat'])]) - >>> vp = Tree('vp', [Tree('v', ['chased']), dp2]) - >>> tree = Tree('s', [dp1, vp]) - >>> print(tree) - (s (dp (d the) (np dog)) (vp (v chased) (dp (d the) (np cat)))) - -The node label is accessed using the `label()` method: - - >>> dp1.label(), dp2.label(), vp.label(), tree.label() - ('dp', 'dp', 'vp', 's') - - >>> print(tree[1,1,1,0]) - cat - -The `treepositions` method returns a list of the tree positions of -subtrees and leaves in a tree. 
By default, it gives the position of -every tree, subtree, and leaf, in prefix order: - - >>> print(tree.treepositions()) - [(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1,), (1, 0), (1, 0, 0), (1, 1), (1, 1, 0), (1, 1, 0, 0), (1, 1, 1), (1, 1, 1, 0)] - -In addition to `str` and `repr`, several methods exist to convert a -tree object to one of several standard tree encodings: - - >>> print(tree.pformat_latex_qtree()) - \Tree [.s - [.dp [.d the ] [.np dog ] ] - [.vp [.v chased ] [.dp [.d the ] [.np cat ] ] ] ] - -There is also a fancy ASCII art representation: - - >>> tree.pretty_print() - s - ________|_____ - | vp - | _____|___ - dp | dp - ___|___ | ___|___ - d np v d np - | | | | | - the dog chased the cat - - >>> tree.pretty_print(unicodelines=True, nodedist=4) - s - ┌──────────────┴────────┐ - │ vp - │ ┌────────┴──────┐ - dp │ dp - ┌──────┴──────┐ │ ┌──────┴──────┐ - d np v d np - │ │ │ │ │ - the dog chased the cat - -Trees can be initialized from treebank strings: - - >>> tree2 = Tree.fromstring('(S (NP I) (VP (V enjoyed) (NP my cookie)))') - >>> print(tree2) - (S (NP I) (VP (V enjoyed) (NP my cookie))) - -Trees can be compared for equality: - - >>> tree == Tree.fromstring(str(tree)) - True - >>> tree2 == Tree.fromstring(str(tree2)) - True - >>> tree == tree2 - False - >>> tree == Tree.fromstring(str(tree2)) - False - >>> tree2 == Tree.fromstring(str(tree)) - False - - >>> tree != Tree.fromstring(str(tree)) - False - >>> tree2 != Tree.fromstring(str(tree2)) - False - >>> tree != tree2 - True - >>> tree != Tree.fromstring(str(tree2)) - True - >>> tree2 != Tree.fromstring(str(tree)) - True - - >>> tree < tree2 or tree > tree2 - True - -Tree Parsing -============ - -The class method `Tree.fromstring()` can be used to parse trees, and it -provides some additional options. - - >>> tree = Tree.fromstring('(S (NP I) (VP (V enjoyed) (NP my cookie)))') - >>> print(tree) - (S (NP I) (VP (V enjoyed) (NP my cookie))) - -When called on a subclass of `Tree`, it will create trees of that -type: - - >>> tree = ImmutableTree.fromstring('(VP (V enjoyed) (NP my cookie))') - >>> print(tree) - (VP (V enjoyed) (NP my cookie)) - >>> print(type(tree)) - - >>> tree[1] = 'x' - Traceback (most recent call last): - . . . - ValueError: ImmutableTree may not be modified - >>> del tree[0] - Traceback (most recent call last): - . . . - ValueError: ImmutableTree may not be modified - -The ``brackets`` parameter can be used to specify two characters that -should be used as brackets: - - >>> print(Tree.fromstring('[S [NP I] [VP [V enjoyed] [NP my cookie]]]', - ... brackets='[]')) - (S (NP I) (VP (V enjoyed) (NP my cookie))) - >>> print(Tree.fromstring(' >>', - ... brackets='<>')) - (S (NP I) (VP (V enjoyed) (NP my cookie))) - -If ``brackets`` is not a string, or is not exactly two characters, -then `Tree.fromstring` raises an exception: - - >>> Tree.fromstring(' >', brackets='') - Traceback (most recent call last): - . . . - TypeError: brackets must be a length-2 string - >>> Tree.fromstring(' >', brackets='<<>>') - Traceback (most recent call last): - . . . - TypeError: brackets must be a length-2 string - >>> Tree.fromstring(' >', brackets=12) - Traceback (most recent call last): - . . . - TypeError: brackets must be a length-2 string - >>> Tree.fromstring('<>', brackets=('<<','>>')) - Traceback (most recent call last): - . . . - TypeError: brackets must be a length-2 string - -(We may add support for multi-character brackets in the future, in -which case the ``brackets=('<<','>>')`` example would start working.) 
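A minimal standalone sketch of the ``brackets`` option (a hedged illustration, assuming only that the ``nltk`` package is installed; ``nltk.tree`` needs no downloaded corpus data):

    from nltk.tree import Tree

    # Parse a tree serialized with angle brackets instead of the default "()";
    # printing re-serializes it with the default round brackets.
    t = Tree.fromstring('<S <NP I> <VP <V enjoyed> <NP my cookie>>>', brackets='<>')
    print(t.label())   # S
    print(t.leaves())  # ['I', 'enjoyed', 'my', 'cookie']
    print(t)           # (S (NP I) (VP (V enjoyed) (NP my cookie)))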
- -Whitespace brackets are not permitted: - - >>> Tree.fromstring('(NP my cookie\n', brackets='(\n') - Traceback (most recent call last): - . . . - TypeError: whitespace brackets not allowed - -If an invalid tree is given to Tree.fromstring, then it raises a -ValueError, with a description of the problem: - - >>> Tree.fromstring('(NP my cookie) (NP my milk)') - Traceback (most recent call last): - . . . - ValueError: Tree.fromstring(): expected 'end-of-string' but got '(NP' - at index 15. - "...y cookie) (NP my mil..." - ^ - >>> Tree.fromstring(')NP my cookie(') - Traceback (most recent call last): - . . . - ValueError: Tree.fromstring(): expected '(' but got ')' - at index 0. - ")NP my coo..." - ^ - >>> Tree.fromstring('(NP my cookie))') - Traceback (most recent call last): - . . . - ValueError: Tree.fromstring(): expected 'end-of-string' but got ')' - at index 14. - "...my cookie))" - ^ - >>> Tree.fromstring('my cookie)') - Traceback (most recent call last): - . . . - ValueError: Tree.fromstring(): expected '(' but got 'my' - at index 0. - "my cookie)" - ^ - >>> Tree.fromstring('(NP my cookie') - Traceback (most recent call last): - . . . - ValueError: Tree.fromstring(): expected ')' but got 'end-of-string' - at index 13. - "... my cookie" - ^ - >>> Tree.fromstring('') - Traceback (most recent call last): - . . . - ValueError: Tree.fromstring(): expected '(' but got 'end-of-string' - at index 0. - "" - ^ - -Trees with no children are supported: - - >>> print(Tree.fromstring('(S)')) - (S ) - >>> print(Tree.fromstring('(X (Y) (Z))')) - (X (Y ) (Z )) - -Trees with an empty node label and no children are supported: - - >>> print(Tree.fromstring('()')) - ( ) - >>> print(Tree.fromstring('(X () ())')) - (X ( ) ( )) - -Trees with an empty node label and children are supported, but only if the -first child is not a leaf (otherwise, it will be treated as the node label). - - >>> print(Tree.fromstring('((A) (B) (C))')) - ( (A ) (B ) (C )) - >>> print(Tree.fromstring('((A) leaf)')) - ( (A ) leaf) - >>> print(Tree.fromstring('(((())))')) - ( ( ( ( )))) - -The optional arguments `read_node` and `read_leaf` may be used to -transform the string values of nodes or leaves. - - >>> print(Tree.fromstring('(A b (C d e) (F (G h i)))', - ... read_node=lambda s: '<%s>' % s, - ... read_leaf=lambda s: '"%s"' % s)) - (
    "b" ( "d" "e") ( ( "h" "i"))) - -These transformation functions are typically used when the node or -leaf labels should be parsed to a non-string value (such as a feature -structure). If node and leaf labels need to be able to include -whitespace, then you must also use the optional `node_pattern` and -`leaf_pattern` arguments. - - >>> from nltk.featstruct import FeatStruct - >>> tree = Tree.fromstring('([cat=NP] [lex=the] [lex=dog])', - ... read_node=FeatStruct, read_leaf=FeatStruct) - >>> tree.set_label(tree.label().unify(FeatStruct('[num=singular]'))) - >>> print(tree) - ([cat='NP', num='singular'] [lex='the'] [lex='dog']) - -The optional argument ``remove_empty_top_bracketing`` can be used to -remove any top-level empty bracketing that occurs. - - >>> print(Tree.fromstring('((S (NP I) (VP (V enjoyed) (NP my cookie))))', - ... remove_empty_top_bracketing=True)) - (S (NP I) (VP (V enjoyed) (NP my cookie))) - -It will not remove a top-level empty bracketing with multiple children: - - >>> print(Tree.fromstring('((A a) (B b))')) - ( (A a) (B b)) - - -Tree.fromlist() ---------------- -The class method `Tree.fromlist()` can be used to parse trees -that are expressed as nested lists, such as those produced by -the tree() function from the wordnet module. - - >>> from nltk.corpus import wordnet as wn - >>> t=Tree.fromlist(wn.synset('dog.n.01').tree(lambda s:s.hypernyms())) - >>> print(t.height()) - 14 - >>> print(t.leaves()) - ["Synset('entity.n.01')", "Synset('entity.n.01')"] - >>> t.pretty_print() - Synset('dog.n.01') - _________________|__________________ - Synset('canine.n. | - 02') | - | | - Synset('carnivor | - e.n.01') | - | | - Synset('placenta | - l.n.01') | - | | - Synset('mammal.n. | - 01') | - | | - Synset('vertebra | - te.n.01') | - | | - Synset('chordate. Synset('domestic - n.01') _animal.n.01') - | | - Synset('animal.n. Synset('animal.n. - 01') 01') - | | - Synset('organism. Synset('organism. - n.01') n.01') - | | - Synset('living_t Synset('living_t - hing.n.01') hing.n.01') - | | - Synset('whole.n. Synset('whole.n. - 02') 02') - | | - Synset('object.n. Synset('object.n. - 01') 01') - | | - Synset('physical Synset('physical - _entity.n.01') _entity.n.01') - | | - Synset('entity.n. Synset('entity.n. - 01') 01') - - - -Parented Trees -============== -`ParentedTree` is a subclass of `Tree` that automatically maintains -parent pointers for single-parented trees. Parented trees can be -created directly from a node label and a list of children: - - >>> ptree = ( - ... ParentedTree('VP', [ - ... ParentedTree('VERB', ['saw']), - ... ParentedTree('NP', [ - ... ParentedTree('DET', ['the']), - ... ParentedTree('NOUN', ['dog'])])])) - >>> print(ptree) - (VP (VERB saw) (NP (DET the) (NOUN dog))) - -Parented trees can be created from strings using the classmethod -`ParentedTree.fromstring`: - - >>> ptree = ParentedTree.fromstring('(VP (VERB saw) (NP (DET the) (NOUN dog)))') - >>> print(ptree) - (VP (VERB saw) (NP (DET the) (NOUN dog))) - >>> print(type(ptree)) - - -Parented trees can also be created by using the classmethod -`ParentedTree.convert` to convert another type of tree to a parented -tree: - - >>> tree = Tree.fromstring('(VP (VERB saw) (NP (DET the) (NOUN dog)))') - >>> ptree = ParentedTree.convert(tree) - >>> print(ptree) - (VP (VERB saw) (NP (DET the) (NOUN dog))) - >>> print(type(ptree)) - - -.. clean-up: - - >>> del tree - -`ParentedTree`\ s should never be used in the same tree as `Tree`\ s -or `MultiParentedTree`\ s. 
Mixing tree implementations may result in -incorrect parent pointers and in `TypeError` exceptions: - - >>> # Inserting a Tree in a ParentedTree gives an exception: - >>> ParentedTree('NP', [ - ... Tree('DET', ['the']), Tree('NOUN', ['dog'])]) - Traceback (most recent call last): - . . . - TypeError: Can not insert a non-ParentedTree into a ParentedTree - - >>> # inserting a ParentedTree in a Tree gives incorrect parent pointers: - >>> broken_tree = Tree('NP', [ - ... ParentedTree('DET', ['the']), ParentedTree('NOUN', ['dog'])]) - >>> print(broken_tree[0].parent()) - None - -Parented Tree Methods ------------------------- -In addition to all the methods defined by the `Tree` class, the -`ParentedTree` class adds six new methods whose values are -automatically updated whenever a parented tree is modified: `parent()`, -`parent_index()`, `left_sibling()`, `right_sibling()`, `root()`, and -`treeposition()`. - -The `parent()` method contains a `ParentedTree`\ 's parent, if it has -one; and ``None`` otherwise. `ParentedTree`\ s that do not have -parents are known as "root trees." - - >>> for subtree in ptree.subtrees(): - ... print(subtree) - ... print(' Parent = %s' % subtree.parent()) - (VP (VERB saw) (NP (DET the) (NOUN dog))) - Parent = None - (VERB saw) - Parent = (VP (VERB saw) (NP (DET the) (NOUN dog))) - (NP (DET the) (NOUN dog)) - Parent = (VP (VERB saw) (NP (DET the) (NOUN dog))) - (DET the) - Parent = (NP (DET the) (NOUN dog)) - (NOUN dog) - Parent = (NP (DET the) (NOUN dog)) - -The `parent_index()` method stores the index of a tree in its parent's -child list. If a tree does not have a parent, then its `parent_index` -is ``None``. - - >>> for subtree in ptree.subtrees(): - ... print(subtree) - ... print(' Parent Index = %s' % subtree.parent_index()) - ... assert (subtree.parent() is None or - ... subtree.parent()[subtree.parent_index()] is subtree) - (VP (VERB saw) (NP (DET the) (NOUN dog))) - Parent Index = None - (VERB saw) - Parent Index = 0 - (NP (DET the) (NOUN dog)) - Parent Index = 1 - (DET the) - Parent Index = 0 - (NOUN dog) - Parent Index = 1 - -Note that ``ptree.parent().index(ptree)`` is *not* equivalent to -``ptree.parent_index()``. In particular, ``ptree.parent().index(ptree)`` -will return the index of the first child of ``ptree.parent()`` that is -equal to ``ptree`` (using ``==``); and that child may not be -``ptree``: - - >>> on_and_on = ParentedTree('CONJP', [ - ... ParentedTree('PREP', ['on']), - ... ParentedTree('COJN', ['and']), - ... ParentedTree('PREP', ['on'])]) - >>> second_on = on_and_on[2] - >>> print(second_on.parent_index()) - 2 - >>> print(second_on.parent().index(second_on)) - 0 - -The methods `left_sibling()` and `right_sibling()` can be used to get a -parented tree's siblings. If a tree does not have a left or right -sibling, then the corresponding method's value is ``None``: - - >>> for subtree in ptree.subtrees(): - ... print(subtree) - ... print(' Left Sibling = %s' % subtree.left_sibling()) - ... print(' Right Sibling = %s' % subtree.right_sibling()) - (VP (VERB saw) (NP (DET the) (NOUN dog))) - Left Sibling = None - Right Sibling = None - (VERB saw) - Left Sibling = None - Right Sibling = (NP (DET the) (NOUN dog)) - (NP (DET the) (NOUN dog)) - Left Sibling = (VERB saw) - Right Sibling = None - (DET the) - Left Sibling = None - Right Sibling = (NOUN dog) - (NOUN dog) - Left Sibling = (DET the) - Right Sibling = None - -A parented tree's root tree can be accessed using the `root()` -method. 
This method follows the tree's parent pointers until it -finds a tree without a parent. If a tree does not have a parent, then -it is its own root: - - >>> for subtree in ptree.subtrees(): - ... print(subtree) - ... print(' Root = %s' % subtree.root()) - (VP (VERB saw) (NP (DET the) (NOUN dog))) - Root = (VP (VERB saw) (NP (DET the) (NOUN dog))) - (VERB saw) - Root = (VP (VERB saw) (NP (DET the) (NOUN dog))) - (NP (DET the) (NOUN dog)) - Root = (VP (VERB saw) (NP (DET the) (NOUN dog))) - (DET the) - Root = (VP (VERB saw) (NP (DET the) (NOUN dog))) - (NOUN dog) - Root = (VP (VERB saw) (NP (DET the) (NOUN dog))) - -The `treeposition()` method can be used to find a tree's treeposition -relative to its root: - - >>> for subtree in ptree.subtrees(): - ... print(subtree) - ... print(' Tree Position = %s' % (subtree.treeposition(),)) - ... assert subtree.root()[subtree.treeposition()] is subtree - (VP (VERB saw) (NP (DET the) (NOUN dog))) - Tree Position = () - (VERB saw) - Tree Position = (0,) - (NP (DET the) (NOUN dog)) - Tree Position = (1,) - (DET the) - Tree Position = (1, 0) - (NOUN dog) - Tree Position = (1, 1) - -Whenever a parented tree is modified, all of the methods described -above (`parent()`, `parent_index()`, `left_sibling()`, `right_sibling()`, -`root()`, and `treeposition()`) are automatically updated. For example, -if we replace ``ptree``\ 's subtree for the word "dog" with a new -subtree for "cat," the method values for both the "dog" subtree and the -"cat" subtree get automatically updated: - - >>> # Replace the dog with a cat - >>> dog = ptree[1,1] - >>> cat = ParentedTree('NOUN', ['cat']) - >>> ptree[1,1] = cat - - >>> # the noun phrase is no longer the dog's parent: - >>> print(dog.parent(), dog.parent_index(), dog.left_sibling()) - None None None - >>> # dog is now its own root. - >>> print(dog.root()) - (NOUN dog) - >>> print(dog.treeposition()) - () - - >>> # the cat's parent is now the noun phrase: - >>> print(cat.parent()) - (NP (DET the) (NOUN cat)) - >>> print(cat.parent_index()) - 1 - >>> print(cat.left_sibling()) - (DET the) - >>> print(cat.root()) - (VP (VERB saw) (NP (DET the) (NOUN cat))) - >>> print(cat.treeposition()) - (1, 1) - -ParentedTree Regression Tests ------------------------------ -Keep track of all trees that we create (including subtrees) using this -variable: - - >>> all_ptrees = [] - -Define a helper function to create new parented trees: - - >>> def make_ptree(s): - ... ptree = ParentedTree.convert(Tree.fromstring(s)) - ... all_ptrees.extend(t for t in ptree.subtrees() - ... if isinstance(t, Tree)) - ... return ptree - -Define a test function that examines every subtree in all_ptrees; and -checks that all six of its methods are defined correctly. If any -ptrees are passed as arguments, then they are printed. - - >>> def pcheck(*print_ptrees): - ... for ptree in all_ptrees: - ... # Check ptree's methods. - ... if ptree.parent() is not None: - ... i = ptree.parent_index() - ... assert ptree.parent()[i] is ptree - ... if i > 0: - ... assert ptree.left_sibling() is ptree.parent()[i-1] - ... if i < (len(ptree.parent())-1): - ... assert ptree.right_sibling() is ptree.parent()[i+1] - ... assert len(ptree.treeposition()) > 0 - ... assert (ptree.treeposition() == - ... ptree.parent().treeposition() + (ptree.parent_index(),)) - ... assert ptree.root() is not ptree - ... assert ptree.root() is not None - ... assert ptree.root() is ptree.parent().root() - ... assert ptree.root()[ptree.treeposition()] is ptree - ... else: - ... 
assert ptree.parent_index() is None - ... assert ptree.left_sibling() is None - ... assert ptree.right_sibling() is None - ... assert ptree.root() is ptree - ... assert ptree.treeposition() == () - ... # Check ptree's children's methods: - ... for i, child in enumerate(ptree): - ... if isinstance(child, Tree): - ... # pcheck parent() & parent_index() methods - ... assert child.parent() is ptree - ... assert child.parent_index() == i - ... # pcheck sibling methods - ... if i == 0: - ... assert child.left_sibling() is None - ... else: - ... assert child.left_sibling() is ptree[i-1] - ... if i == len(ptree)-1: - ... assert child.right_sibling() is None - ... else: - ... assert child.right_sibling() is ptree[i+1] - ... if print_ptrees: - ... print('ok!', end=' ') - ... for ptree in print_ptrees: print(ptree) - ... else: - ... print('ok!') - -Run our test function on a variety of newly-created trees: - - >>> pcheck(make_ptree('(A)')) - ok! (A ) - >>> pcheck(make_ptree('(A (B (C (D) (E f)) g) h)')) - ok! (A (B (C (D ) (E f)) g) h) - >>> pcheck(make_ptree('(A (B) (C c) (D d d) (E e e e))')) - ok! (A (B ) (C c) (D d d) (E e e e)) - >>> pcheck(make_ptree('(A (B) (C (c)) (D (d) (d)) (E (e) (e) (e)))')) - ok! (A (B ) (C (c )) (D (d ) (d )) (E (e ) (e ) (e ))) - -Run our test function after performing various tree-modification -operations: - -**__delitem__()** - - >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)') - >>> e = ptree[0,0,1] - >>> del ptree[0,0,1]; pcheck(ptree); pcheck(e) - ok! (A (B (C (D ) (Q p)) g) h) - ok! (E f) - >>> del ptree[0,0,0]; pcheck(ptree) - ok! (A (B (C (Q p)) g) h) - >>> del ptree[0,1]; pcheck(ptree) - ok! (A (B (C (Q p))) h) - >>> del ptree[-1]; pcheck(ptree) - ok! (A (B (C (Q p)))) - >>> del ptree[-100] - Traceback (most recent call last): - . . . - IndexError: index out of range - >>> del ptree[()] - Traceback (most recent call last): - . . . - IndexError: The tree position () may not be deleted. - - >>> # With slices: - >>> ptree = make_ptree('(A (B c) (D e) f g (H i) j (K l))') - >>> b = ptree[0] - >>> del ptree[0:0]; pcheck(ptree) - ok! (A (B c) (D e) f g (H i) j (K l)) - >>> del ptree[:1]; pcheck(ptree); pcheck(b) - ok! (A (D e) f g (H i) j (K l)) - ok! (B c) - >>> del ptree[-2:]; pcheck(ptree) - ok! (A (D e) f g (H i)) - >>> del ptree[1:3]; pcheck(ptree) - ok! (A (D e) (H i)) - >>> ptree = make_ptree('(A (B c) (D e) f g (H i) j (K l))') - >>> del ptree[5:1000]; pcheck(ptree) - ok! (A (B c) (D e) f g (H i)) - >>> del ptree[-2:1000]; pcheck(ptree) - ok! (A (B c) (D e) f) - >>> del ptree[-100:1]; pcheck(ptree) - ok! (A (D e) f) - >>> ptree = make_ptree('(A (B c) (D e) f g (H i) j (K l))') - >>> del ptree[1:-2:2]; pcheck(ptree) - ok! (A (B c) f (H i) j (K l)) - -**__setitem__()** - - >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)') - >>> d, e, q = ptree[0,0] - >>> ptree[0,0,0] = 'x'; pcheck(ptree); pcheck(d) - ok! (A (B (C x (E f) (Q p)) g) h) - ok! (D ) - >>> ptree[0,0,1] = make_ptree('(X (Y z))'); pcheck(ptree); pcheck(e) - ok! (A (B (C x (X (Y z)) (Q p)) g) h) - ok! (E f) - >>> ptree[1] = d; pcheck(ptree) - ok! (A (B (C x (X (Y z)) (Q p)) g) (D )) - >>> ptree[-1] = 'x'; pcheck(ptree) - ok! (A (B (C x (X (Y z)) (Q p)) g) x) - >>> ptree[-100] = 'y' - Traceback (most recent call last): - . . . - IndexError: index out of range - >>> ptree[()] = make_ptree('(X y)') - Traceback (most recent call last): - . . . - IndexError: The tree position () may not be assigned to. 
- - >>> # With slices: - >>> ptree = make_ptree('(A (B c) (D e) f g (H i) j (K l))') - >>> b = ptree[0] - >>> ptree[0:0] = ('x', make_ptree('(Y)')); pcheck(ptree) - ok! (A x (Y ) (B c) (D e) f g (H i) j (K l)) - >>> ptree[2:6] = (); pcheck(ptree); pcheck(b) - ok! (A x (Y ) (H i) j (K l)) - ok! (B c) - >>> ptree[-2:] = ('z', 'p'); pcheck(ptree) - ok! (A x (Y ) (H i) z p) - >>> ptree[1:3] = [make_ptree('(X)') for x in range(10)]; pcheck(ptree) - ok! (A x (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) z p) - >>> ptree[5:1000] = []; pcheck(ptree) - ok! (A x (X ) (X ) (X ) (X )) - >>> ptree[-2:1000] = ['n']; pcheck(ptree) - ok! (A x (X ) (X ) n) - >>> ptree[-100:1] = [make_ptree('(U v)')]; pcheck(ptree) - ok! (A (U v) (X ) (X ) n) - >>> ptree[-1:] = (make_ptree('(X)') for x in range(3)); pcheck(ptree) - ok! (A (U v) (X ) (X ) (X ) (X ) (X )) - >>> ptree[1:-2:2] = ['x', 'y']; pcheck(ptree) - ok! (A (U v) x (X ) y (X ) (X )) - -**append()** - - >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)') - >>> ptree.append('x'); pcheck(ptree) - ok! (A (B (C (D ) (E f) (Q p)) g) h x) - >>> ptree.append(make_ptree('(X (Y z))')); pcheck(ptree) - ok! (A (B (C (D ) (E f) (Q p)) g) h x (X (Y z))) - -**extend()** - - >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)') - >>> ptree.extend(['x', 'y', make_ptree('(X (Y z))')]); pcheck(ptree) - ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z))) - >>> ptree.extend([]); pcheck(ptree) - ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z))) - >>> ptree.extend(make_ptree('(X)') for x in range(3)); pcheck(ptree) - ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z)) (X ) (X ) (X )) - -**insert()** - - >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)') - >>> ptree.insert(0, make_ptree('(X (Y z))')); pcheck(ptree) - ok! (A (X (Y z)) (B (C (D ) (E f) (Q p)) g) h) - >>> ptree.insert(-1, make_ptree('(X (Y z))')); pcheck(ptree) - ok! (A (X (Y z)) (B (C (D ) (E f) (Q p)) g) (X (Y z)) h) - >>> ptree.insert(-4, make_ptree('(X (Y z))')); pcheck(ptree) - ok! (A (X (Y z)) (X (Y z)) (B (C (D ) (E f) (Q p)) g) (X (Y z)) h) - >>> # Note: as with ``list``, inserting at a negative index that - >>> # gives a position before the start of the list does *not* - >>> # raise an IndexError exception; it just inserts at 0. - >>> ptree.insert(-400, make_ptree('(X (Y z))')); pcheck(ptree) - ok! (A - (X (Y z)) - (X (Y z)) - (X (Y z)) - (B (C (D ) (E f) (Q p)) g) - (X (Y z)) - h) - -**pop()** - - >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)') - >>> ptree[0,0].pop(1); pcheck(ptree) - ParentedTree('E', ['f']) - ok! (A (B (C (D ) (Q p)) g) h) - >>> ptree[0].pop(-1); pcheck(ptree) - 'g' - ok! (A (B (C (D ) (Q p))) h) - >>> ptree.pop(); pcheck(ptree) - 'h' - ok! (A (B (C (D ) (Q p)))) - >>> ptree.pop(-100) - Traceback (most recent call last): - . . . - IndexError: index out of range - -**remove()** - - >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)') - >>> e = ptree[0,0,1] - >>> ptree[0,0].remove(ptree[0,0,1]); pcheck(ptree); pcheck(e) - ok! (A (B (C (D ) (Q p)) g) h) - ok! (E f) - >>> ptree[0,0].remove(make_ptree('(Q p)')); pcheck(ptree) - ok! (A (B (C (D )) g) h) - >>> ptree[0,0].remove(make_ptree('(Q p)')) - Traceback (most recent call last): - . . . - ValueError: ParentedTree('Q', ['p']) is not in list - >>> ptree.remove('h'); pcheck(ptree) - ok! (A (B (C (D )) g)) - >>> ptree.remove('h'); - Traceback (most recent call last): - . . . 
- ValueError: 'h' is not in list - >>> # remove() removes the first subtree that is equal (==) to the - >>> # given tree, which may not be the identical tree we give it: - >>> ptree = make_ptree('(A (X x) (Y y) (X x))') - >>> x1, y, x2 = ptree - >>> ptree.remove(ptree[-1]); pcheck(ptree) - ok! (A (Y y) (X x)) - >>> print(x1.parent()); pcheck(x1) - None - ok! (X x) - >>> print(x2.parent()) - (A (Y y) (X x)) - -Test that a tree can not be given multiple parents: - - >>> ptree = make_ptree('(A (X x) (Y y) (Z z))') - >>> ptree[0] = ptree[1] - Traceback (most recent call last): - . . . - ValueError: Can not insert a subtree that already has a parent. - >>> pcheck() - ok! - -[more to be written] - -Shallow copying can be tricky for Tree and several of its subclasses. -For shallow copies of Tree, only the root node is reconstructed, while -all the children are shared between the two trees. Modify the children -of one tree - and the shallowly copied tree will also update. - - >>> from nltk.tree import Tree, ParentedTree, MultiParentedTree - >>> tree = Tree.fromstring("(TOP (S (NP (NNP Bell,)) (NP (NP (DT a) (NN company)) (SBAR (WHNP (WDT which)) (S (VP (VBZ is) (VP (VBN based) (PP (IN in) (NP (NNP LA,)))))))) (VP (VBZ makes) (CC and) (VBZ distributes) (NP (NN computer))) (. products.)))") - >>> copy_tree = tree.copy(deep=False) - >>> tree == copy_tree # Ensure identical labels and nodes - True - >>> id(copy_tree[0]) == id(tree[0]) # Ensure shallow copy - the children are the same objects in memory - True - -For ParentedTree objects, this behaviour is not possible. With a shallow -copy, the children of the root node would be reused for both the original -and the shallow copy. For this to be possible, some children would need -to have multiple parents. As this is forbidden for ParentedTree objects, -attempting to make a shallow copy will cause a warning, and a deep copy -is made instead. - - >>> ptree = ParentedTree.fromstring("(TOP (S (NP (NNP Bell,)) (NP (NP (DT a) (NN company)) (SBAR (WHNP (WDT which)) (S (VP (VBZ is) (VP (VBN based) (PP (IN in) (NP (NNP LA,)))))))) (VP (VBZ makes) (CC and) (VBZ distributes) (NP (NN computer))) (. products.)))") - >>> copy_ptree = ptree.copy(deep=False) - >>> copy_ptree == ptree # Ensure identical labels and nodes - True - >>> id(copy_ptree[0]) != id(ptree[0]) # Shallow copying isn't supported - it defaults to deep copy. - True - -For MultiParentedTree objects, the issue of only allowing one parent that -can be seen for ParentedTree objects is no more. Shallow copying a -MultiParentedTree gives the children of the root node two parents: -the original and the newly copied root. - - >>> mptree = MultiParentedTree.fromstring("(TOP (S (NP (NNP Bell,)) (NP (NP (DT a) (NN company)) (SBAR (WHNP (WDT which)) (S (VP (VBZ is) (VP (VBN based) (PP (IN in) (NP (NNP LA,)))))))) (VP (VBZ makes) (CC and) (VBZ distributes) (NP (NN computer))) (. products.)))") - >>> len(mptree[0].parents()) - 1 - >>> copy_mptree = mptree.copy(deep=False) - >>> copy_mptree == mptree # Ensure identical labels and nodes - True - >>> len(mptree[0].parents()) - 2 - >>> len(copy_mptree[0].parents()) - 2 - -Shallow copying a MultiParentedTree is similar to creating a second root -which is identically labeled as the root on which the copy method was called. - - -ImmutableParentedTree Regression Tests --------------------------------------- - - >>> iptree = ImmutableParentedTree.convert(ptree) - >>> type(iptree) - - >>> del iptree[0] - Traceback (most recent call last): - . . . 
- ValueError: ImmutableParentedTree may not be modified - >>> iptree.set_label('newnode') - Traceback (most recent call last): - . . . - ValueError: ImmutableParentedTree may not be modified - - -MultiParentedTree Regression Tests ----------------------------------- -Keep track of all trees that we create (including subtrees) using this -variable: - - >>> all_mptrees = [] - -Define a helper function to create new parented trees: - - >>> def make_mptree(s): - ... mptree = MultiParentedTree.convert(Tree.fromstring(s)) - ... all_mptrees.extend(t for t in mptree.subtrees() - ... if isinstance(t, Tree)) - ... return mptree - -Define a test function that examines every subtree in all_mptrees; and -checks that all six of its methods are defined correctly. If any -mptrees are passed as arguments, then they are printed. - - >>> def mpcheck(*print_mptrees): - ... def has(seq, val): # uses identity comparison - ... for item in seq: - ... if item is val: return True - ... return False - ... for mptree in all_mptrees: - ... # Check mptree's methods. - ... if len(mptree.parents()) == 0: - ... assert len(mptree.left_siblings()) == 0 - ... assert len(mptree.right_siblings()) == 0 - ... assert len(mptree.roots()) == 1 - ... assert mptree.roots()[0] is mptree - ... assert mptree.treepositions(mptree) == [()] - ... left_siblings = right_siblings = () - ... roots = {id(mptree): 1} - ... else: - ... roots = dict((id(r), 0) for r in mptree.roots()) - ... left_siblings = mptree.left_siblings() - ... right_siblings = mptree.right_siblings() - ... for parent in mptree.parents(): - ... for i in mptree.parent_indices(parent): - ... assert parent[i] is mptree - ... # check left siblings - ... if i > 0: - ... for j in range(len(left_siblings)): - ... if left_siblings[j] is parent[i-1]: - ... del left_siblings[j] - ... break - ... else: - ... assert 0, 'sibling not found!' - ... # check ight siblings - ... if i < (len(parent)-1): - ... for j in range(len(right_siblings)): - ... if right_siblings[j] is parent[i+1]: - ... del right_siblings[j] - ... break - ... else: - ... assert 0, 'sibling not found!' - ... # check roots - ... for root in parent.roots(): - ... assert id(root) in roots, 'missing root' - ... roots[id(root)] += 1 - ... # check that we don't have any unexplained values - ... assert len(left_siblings)==0, 'unexpected sibling' - ... assert len(right_siblings)==0, 'unexpected sibling' - ... for v in roots.values(): assert v>0, roots #'unexpected root' - ... # check treepositions - ... for root in mptree.roots(): - ... for treepos in mptree.treepositions(root): - ... assert root[treepos] is mptree - ... # Check mptree's children's methods: - ... for i, child in enumerate(mptree): - ... if isinstance(child, Tree): - ... # mpcheck parent() & parent_index() methods - ... assert has(child.parents(), mptree) - ... assert i in child.parent_indices(mptree) - ... # mpcheck sibling methods - ... if i > 0: - ... assert has(child.left_siblings(), mptree[i-1]) - ... if i < len(mptree)-1: - ... assert has(child.right_siblings(), mptree[i+1]) - ... if print_mptrees: - ... print('ok!', end=' ') - ... for mptree in print_mptrees: print(mptree) - ... else: - ... print('ok!') - -Run our test function on a variety of newly-created trees: - - >>> mpcheck(make_mptree('(A)')) - ok! (A ) - >>> mpcheck(make_mptree('(A (B (C (D) (E f)) g) h)')) - ok! (A (B (C (D ) (E f)) g) h) - >>> mpcheck(make_mptree('(A (B) (C c) (D d d) (E e e e))')) - ok! 
(A (B ) (C c) (D d d) (E e e e)) - >>> mpcheck(make_mptree('(A (B) (C (c)) (D (d) (d)) (E (e) (e) (e)))')) - ok! (A (B ) (C (c )) (D (d ) (d )) (E (e ) (e ) (e ))) - >>> subtree = make_mptree('(A (B (C (D) (E f)) g) h)') - -Including some trees that contain multiple parents: - - >>> mpcheck(MultiParentedTree('Z', [subtree, subtree])) - ok! (Z (A (B (C (D ) (E f)) g) h) (A (B (C (D ) (E f)) g) h)) - -Run our test function after performing various tree-modification -operations (n.b., these are the same tests that we ran for -`ParentedTree`, above; thus, none of these trees actually *uses* -multiple parents.) - -**__delitem__()** - - >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)') - >>> e = mptree[0,0,1] - >>> del mptree[0,0,1]; mpcheck(mptree); mpcheck(e) - ok! (A (B (C (D ) (Q p)) g) h) - ok! (E f) - >>> del mptree[0,0,0]; mpcheck(mptree) - ok! (A (B (C (Q p)) g) h) - >>> del mptree[0,1]; mpcheck(mptree) - ok! (A (B (C (Q p))) h) - >>> del mptree[-1]; mpcheck(mptree) - ok! (A (B (C (Q p)))) - >>> del mptree[-100] - Traceback (most recent call last): - . . . - IndexError: index out of range - >>> del mptree[()] - Traceback (most recent call last): - . . . - IndexError: The tree position () may not be deleted. - - >>> # With slices: - >>> mptree = make_mptree('(A (B c) (D e) f g (H i) j (K l))') - >>> b = mptree[0] - >>> del mptree[0:0]; mpcheck(mptree) - ok! (A (B c) (D e) f g (H i) j (K l)) - >>> del mptree[:1]; mpcheck(mptree); mpcheck(b) - ok! (A (D e) f g (H i) j (K l)) - ok! (B c) - >>> del mptree[-2:]; mpcheck(mptree) - ok! (A (D e) f g (H i)) - >>> del mptree[1:3]; mpcheck(mptree) - ok! (A (D e) (H i)) - >>> mptree = make_mptree('(A (B c) (D e) f g (H i) j (K l))') - >>> del mptree[5:1000]; mpcheck(mptree) - ok! (A (B c) (D e) f g (H i)) - >>> del mptree[-2:1000]; mpcheck(mptree) - ok! (A (B c) (D e) f) - >>> del mptree[-100:1]; mpcheck(mptree) - ok! (A (D e) f) - >>> mptree = make_mptree('(A (B c) (D e) f g (H i) j (K l))') - >>> del mptree[1:-2:2]; mpcheck(mptree) - ok! (A (B c) f (H i) j (K l)) - -**__setitem__()** - - >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)') - >>> d, e, q = mptree[0,0] - >>> mptree[0,0,0] = 'x'; mpcheck(mptree); mpcheck(d) - ok! (A (B (C x (E f) (Q p)) g) h) - ok! (D ) - >>> mptree[0,0,1] = make_mptree('(X (Y z))'); mpcheck(mptree); mpcheck(e) - ok! (A (B (C x (X (Y z)) (Q p)) g) h) - ok! (E f) - >>> mptree[1] = d; mpcheck(mptree) - ok! (A (B (C x (X (Y z)) (Q p)) g) (D )) - >>> mptree[-1] = 'x'; mpcheck(mptree) - ok! (A (B (C x (X (Y z)) (Q p)) g) x) - >>> mptree[-100] = 'y' - Traceback (most recent call last): - . . . - IndexError: index out of range - >>> mptree[()] = make_mptree('(X y)') - Traceback (most recent call last): - . . . - IndexError: The tree position () may not be assigned to. - - >>> # With slices: - >>> mptree = make_mptree('(A (B c) (D e) f g (H i) j (K l))') - >>> b = mptree[0] - >>> mptree[0:0] = ('x', make_mptree('(Y)')); mpcheck(mptree) - ok! (A x (Y ) (B c) (D e) f g (H i) j (K l)) - >>> mptree[2:6] = (); mpcheck(mptree); mpcheck(b) - ok! (A x (Y ) (H i) j (K l)) - ok! (B c) - >>> mptree[-2:] = ('z', 'p'); mpcheck(mptree) - ok! (A x (Y ) (H i) z p) - >>> mptree[1:3] = [make_mptree('(X)') for x in range(10)]; mpcheck(mptree) - ok! (A x (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) z p) - >>> mptree[5:1000] = []; mpcheck(mptree) - ok! (A x (X ) (X ) (X ) (X )) - >>> mptree[-2:1000] = ['n']; mpcheck(mptree) - ok! (A x (X ) (X ) n) - >>> mptree[-100:1] = [make_mptree('(U v)')]; mpcheck(mptree) - ok! 
(A (U v) (X ) (X ) n) - >>> mptree[-1:] = (make_mptree('(X)') for x in range(3)); mpcheck(mptree) - ok! (A (U v) (X ) (X ) (X ) (X ) (X )) - >>> mptree[1:-2:2] = ['x', 'y']; mpcheck(mptree) - ok! (A (U v) x (X ) y (X ) (X )) - -**append()** - - >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)') - >>> mptree.append('x'); mpcheck(mptree) - ok! (A (B (C (D ) (E f) (Q p)) g) h x) - >>> mptree.append(make_mptree('(X (Y z))')); mpcheck(mptree) - ok! (A (B (C (D ) (E f) (Q p)) g) h x (X (Y z))) - -**extend()** - - >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)') - >>> mptree.extend(['x', 'y', make_mptree('(X (Y z))')]); mpcheck(mptree) - ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z))) - >>> mptree.extend([]); mpcheck(mptree) - ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z))) - >>> mptree.extend(make_mptree('(X)') for x in range(3)); mpcheck(mptree) - ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z)) (X ) (X ) (X )) - -**insert()** - - >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)') - >>> mptree.insert(0, make_mptree('(X (Y z))')); mpcheck(mptree) - ok! (A (X (Y z)) (B (C (D ) (E f) (Q p)) g) h) - >>> mptree.insert(-1, make_mptree('(X (Y z))')); mpcheck(mptree) - ok! (A (X (Y z)) (B (C (D ) (E f) (Q p)) g) (X (Y z)) h) - >>> mptree.insert(-4, make_mptree('(X (Y z))')); mpcheck(mptree) - ok! (A (X (Y z)) (X (Y z)) (B (C (D ) (E f) (Q p)) g) (X (Y z)) h) - >>> # Note: as with ``list``, inserting at a negative index that - >>> # gives a position before the start of the list does *not* - >>> # raise an IndexError exception; it just inserts at 0. - >>> mptree.insert(-400, make_mptree('(X (Y z))')); mpcheck(mptree) - ok! (A - (X (Y z)) - (X (Y z)) - (X (Y z)) - (B (C (D ) (E f) (Q p)) g) - (X (Y z)) - h) - -**pop()** - - >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)') - >>> mptree[0,0].pop(1); mpcheck(mptree) - MultiParentedTree('E', ['f']) - ok! (A (B (C (D ) (Q p)) g) h) - >>> mptree[0].pop(-1); mpcheck(mptree) - 'g' - ok! (A (B (C (D ) (Q p))) h) - >>> mptree.pop(); mpcheck(mptree) - 'h' - ok! (A (B (C (D ) (Q p)))) - >>> mptree.pop(-100) - Traceback (most recent call last): - . . . - IndexError: index out of range - -**remove()** - - >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)') - >>> e = mptree[0,0,1] - >>> mptree[0,0].remove(mptree[0,0,1]); mpcheck(mptree); mpcheck(e) - ok! (A (B (C (D ) (Q p)) g) h) - ok! (E f) - >>> mptree[0,0].remove(make_mptree('(Q p)')); mpcheck(mptree) - ok! (A (B (C (D )) g) h) - >>> mptree[0,0].remove(make_mptree('(Q p)')) - Traceback (most recent call last): - . . . - ValueError: MultiParentedTree('Q', ['p']) is not in list - >>> mptree.remove('h'); mpcheck(mptree) - ok! (A (B (C (D )) g)) - >>> mptree.remove('h'); - Traceback (most recent call last): - . . . - ValueError: 'h' is not in list - >>> # remove() removes the first subtree that is equal (==) to the - >>> # given tree, which may not be the identical tree we give it: - >>> mptree = make_mptree('(A (X x) (Y y) (X x))') - >>> x1, y, x2 = mptree - >>> mptree.remove(mptree[-1]); mpcheck(mptree) - ok! (A (Y y) (X x)) - >>> print([str(p) for p in x1.parents()]) - [] - >>> print([str(p) for p in x2.parents()]) - ['(A (Y y) (X x))'] - - -ImmutableMultiParentedTree Regression Tests -------------------------------------------- - - >>> imptree = ImmutableMultiParentedTree.convert(mptree) - >>> type(imptree) - - >>> del imptree[0] - Traceback (most recent call last): - . . . 
- ValueError: ImmutableMultiParentedTree may not be modified - >>> imptree.set_label('newnode') - Traceback (most recent call last): - . . . - ValueError: ImmutableMultiParentedTree may not be modified - - -ProbabilisticTree Regression Tests ----------------------------------- - - >>> prtree = ProbabilisticTree("S", [ProbabilisticTree("NP", ["N"], prob=0.3)], prob=0.6) - >>> print(prtree) - (S (NP N)) (p=0.6) - >>> import copy - >>> prtree == copy.deepcopy(prtree) == prtree.copy(deep=True) == prtree.copy() - True - >>> prtree[0] is prtree.copy()[0] - True - >>> prtree[0] is prtree.copy(deep=True)[0] - False - - >>> imprtree = ImmutableProbabilisticTree.convert(prtree) - >>> type(imprtree) - - >>> del imprtree[0] - Traceback (most recent call last): - . . . - ValueError: ImmutableProbabilisticTree may not be modified - >>> imprtree.set_label('newnode') - Traceback (most recent call last): - . . . - ValueError: ImmutableProbabilisticTree may not be modified - - -Squashed Bugs -============= - -This used to discard the ``(B b)`` subtree (fixed in svn 6270): - - >>> print(Tree.fromstring('((A a) (B b))')) - ( (A a) (B b)) - -Pickling ParentedTree instances didn't work for Python 3.7 onwards (See #2478) - - >>> import pickle - >>> tree = ParentedTree.fromstring('(S (NN x) (NP x) (NN x))') - >>> print(tree) - (S (NN x) (NP x) (NN x)) - - >>> pickled = pickle.dumps(tree) - >>> tree_loaded = pickle.loads(pickled) - >>> print(tree_loaded) - (S (NN x) (NP x) (NN x)) - -ParentedTree used to be impossible to (deep)copy. (See #1324) - - >>> from nltk.tree import ParentedTree - >>> import copy - >>> tree = ParentedTree.fromstring("(TOP (S (NP (NNP Bell,)) (NP (NP (DT a) (NN company)) (SBAR (WHNP (WDT which)) (S (VP (VBZ is) (VP (VBN based) (PP (IN in) (NP (NNP LA,)))))))) (VP (VBZ makes) (CC and) (VBZ distributes) (NP (NN computer))) (. products.)))") - >>> tree == copy.deepcopy(tree) == copy.copy(tree) == tree.copy(deep=True) == tree.copy() - True diff --git a/pipeline/nltk/test/treeprettyprinter.doctest b/pipeline/nltk/test/treeprettyprinter.doctest deleted file mode 100644 index b85c6d1e251d7e6e95a68fbeaf88eb8dd20fed00..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/treeprettyprinter.doctest +++ /dev/null @@ -1,177 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -========================================================= - Unit tests for nltk.tree.prettyprinter.TreePrettyPrinter -========================================================= - - >>> from nltk.tree import Tree, TreePrettyPrinter - -Tree nr 2170 from nltk.corpus.treebank: - - >>> tree = Tree.fromstring( - ... '(S (NP-SBJ (PRP I)) (VP (VBP feel) (ADJP-PRD (RB pretty) ' - ... '(JJ good)) (PP-CLR (IN about) (NP (PRP it)))) (. .))') - >>> tpp = TreePrettyPrinter(tree) - >>> print(tpp.text()) - S - __________________________|_____________________ - | VP | - | ____________________|___________ | - | | | PP-CLR | - | | | _____|_____ | - NP-SBJ | ADJP-PRD | NP | - | | _______|______ | | | - PRP VBP RB JJ IN PRP . - | | | | | | | - I feel pretty good about it . - - >>> print(tpp.text(unicodelines=True)) - S - ┌──────────────────────────┼─────────────────────┐ - │ VP │ - │ ┌─────────────┬──────┴───────────┐ │ - │ │ │ PP-CLR │ - │ │ │ ┌─────┴─────┐ │ - NP-SBJ │ ADJP-PRD │ NP │ - │ │ ┌───────┴──────┐ │ │ │ - PRP VBP RB JJ IN PRP . - │ │ │ │ │ │ │ - I feel pretty good about it . - -A tree with long labels: - - >>> tree = Tree.fromstring( - ... 
'(sentence (plural-noun-phrase (plural-noun Superconductors)) ' - ... '(verb-phrase (plural-verb conduct) ' - ... '(noun-phrase (singular-noun electricity))))') - >>> tpp = TreePrettyPrinter(tree) - >>> print(tpp.text(abbreviate=8, nodedist=2)) - sentence - __________|__________ - | verb-phr. - | __________|__________ - plural-n. | noun-phr. - | | | - plural-n. plural-v. singular. - | | | - Supercon. conduct electric. - - >>> print(tpp.text(maxwidth=8, nodedist=2)) - sentence - _________|________ - | verb- - | phrase - | ________|_________ - plural- | noun- - noun- | phrase - phrase | | - | | | - plural- plural- singular- - noun verb noun - | | | - Supercon conduct electric - ductors ity - -A discontinuous tree: - - >>> tree = Tree.fromstring( - ... '(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) (inf (verb 6) ' - ... '(conj (inf (pp (prep 2) (np (det 3) (noun 4))) (verb 7)) (inf (verb 9)) ' - ... '(vg 10) (inf (verb 11)))))) (punct 12))', read_leaf=int) - >>> sentence = ('Ze had met haar moeder kunnen gaan winkelen ,' - ... ' zwemmen of terrassen .'.split()) - >>> tpp = TreePrettyPrinter(tree, sentence) - >>> print(tpp.text()) - top - _____|______________________________________________ - smain | | - _______________________________|_____ | | - | | inf | | - | | _____|____ | | - | | | inf | | - | | | ____|_____ | | - | | | | conj | | - | | _____ | ___ | _________|______ | __________________ | - | | inf | | | | | | | - | | _________|_____ | ___ | _________ | | | | | - | | pp | | | | | | | | - | | ____|____ | | | | | | | | - | | | np | | | | inf | inf | - | | | ____|____ | | | | | | | | - noun verb prep det noun verb verb verb punct verb vg verb punct - | | | | | | | | | | | | | - Ze had met haar moeder kunnen gaan winkelen , zwemmen of terrassen . - - >>> print(tpp.text(unicodelines=True)) - top - ┌─────┴──────────────────┬───────────────────────────┐ - smain │ │ - ┌────┬──────────────────────────┴─────┐ │ │ - │ │ inf │ │ - │ │ ┌─────┴────┐ │ │ - │ │ │ inf │ │ - │ │ │ ┌────┴─────┐ │ │ - │ │ │ │ conj │ │ - │ │ ┌───── │ ─── │ ─────────┴────── │ ─────┬─────┬──────┐ │ - │ │ inf │ │ │ │ │ │ │ - │ │ ┌─────────┴───── │ ─── │ ─────────┐ │ │ │ │ │ - │ │ pp │ │ │ │ │ │ │ │ - │ │ ┌────┴────┐ │ │ │ │ │ │ │ │ - │ │ │ np │ │ │ │ inf │ inf │ - │ │ │ ┌────┴────┐ │ │ │ │ │ │ │ │ - noun verb prep det noun verb verb verb punct verb vg verb punct - │ │ │ │ │ │ │ │ │ │ │ │ │ - Ze had met haar moeder kunnen gaan winkelen , zwemmen of terrassen . 
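Editorial aside (not part of the deleted doctest): besides the ``text()`` renderer
exercised above, ``TreePrettyPrinter`` can also produce vector output. A minimal
sketch, assuming the ``svg()`` method that recent NLTK releases ship alongside
``text()`` and a hypothetical output path ``tree.svg``::

    from nltk.tree import Tree, TreePrettyPrinter

    # Toy discontinuous tree: when a sentence is supplied, leaves are word indices.
    tree = Tree.fromstring(
        '(S (NP (PRP 0)) (VP (VBP 1) (NP (NNS 3))) (ADVP (RB 2)))',
        read_leaf=int)
    sentence = 'I eat often apples'.split()

    tpp = TreePrettyPrinter(tree, sentence)
    # Unicode box-drawing rendering, as in the examples above.
    print(tpp.text(unicodelines=True))
    # Standalone SVG rendering of the same tree.
    with open('tree.svg', 'w', encoding='utf8') as out:
        out.write(tpp.svg())

The plain-text form stays readable in a terminal or doctest, while the SVG form
scales better for embedding in documentation.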
- -Importing TreePrettyPrinter ---------------------------- - -First of all, a simple tree will be constructed:: - - >>> from nltk.tree import Tree - >>> tree = Tree.fromstring('(S (NP Mary) (VP walks))') - -We'll use this sample tree to show that the method of importing `TreePrettyPrinter` work correctly: - -- Recommended:: - - >>> from nltk.tree import TreePrettyPrinter - >>> print(TreePrettyPrinter(tree).text()) - S - ____|____ - NP VP - | | - Mary walks - -- Alternative but valid options:: - - >>> from nltk import TreePrettyPrinter - >>> print(TreePrettyPrinter(tree).text()) - S - ____|____ - NP VP - | | - Mary walks - - >>> from nltk.tree.prettyprinter import TreePrettyPrinter - >>> print(TreePrettyPrinter(tree).text()) - S - ____|____ - NP VP - | | - Mary walks - -- Deprecated, do not use:: - - >>> from nltk.treeprettyprinter import TreePrettyPrinter - >>> print(TreePrettyPrinter(tree).text()) - S - ____|____ - NP VP - | | - Mary walks - - This method will throw a DeprecationWarning:: - - Import `TreePrettyPrinter` using `from nltk.tree import TreePrettyPrinter` instead. diff --git a/pipeline/nltk/test/treetransforms.doctest b/pipeline/nltk/test/treetransforms.doctest deleted file mode 100644 index a1ea0eb6b61914644d80170e50b87e1dd03c6413..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/treetransforms.doctest +++ /dev/null @@ -1,154 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -------------------------------------------- -Unit tests for the TreeTransformation class -------------------------------------------- - - >>> from copy import deepcopy - >>> from nltk.tree import Tree, collapse_unary, chomsky_normal_form, un_chomsky_normal_form - - >>> tree_string = "(TOP (S (S (VP (VBN Turned) (ADVP (RB loose)) (PP (IN in) (NP (NP (NNP Shane) (NNP Longman) (POS 's)) (NN trading) (NN room))))) (, ,) (NP (DT the) (NN yuppie) (NNS dealers)) (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))) (. .)))" - - >>> tree = Tree.fromstring(tree_string) - >>> print(tree) - (TOP - (S - (S - (VP - (VBN Turned) - (ADVP (RB loose)) - (PP - (IN in) - (NP - (NP (NNP Shane) (NNP Longman) (POS 's)) - (NN trading) - (NN room))))) - (, ,) - (NP (DT the) (NN yuppie) (NNS dealers)) - (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))) - (. .))) - -Make a copy of the original tree and collapse the subtrees with only one child - - >>> collapsedTree = deepcopy(tree) - >>> collapse_unary(collapsedTree) - >>> print(collapsedTree) - (TOP - (S - (S+VP - (VBN Turned) - (ADVP (RB loose)) - (PP - (IN in) - (NP - (NP (NNP Shane) (NNP Longman) (POS 's)) - (NN trading) - (NN room)))) - (, ,) - (NP (DT the) (NN yuppie) (NNS dealers)) - (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))) - (. .))) - - >>> collapsedTree2 = deepcopy(tree) - >>> collapse_unary(collapsedTree2, collapsePOS=True, collapseRoot=True) - >>> print(collapsedTree2) - (TOP+S - (S+VP - (VBN Turned) - (ADVP+RB loose) - (PP - (IN in) - (NP - (NP (NNP Shane) (NNP Longman) (POS 's)) - (NN trading) - (NN room)))) - (, ,) - (NP (DT the) (NN yuppie) (NNS dealers)) - (VP (AUX do) (NP (NP+RB little) (ADJP+RB right))) - (. .)) - -Convert the tree to Chomsky Normal Form i.e. each subtree has either two -subtree children or a single leaf value. This conversion can be performed -using either left- or right-factoring. 
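Editorial aside (not part of the original file): before the full Penn-Treebank-style
example below, a minimal standalone sketch of what the two factorings do to a single
node with more than two children, using only the helpers imported above::

    from copy import deepcopy
    from nltk.tree import Tree, chomsky_normal_form, un_chomsky_normal_form

    toy = Tree.fromstring('(S (A a) (B b) (C c) (D d))')

    right = deepcopy(toy)
    chomsky_normal_form(right, factor='right')
    # Right factoring nests to the right, introducing artificial nodes
    # such as S|<B-C-D> and S|<C-D>.
    print(right)

    left = deepcopy(toy)
    chomsky_normal_form(left, factor='left')
    # Left factoring nests to the left, introducing artificial nodes
    # such as S|<A-B-C> and S|<A-B>.
    print(left)

    # Either form can be undone, recovering the original tree.
    un_chomsky_normal_form(right)
    assert right == toy

The doctests below apply the same conversions to the full tree constructed above.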
- - >>> cnfTree = deepcopy(collapsedTree) - >>> chomsky_normal_form(cnfTree, factor='left') - >>> print(cnfTree) - (TOP - (S - (S| - (S| - (S| - (S+VP - (S+VP| (VBN Turned) (ADVP (RB loose))) - (PP - (IN in) - (NP - (NP| - (NP - (NP| (NNP Shane) (NNP Longman)) - (POS 's)) - (NN trading)) - (NN room)))) - (, ,)) - (NP (NP| (DT the) (NN yuppie)) (NNS dealers))) - (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right))))) - (. .))) - - >>> cnfTree = deepcopy(collapsedTree) - >>> chomsky_normal_form(cnfTree, factor='right') - >>> print(cnfTree) - (TOP - (S - (S+VP - (VBN Turned) - (S+VP| - (ADVP (RB loose)) - (PP - (IN in) - (NP - (NP (NNP Shane) (NP| (NNP Longman) (POS 's))) - (NP| (NN trading) (NN room)))))) - (S|<,-NP-VP-.> - (, ,) - (S| - (NP (DT the) (NP| (NN yuppie) (NNS dealers))) - (S| - (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))) - (. .)))))) - -Employ some Markov smoothing to make the artificial node labels a bit more -readable. See the treetransforms.py documentation for more details. - - >>> markovTree = deepcopy(collapsedTree) - >>> chomsky_normal_form(markovTree, horzMarkov=2, vertMarkov=1) - >>> print(markovTree) - (TOP - (S^ - (S+VP^ - (VBN Turned) - (S+VP|^ - (ADVP^ (RB loose)) - (PP^ - (IN in) - (NP^ - (NP^ - (NNP Shane) - (NP|^ (NNP Longman) (POS 's))) - (NP|^ (NN trading) (NN room)))))) - (S|<,-NP>^ - (, ,) - (S|^ - (NP^ (DT the) (NP|^ (NN yuppie) (NNS dealers))) - (S|^ - (VP^ - (AUX do) - (NP^ (NP^ (RB little)) (ADJP^ (RB right)))) - (. .)))))) - -Convert the transformed tree back to its original form - - >>> un_chomsky_normal_form(markovTree) - >>> tree == markovTree - True diff --git a/pipeline/nltk/test/unit/__init__.py b/pipeline/nltk/test/unit/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/pipeline/nltk/test/unit/__pycache__/__init__.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index 7c0ff713b8cf1633c30cc0519ca980a79d74e24d..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_aline.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_aline.cpython-39.pyc deleted file mode 100644 index 7ebfa60b094edd04323542cc07fd0ba0b07444b9..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_aline.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_bllip.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_bllip.cpython-39.pyc deleted file mode 100644 index 665695891592ef0c6008a7569a64c944f871a76a..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_bllip.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_brill.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_brill.cpython-39.pyc deleted file mode 100644 index 5424f9a1961a7c547687255e451cb440ffe83c8e..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_brill.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_cfd_mutation.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_cfd_mutation.cpython-39.pyc deleted file mode 100644 index b26fc2b0be78a107dabf770c1b4563a1fdd64620..0000000000000000000000000000000000000000 Binary files 
a/pipeline/nltk/test/unit/__pycache__/test_cfd_mutation.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_cfg2chomsky.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_cfg2chomsky.cpython-39.pyc deleted file mode 100644 index 713d32049b95cfb6c8ee203c9f81ae1f831d2a2b..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_cfg2chomsky.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_chunk.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_chunk.cpython-39.pyc deleted file mode 100644 index 92113b8708a462c3796ae836480c2b2e8133b2ef..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_chunk.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_classify.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_classify.cpython-39.pyc deleted file mode 100644 index 6ad799b8ecc7d60003e0a9ada59da27f03161923..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_classify.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_collocations.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_collocations.cpython-39.pyc deleted file mode 100644 index d840b0ba7d170ef882c57034fe158c26d181597b..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_collocations.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_concordance.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_concordance.cpython-39.pyc deleted file mode 100644 index f40e88091b781740082478b6346e21c4132ab632..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_concordance.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_corenlp.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_corenlp.cpython-39.pyc deleted file mode 100644 index fcd16cc2e6481df28ca1f72fb774fafacab133c8..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_corenlp.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_corpora.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_corpora.cpython-39.pyc deleted file mode 100644 index d4712ce3a36c016654f0f06c7def430ccc0428d0..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_corpora.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_corpus_views.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_corpus_views.cpython-39.pyc deleted file mode 100644 index a469524df81a5f3cb5ddbd8265058fc6339c3be6..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_corpus_views.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_data.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_data.cpython-39.pyc deleted file mode 100644 index 1407714142896e0c1b992290d8c98a2a3ce98e7e..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_data.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_disagreement.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_disagreement.cpython-39.pyc deleted file mode 100644 index 
2ab76ba8e5fd77335e6c4cdfc744bc6a4b6e707c..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_disagreement.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_distance.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_distance.cpython-39.pyc deleted file mode 100644 index 181d21fbc48a348233e6559ff2a6e96cf05c4d5b..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_distance.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_downloader.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_downloader.cpython-39.pyc deleted file mode 100644 index f6e4ade0e107216a92ef64c0bef87e9192e45d2a..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_downloader.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_freqdist.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_freqdist.cpython-39.pyc deleted file mode 100644 index 6a08126c4607a8ade939e5b16b45a72e1cbe5732..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_freqdist.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_hmm.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_hmm.cpython-39.pyc deleted file mode 100644 index b60065b477c3b4bc4fd9f2ced3be5ab5c0cc35ff..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_hmm.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_json2csv_corpus.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_json2csv_corpus.cpython-39.pyc deleted file mode 100644 index 08c8e998c74324702c95e634a7851d001c200541..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_json2csv_corpus.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_json_serialization.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_json_serialization.cpython-39.pyc deleted file mode 100644 index 79219de7f0fde0ceddf8a25d3df78272a751fb47..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_json_serialization.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_metrics.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_metrics.cpython-39.pyc deleted file mode 100644 index 479bf89cc98f533edd88dc4374fc0db5f6c82475..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_metrics.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_naivebayes.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_naivebayes.cpython-39.pyc deleted file mode 100644 index 85710d1db2fec23066d13f5604d125ffb7bd69fd..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_naivebayes.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_nombank.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_nombank.cpython-39.pyc deleted file mode 100644 index 366e47220c2791be321b5465e6532b7271df088a..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_nombank.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_pl196x.cpython-39.pyc 
b/pipeline/nltk/test/unit/__pycache__/test_pl196x.cpython-39.pyc deleted file mode 100644 index 1da46dd7573835e4b107baa567d99d893f3b3c48..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_pl196x.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_pos_tag.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_pos_tag.cpython-39.pyc deleted file mode 100644 index df42cc761bf7f5d908d137000146d71c7daf6066..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_pos_tag.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_ribes.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_ribes.cpython-39.pyc deleted file mode 100644 index ef9e28456e92b8483704f15d1d655e3de00c27df..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_ribes.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_rte_classify.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_rte_classify.cpython-39.pyc deleted file mode 100644 index 947622b1c022b6379f8d21626a9a7f39f2b4136b..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_rte_classify.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_seekable_unicode_stream_reader.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_seekable_unicode_stream_reader.cpython-39.pyc deleted file mode 100644 index 5a9c66ea915b5f4d3c2230d860752655b579b079..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_seekable_unicode_stream_reader.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_senna.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_senna.cpython-39.pyc deleted file mode 100644 index fe011cebcda51b0498acdd112486fe5c9b30d28a..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_senna.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_stem.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_stem.cpython-39.pyc deleted file mode 100644 index 2f9199cdbb1d8dae7a74b43a67eae80bc8fff96a..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_stem.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_tag.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_tag.cpython-39.pyc deleted file mode 100644 index e834e95ebd8b222cf139f00cdbe060011280ebd4..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_tag.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_tgrep.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_tgrep.cpython-39.pyc deleted file mode 100644 index ed734552939ec5bcc8c8f0ebfdc7cb1e777bc04c..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_tgrep.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_tokenize.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_tokenize.cpython-39.pyc deleted file mode 100644 index 17775d31ac74fb8bd869d37954628406c264d5ea..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_tokenize.cpython-39.pyc and /dev/null differ diff --git 
a/pipeline/nltk/test/unit/__pycache__/test_twitter_auth.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_twitter_auth.cpython-39.pyc deleted file mode 100644 index 8a1373d0ab7904cdb7910f78f585f44046f475b7..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_twitter_auth.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_util.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_util.cpython-39.pyc deleted file mode 100644 index 9d3d6571ac48c15f330d1df64ce168bc27f916fe..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_util.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/__pycache__/test_wordnet.cpython-39.pyc b/pipeline/nltk/test/unit/__pycache__/test_wordnet.cpython-39.pyc deleted file mode 100644 index 804537ac49810ac6eb868003d982b79a8e157e49..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/__pycache__/test_wordnet.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/lm/__init__.py b/pipeline/nltk/test/unit/lm/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/pipeline/nltk/test/unit/lm/__pycache__/__init__.cpython-39.pyc b/pipeline/nltk/test/unit/lm/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index 081fe29311e5874f8fa49c4b16a501a84f3ca569..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/lm/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/lm/__pycache__/test_counter.cpython-39.pyc b/pipeline/nltk/test/unit/lm/__pycache__/test_counter.cpython-39.pyc deleted file mode 100644 index 5cdb1a2317ad3a1d2541b20c95b25e4ea1fb13b2..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/lm/__pycache__/test_counter.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/lm/__pycache__/test_models.cpython-39.pyc b/pipeline/nltk/test/unit/lm/__pycache__/test_models.cpython-39.pyc deleted file mode 100644 index 0cfba0e2028e25fbad29e52aad24883331a43e3d..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/lm/__pycache__/test_models.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/lm/__pycache__/test_preprocessing.cpython-39.pyc b/pipeline/nltk/test/unit/lm/__pycache__/test_preprocessing.cpython-39.pyc deleted file mode 100644 index ef3e31d3c7fc09a3d6b0fa8b7a837aa119fcad3f..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/lm/__pycache__/test_preprocessing.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/lm/__pycache__/test_vocabulary.cpython-39.pyc b/pipeline/nltk/test/unit/lm/__pycache__/test_vocabulary.cpython-39.pyc deleted file mode 100644 index c11fbe9287e6abd17611bfb58ef07a66e9f1b24f..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/lm/__pycache__/test_vocabulary.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/lm/test_counter.py b/pipeline/nltk/test/unit/lm/test_counter.py deleted file mode 100644 index f28b361cb76121f76d633d709aca6b5e32acb14d..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/lm/test_counter.py +++ /dev/null @@ -1,116 +0,0 @@ -# Natural Language Toolkit: Language Model Unit Tests -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Ilia Kurenkov -# URL: -# For 
license information, see LICENSE.TXT - -import unittest - -import pytest - -from nltk import FreqDist -from nltk.lm import NgramCounter -from nltk.util import everygrams - - -class TestNgramCounter: - """Tests for NgramCounter that only involve lookup, no modification.""" - - @classmethod - def setup_class(self): - text = [list("abcd"), list("egdbe")] - self.trigram_counter = NgramCounter( - everygrams(sent, max_len=3) for sent in text - ) - self.bigram_counter = NgramCounter(everygrams(sent, max_len=2) for sent in text) - self.case = unittest.TestCase() - - def test_N(self): - assert self.bigram_counter.N() == 16 - assert self.trigram_counter.N() == 21 - - def test_counter_len_changes_with_lookup(self): - assert len(self.bigram_counter) == 2 - self.bigram_counter[50] - assert len(self.bigram_counter) == 3 - - def test_ngram_order_access_unigrams(self): - assert self.bigram_counter[1] == self.bigram_counter.unigrams - - def test_ngram_conditional_freqdist(self): - case = unittest.TestCase() - expected_trigram_contexts = [ - ("a", "b"), - ("b", "c"), - ("e", "g"), - ("g", "d"), - ("d", "b"), - ] - expected_bigram_contexts = [("a",), ("b",), ("d",), ("e",), ("c",), ("g",)] - - bigrams = self.trigram_counter[2] - trigrams = self.trigram_counter[3] - - self.case.assertCountEqual(expected_bigram_contexts, bigrams.conditions()) - self.case.assertCountEqual(expected_trigram_contexts, trigrams.conditions()) - - def test_bigram_counts_seen_ngrams(self): - assert self.bigram_counter[["a"]]["b"] == 1 - assert self.bigram_counter[["b"]]["c"] == 1 - - def test_bigram_counts_unseen_ngrams(self): - assert self.bigram_counter[["b"]]["z"] == 0 - - def test_unigram_counts_seen_words(self): - assert self.bigram_counter["b"] == 2 - - def test_unigram_counts_completely_unseen_words(self): - assert self.bigram_counter["z"] == 0 - - -class TestNgramCounterTraining: - @classmethod - def setup_class(self): - self.counter = NgramCounter() - self.case = unittest.TestCase() - - @pytest.mark.parametrize("case", ["", [], None]) - def test_empty_inputs(self, case): - test = NgramCounter(case) - assert 2 not in test - assert test[1] == FreqDist() - - def test_train_on_unigrams(self): - words = list("abcd") - counter = NgramCounter([[(w,) for w in words]]) - - assert not counter[3] - assert not counter[2] - self.case.assertCountEqual(words, counter[1].keys()) - - def test_train_on_illegal_sentences(self): - str_sent = ["Check", "this", "out", "!"] - list_sent = [["Check", "this"], ["this", "out"], ["out", "!"]] - - with pytest.raises(TypeError): - NgramCounter([str_sent]) - - with pytest.raises(TypeError): - NgramCounter([list_sent]) - - def test_train_on_bigrams(self): - bigram_sent = [("a", "b"), ("c", "d")] - counter = NgramCounter([bigram_sent]) - assert not bool(counter[3]) - - def test_train_on_mix(self): - mixed_sent = [("a", "b"), ("c", "d"), ("e", "f", "g"), ("h",)] - counter = NgramCounter([mixed_sent]) - unigrams = ["h"] - bigram_contexts = [("a",), ("c",)] - trigram_contexts = [("e", "f")] - - self.case.assertCountEqual(unigrams, counter[1].keys()) - self.case.assertCountEqual(bigram_contexts, counter[2].keys()) - self.case.assertCountEqual(trigram_contexts, counter[3].keys()) diff --git a/pipeline/nltk/test/unit/lm/test_models.py b/pipeline/nltk/test/unit/lm/test_models.py deleted file mode 100644 index c0649fcc255b8452ec7de20afd04d099bf8e644d..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/lm/test_models.py +++ /dev/null @@ -1,610 +0,0 @@ -# Natural Language Toolkit: Language Model 
Unit Tests -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Ilia Kurenkov -# URL: -# For license information, see LICENSE.TXT -import math -from operator import itemgetter - -import pytest - -from nltk.lm import ( - MLE, - AbsoluteDiscountingInterpolated, - KneserNeyInterpolated, - Laplace, - Lidstone, - StupidBackoff, - Vocabulary, - WittenBellInterpolated, -) -from nltk.lm.preprocessing import padded_everygrams - - -@pytest.fixture(scope="session") -def vocabulary(): - return Vocabulary(["a", "b", "c", "d", "z", "", ""], unk_cutoff=1) - - -@pytest.fixture(scope="session") -def training_data(): - return [["a", "b", "c", "d"], ["e", "g", "a", "d", "b", "e"]] - - -@pytest.fixture(scope="session") -def bigram_training_data(training_data): - return [list(padded_everygrams(2, sent)) for sent in training_data] - - -@pytest.fixture(scope="session") -def trigram_training_data(training_data): - return [list(padded_everygrams(3, sent)) for sent in training_data] - - -@pytest.fixture -def mle_bigram_model(vocabulary, bigram_training_data): - model = MLE(2, vocabulary=vocabulary) - model.fit(bigram_training_data) - return model - - -@pytest.mark.parametrize( - "word, context, expected_score", - [ - ("d", ["c"], 1), - # Unseen ngrams should yield 0 - ("d", ["e"], 0), - # Unigrams should also be 0 - ("z", None, 0), - # N unigrams = 14 - # count('a') = 2 - ("a", None, 2.0 / 14), - # count('y') = 3 - ("y", None, 3.0 / 14), - ], -) -def test_mle_bigram_scores(mle_bigram_model, word, context, expected_score): - assert pytest.approx(mle_bigram_model.score(word, context), 1e-4) == expected_score - - -def test_mle_bigram_logscore_for_zero_score(mle_bigram_model): - assert math.isinf(mle_bigram_model.logscore("d", ["e"])) - - -def test_mle_bigram_entropy_perplexity_seen(mle_bigram_model): - # ngrams seen during training - trained = [ - ("", "a"), - ("a", "b"), - ("b", ""), - ("", "a"), - ("a", "d"), - ("d", ""), - ] - # Ngram = Log score - # , a = -1 - # a, b = -1 - # b, UNK = -1 - # UNK, a = -1.585 - # a, d = -1 - # d, = -1 - # TOTAL logscores = -6.585 - # - AVG logscores = 1.0975 - H = 1.0975 - perplexity = 2.1398 - assert pytest.approx(mle_bigram_model.entropy(trained), 1e-4) == H - assert pytest.approx(mle_bigram_model.perplexity(trained), 1e-4) == perplexity - - -def test_mle_bigram_entropy_perplexity_unseen(mle_bigram_model): - # In MLE, even one unseen ngram should make entropy and perplexity infinite - untrained = [("", "a"), ("a", "c"), ("c", "d"), ("d", "")] - - assert math.isinf(mle_bigram_model.entropy(untrained)) - assert math.isinf(mle_bigram_model.perplexity(untrained)) - - -def test_mle_bigram_entropy_perplexity_unigrams(mle_bigram_model): - # word = score, log score - # = 0.1429, -2.8074 - # a = 0.1429, -2.8074 - # c = 0.0714, -3.8073 - # UNK = 0.2143, -2.2224 - # d = 0.1429, -2.8074 - # c = 0.0714, -3.8073 - # = 0.1429, -2.8074 - # TOTAL logscores = -21.6243 - # - AVG logscores = 3.0095 - H = 3.0095 - perplexity = 8.0529 - - text = [("",), ("a",), ("c",), ("-",), ("d",), ("c",), ("",)] - - assert pytest.approx(mle_bigram_model.entropy(text), 1e-4) == H - assert pytest.approx(mle_bigram_model.perplexity(text), 1e-4) == perplexity - - -@pytest.fixture -def mle_trigram_model(trigram_training_data, vocabulary): - model = MLE(order=3, vocabulary=vocabulary) - model.fit(trigram_training_data) - return model - - -@pytest.mark.parametrize( - "word, context, expected_score", - [ - # count(d | b, c) = 1 - # count(b, c) = 1 - ("d", ("b", "c"), 1), - # count(d | c) = 1 - # count(c) = 1 - ("d", 
["c"], 1), - # total number of tokens is 18, of which "a" occurred 2 times - ("a", None, 2.0 / 18), - # in vocabulary but unseen - ("z", None, 0), - # out of vocabulary should use "UNK" score - ("y", None, 3.0 / 18), - ], -) -def test_mle_trigram_scores(mle_trigram_model, word, context, expected_score): - assert pytest.approx(mle_trigram_model.score(word, context), 1e-4) == expected_score - - -@pytest.fixture -def lidstone_bigram_model(bigram_training_data, vocabulary): - model = Lidstone(0.1, order=2, vocabulary=vocabulary) - model.fit(bigram_training_data) - return model - - -@pytest.mark.parametrize( - "word, context, expected_score", - [ - # count(d | c) = 1 - # *count(d | c) = 1.1 - # Count(w | c for w in vocab) = 1 - # *Count(w | c for w in vocab) = 1.8 - ("d", ["c"], 1.1 / 1.8), - # Total unigrams: 14 - # Vocab size: 8 - # Denominator: 14 + 0.8 = 14.8 - # count("a") = 2 - # *count("a") = 2.1 - ("a", None, 2.1 / 14.8), - # in vocabulary but unseen - # count("z") = 0 - # *count("z") = 0.1 - ("z", None, 0.1 / 14.8), - # out of vocabulary should use "UNK" score - # count("") = 3 - # *count("") = 3.1 - ("y", None, 3.1 / 14.8), - ], -) -def test_lidstone_bigram_score(lidstone_bigram_model, word, context, expected_score): - assert ( - pytest.approx(lidstone_bigram_model.score(word, context), 1e-4) - == expected_score - ) - - -def test_lidstone_entropy_perplexity(lidstone_bigram_model): - text = [ - ("", "a"), - ("a", "c"), - ("c", ""), - ("", "d"), - ("d", "c"), - ("c", ""), - ] - # Unlike MLE this should be able to handle completely novel ngrams - # Ngram = score, log score - # , a = 0.3929, -1.3479 - # a, c = 0.0357, -4.8074 - # c, UNK = 0.0(5), -4.1699 - # UNK, d = 0.0263, -5.2479 - # d, c = 0.0357, -4.8074 - # c, = 0.0(5), -4.1699 - # TOTAL logscore: −24.5504 - # - AVG logscore: 4.0917 - H = 4.0917 - perplexity = 17.0504 - assert pytest.approx(lidstone_bigram_model.entropy(text), 1e-4) == H - assert pytest.approx(lidstone_bigram_model.perplexity(text), 1e-4) == perplexity - - -@pytest.fixture -def lidstone_trigram_model(trigram_training_data, vocabulary): - model = Lidstone(0.1, order=3, vocabulary=vocabulary) - model.fit(trigram_training_data) - return model - - -@pytest.mark.parametrize( - "word, context, expected_score", - [ - # Logic behind this is the same as for bigram model - ("d", ["c"], 1.1 / 1.8), - # if we choose a word that hasn't appeared after (b, c) - ("e", ["c"], 0.1 / 1.8), - # Trigram score now - ("d", ["b", "c"], 1.1 / 1.8), - ("e", ["b", "c"], 0.1 / 1.8), - ], -) -def test_lidstone_trigram_score(lidstone_trigram_model, word, context, expected_score): - assert ( - pytest.approx(lidstone_trigram_model.score(word, context), 1e-4) - == expected_score - ) - - -@pytest.fixture -def laplace_bigram_model(bigram_training_data, vocabulary): - model = Laplace(2, vocabulary=vocabulary) - model.fit(bigram_training_data) - return model - - -@pytest.mark.parametrize( - "word, context, expected_score", - [ - # basic sanity-check: - # count(d | c) = 1 - # *count(d | c) = 2 - # Count(w | c for w in vocab) = 1 - # *Count(w | c for w in vocab) = 9 - ("d", ["c"], 2.0 / 9), - # Total unigrams: 14 - # Vocab size: 8 - # Denominator: 14 + 8 = 22 - # count("a") = 2 - # *count("a") = 3 - ("a", None, 3.0 / 22), - # in vocabulary but unseen - # count("z") = 0 - # *count("z") = 1 - ("z", None, 1.0 / 22), - # out of vocabulary should use "UNK" score - # count("") = 3 - # *count("") = 4 - ("y", None, 4.0 / 22), - ], -) -def test_laplace_bigram_score(laplace_bigram_model, word, context, 
expected_score): - assert ( - pytest.approx(laplace_bigram_model.score(word, context), 1e-4) == expected_score - ) - - -def test_laplace_bigram_entropy_perplexity(laplace_bigram_model): - text = [ - ("", "a"), - ("a", "c"), - ("c", ""), - ("", "d"), - ("d", "c"), - ("c", ""), - ] - # Unlike MLE this should be able to handle completely novel ngrams - # Ngram = score, log score - # , a = 0.2, -2.3219 - # a, c = 0.1, -3.3219 - # c, UNK = 0.(1), -3.1699 - # UNK, d = 0.(09), 3.4594 - # d, c = 0.1 -3.3219 - # c, = 0.(1), -3.1699 - # Total logscores: −18.7651 - # - AVG logscores: 3.1275 - H = 3.1275 - perplexity = 8.7393 - assert pytest.approx(laplace_bigram_model.entropy(text), 1e-4) == H - assert pytest.approx(laplace_bigram_model.perplexity(text), 1e-4) == perplexity - - -def test_laplace_gamma(laplace_bigram_model): - assert laplace_bigram_model.gamma == 1 - - -@pytest.fixture -def wittenbell_trigram_model(trigram_training_data, vocabulary): - model = WittenBellInterpolated(3, vocabulary=vocabulary) - model.fit(trigram_training_data) - return model - - -@pytest.mark.parametrize( - "word, context, expected_score", - [ - # For unigram scores by default revert to regular MLE - # Total unigrams: 18 - # Vocab Size = 7 - # count('c'): 1 - ("c", None, 1.0 / 18), - # in vocabulary but unseen - # count("z") = 0 - ("z", None, 0 / 18), - # out of vocabulary should use "UNK" score - # count("") = 3 - ("y", None, 3.0 / 18), - # 2 words follow b and b occurred a total of 2 times - # gamma(['b']) = 2 / (2 + 2) = 0.5 - # mle.score('c', ['b']) = 0.5 - # mle('c') = 1 / 18 = 0.055 - # (1 - gamma) * mle + gamma * mle('c') ~= 0.27 + 0.055 - ("c", ["b"], (1 - 0.5) * 0.5 + 0.5 * 1 / 18), - # building on that, let's try 'a b c' as the trigram - # 1 word follows 'a b' and 'a b' occurred 1 time - # gamma(['a', 'b']) = 1 / (1 + 1) = 0.5 - # mle("c", ["a", "b"]) = 1 - ("c", ["a", "b"], (1 - 0.5) + 0.5 * ((1 - 0.5) * 0.5 + 0.5 * 1 / 18)), - # P(c|zb) - # The ngram 'zbc' was not seen, so we use P(c|b). See issue #2332. - ("c", ["z", "b"], ((1 - 0.5) * 0.5 + 0.5 * 1 / 18)), - ], -) -def test_wittenbell_trigram_score( - wittenbell_trigram_model, word, context, expected_score -): - assert ( - pytest.approx(wittenbell_trigram_model.score(word, context), 1e-4) - == expected_score - ) - - -############################################################################### -# Notation Explained # -############################################################################### -# For all subsequent calculations we use the following notation: -# 1. '*': Placeholder for any word/character. E.g. '*b' stands for -# all bigrams that end in 'b'. '*b*' stands for all trigrams that -# contain 'b' in the middle. -# 1. count(ngram): Count all instances (tokens) of an ngram. -# 1. unique(ngram): Count unique instances (types) of an ngram. - - -@pytest.fixture -def kneserney_trigram_model(trigram_training_data, vocabulary): - model = KneserNeyInterpolated(order=3, discount=0.75, vocabulary=vocabulary) - model.fit(trigram_training_data) - return model - - -@pytest.mark.parametrize( - "word, context, expected_score", - [ - # P(c) = count('*c') / unique('**') - # = 1 / 14 - ("c", None, 1.0 / 14), - # P(z) = count('*z') / unique('**') - # = 0 / 14 - # 'z' is in the vocabulary, but it was not seen during training. - ("z", None, 0.0 / 14), - # P(y) - # Out of vocabulary should use "UNK" score. 
- # P(y) = P(UNK) = count('*UNK') / unique('**') - ("y", None, 3 / 14), - # We start with P(c|b) - # P(c|b) = alpha('bc') + gamma('b') * P(c) - # alpha('bc') = max(unique('*bc') - discount, 0) / unique('*b*') - # = max(1 - 0.75, 0) / 2 - # = 0.125 - # gamma('b') = discount * unique('b*') / unique('*b*') - # = (0.75 * 2) / 2 - # = 0.75 - ("c", ["b"], (0.125 + 0.75 * (1 / 14))), - # Building on that, let's try P(c|ab). - # P(c|ab) = alpha('abc') + gamma('ab') * P(c|b) - # alpha('abc') = max(count('abc') - discount, 0) / count('ab*') - # = max(1 - 0.75, 0) / 1 - # = 0.25 - # gamma('ab') = (discount * unique('ab*')) / count('ab*') - # = 0.75 * 1 / 1 - ("c", ["a", "b"], 0.25 + 0.75 * (0.125 + 0.75 * (1 / 14))), - # P(c|zb) - # The ngram 'zbc' was not seen, so we use P(c|b). See issue #2332. - ("c", ["z", "b"], (0.125 + 0.75 * (1 / 14))), - ], -) -def test_kneserney_trigram_score( - kneserney_trigram_model, word, context, expected_score -): - assert ( - pytest.approx(kneserney_trigram_model.score(word, context), 1e-4) - == expected_score - ) - - -@pytest.fixture -def absolute_discounting_trigram_model(trigram_training_data, vocabulary): - model = AbsoluteDiscountingInterpolated(order=3, vocabulary=vocabulary) - model.fit(trigram_training_data) - return model - - -@pytest.mark.parametrize( - "word, context, expected_score", - [ - # For unigram scores revert to uniform - # P(c) = count('c') / count('**') - ("c", None, 1.0 / 18), - # in vocabulary but unseen - # count('z') = 0 - ("z", None, 0.0 / 18), - # out of vocabulary should use "UNK" score - # count('') = 3 - ("y", None, 3 / 18), - # P(c|b) = alpha('bc') + gamma('b') * P(c) - # alpha('bc') = max(count('bc') - discount, 0) / count('b*') - # = max(1 - 0.75, 0) / 2 - # = 0.125 - # gamma('b') = discount * unique('b*') / count('b*') - # = (0.75 * 2) / 2 - # = 0.75 - ("c", ["b"], (0.125 + 0.75 * (2 / 2) * (1 / 18))), - # Building on that, let's try P(c|ab). - # P(c|ab) = alpha('abc') + gamma('ab') * P(c|b) - # alpha('abc') = max(count('abc') - discount, 0) / count('ab*') - # = max(1 - 0.75, 0) / 1 - # = 0.25 - # gamma('ab') = (discount * unique('ab*')) / count('ab*') - # = 0.75 * 1 / 1 - ("c", ["a", "b"], 0.25 + 0.75 * (0.125 + 0.75 * (2 / 2) * (1 / 18))), - # P(c|zb) - # The ngram 'zbc' was not seen, so we use P(c|b). See issue #2332. 
- ("c", ["z", "b"], (0.125 + 0.75 * (2 / 2) * (1 / 18))), - ], -) -def test_absolute_discounting_trigram_score( - absolute_discounting_trigram_model, word, context, expected_score -): - assert ( - pytest.approx(absolute_discounting_trigram_model.score(word, context), 1e-4) - == expected_score - ) - - -@pytest.fixture -def stupid_backoff_trigram_model(trigram_training_data, vocabulary): - model = StupidBackoff(order=3, vocabulary=vocabulary) - model.fit(trigram_training_data) - return model - - -@pytest.mark.parametrize( - "word, context, expected_score", - [ - # For unigram scores revert to uniform - # total bigrams = 18 - ("c", None, 1.0 / 18), - # in vocabulary but unseen - # bigrams ending with z = 0 - ("z", None, 0.0 / 18), - # out of vocabulary should use "UNK" score - # count(''): 3 - ("y", None, 3 / 18), - # c follows 1 time out of 2 after b - ("c", ["b"], 1 / 2), - # c always follows ab - ("c", ["a", "b"], 1 / 1), - # The ngram 'z b c' was not seen, so we backoff to - # the score of the ngram 'b c' * smoothing factor - ("c", ["z", "b"], (0.4 * (1 / 2))), - ], -) -def test_stupid_backoff_trigram_score( - stupid_backoff_trigram_model, word, context, expected_score -): - assert ( - pytest.approx(stupid_backoff_trigram_model.score(word, context), 1e-4) - == expected_score - ) - - -############################################################################### -# Probability Distributions Should Sum up to Unity # -############################################################################### - - -@pytest.fixture(scope="session") -def kneserney_bigram_model(bigram_training_data, vocabulary): - model = KneserNeyInterpolated(order=2, vocabulary=vocabulary) - model.fit(bigram_training_data) - return model - - -@pytest.mark.parametrize( - "model_fixture", - [ - "mle_bigram_model", - "mle_trigram_model", - "lidstone_bigram_model", - "laplace_bigram_model", - "wittenbell_trigram_model", - "absolute_discounting_trigram_model", - "kneserney_bigram_model", - pytest.param( - "stupid_backoff_trigram_model", - marks=pytest.mark.xfail( - reason="Stupid Backoff is not a valid distribution" - ), - ), - ], -) -@pytest.mark.parametrize( - "context", - [("a",), ("c",), ("",), ("b",), ("",), ("d",), ("e",), ("r",), ("w",)], - ids=itemgetter(0), -) -def test_sums_to_1(model_fixture, context, request): - model = request.getfixturevalue(model_fixture) - scores_for_context = sum(model.score(w, context) for w in model.vocab) - assert pytest.approx(scores_for_context, 1e-7) == 1.0 - - -############################################################################### -# Generating Text # -############################################################################### - - -def test_generate_one_no_context(mle_trigram_model): - assert mle_trigram_model.generate(random_seed=3) == "" - - -def test_generate_one_from_limiting_context(mle_trigram_model): - # We don't need random_seed for contexts with only one continuation - assert mle_trigram_model.generate(text_seed=["c"]) == "d" - assert mle_trigram_model.generate(text_seed=["b", "c"]) == "d" - assert mle_trigram_model.generate(text_seed=["a", "c"]) == "d" - - -def test_generate_one_from_varied_context(mle_trigram_model): - # When context doesn't limit our options enough, seed the random choice - assert mle_trigram_model.generate(text_seed=("a", ""), random_seed=2) == "a" - - -def test_generate_cycle(mle_trigram_model): - # Add a cycle to the model: bd -> b, db -> d - more_training_text = [padded_everygrams(mle_trigram_model.order, list("bdbdbd"))] - - 
mle_trigram_model.fit(more_training_text) - # Test that we can escape the cycle - assert mle_trigram_model.generate(7, text_seed=("b", "d"), random_seed=5) == [ - "b", - "d", - "b", - "d", - "b", - "d", - "", - ] - - -def test_generate_with_text_seed(mle_trigram_model): - assert mle_trigram_model.generate(5, text_seed=("", "e"), random_seed=3) == [ - "", - "a", - "d", - "b", - "", - ] - - -def test_generate_oov_text_seed(mle_trigram_model): - assert mle_trigram_model.generate( - text_seed=("aliens",), random_seed=3 - ) == mle_trigram_model.generate(text_seed=("",), random_seed=3) - - -def test_generate_None_text_seed(mle_trigram_model): - # should crash with type error when we try to look it up in vocabulary - with pytest.raises(TypeError): - mle_trigram_model.generate(text_seed=(None,)) - - # This will work - assert mle_trigram_model.generate( - text_seed=None, random_seed=3 - ) == mle_trigram_model.generate(random_seed=3) diff --git a/pipeline/nltk/test/unit/lm/test_preprocessing.py b/pipeline/nltk/test/unit/lm/test_preprocessing.py deleted file mode 100644 index e517a83266fce7c30e2a18c9d0a52a0e1cd1fdfc..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/lm/test_preprocessing.py +++ /dev/null @@ -1,30 +0,0 @@ -# Natural Language Toolkit: Language Model Unit Tests -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Ilia Kurenkov -# URL: -# For license information, see LICENSE.TXT -import unittest - -from nltk.lm.preprocessing import padded_everygram_pipeline - - -class TestPreprocessing(unittest.TestCase): - def test_padded_everygram_pipeline(self): - expected_train = [ - [ - ("",), - ("", "a"), - ("a",), - ("a", "b"), - ("b",), - ("b", "c"), - ("c",), - ("c", ""), - ("",), - ] - ] - expected_vocab = ["", "a", "b", "c", ""] - train_data, vocab_data = padded_everygram_pipeline(2, [["a", "b", "c"]]) - self.assertEqual([list(sent) for sent in train_data], expected_train) - self.assertEqual(list(vocab_data), expected_vocab) diff --git a/pipeline/nltk/test/unit/lm/test_vocabulary.py b/pipeline/nltk/test/unit/lm/test_vocabulary.py deleted file mode 100644 index 39249454f144912d6715b8a396de2caa9619ae18..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/lm/test_vocabulary.py +++ /dev/null @@ -1,156 +0,0 @@ -# Natural Language Toolkit: Language Model Unit Tests -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Ilia Kurenkov -# URL: -# For license information, see LICENSE.TXT - -import unittest -from collections import Counter -from timeit import timeit - -from nltk.lm import Vocabulary - - -class NgramModelVocabularyTests(unittest.TestCase): - """tests Vocabulary Class""" - - @classmethod - def setUpClass(cls): - cls.vocab = Vocabulary( - ["z", "a", "b", "c", "f", "d", "e", "g", "a", "d", "b", "e", "w"], - unk_cutoff=2, - ) - - def test_truthiness(self): - self.assertTrue(self.vocab) - - def test_cutoff_value_set_correctly(self): - self.assertEqual(self.vocab.cutoff, 2) - - def test_unable_to_change_cutoff(self): - with self.assertRaises(AttributeError): - self.vocab.cutoff = 3 - - def test_cutoff_setter_checks_value(self): - with self.assertRaises(ValueError) as exc_info: - Vocabulary("abc", unk_cutoff=0) - expected_error_msg = "Cutoff value cannot be less than 1. 
Got: 0" - self.assertEqual(expected_error_msg, str(exc_info.exception)) - - def test_counts_set_correctly(self): - self.assertEqual(self.vocab.counts["a"], 2) - self.assertEqual(self.vocab.counts["b"], 2) - self.assertEqual(self.vocab.counts["c"], 1) - - def test_membership_check_respects_cutoff(self): - # a was seen 2 times, so it should be considered part of the vocabulary - self.assertTrue("a" in self.vocab) - # "c" was seen once, it shouldn't be considered part of the vocab - self.assertFalse("c" in self.vocab) - # "z" was never seen at all, also shouldn't be considered in the vocab - self.assertFalse("z" in self.vocab) - - def test_vocab_len_respects_cutoff(self): - # Vocab size is the number of unique tokens that occur at least as often - # as the cutoff value, plus 1 to account for unknown words. - self.assertEqual(5, len(self.vocab)) - - def test_vocab_iter_respects_cutoff(self): - vocab_counts = ["a", "b", "c", "d", "e", "f", "g", "w", "z"] - vocab_items = ["a", "b", "d", "e", ""] - - self.assertCountEqual(vocab_counts, list(self.vocab.counts.keys())) - self.assertCountEqual(vocab_items, list(self.vocab)) - - def test_update_empty_vocab(self): - empty = Vocabulary(unk_cutoff=2) - self.assertEqual(len(empty), 0) - self.assertFalse(empty) - self.assertIn(empty.unk_label, empty) - - empty.update(list("abcde")) - self.assertIn(empty.unk_label, empty) - - def test_lookup(self): - self.assertEqual(self.vocab.lookup("a"), "a") - self.assertEqual(self.vocab.lookup("c"), "") - - def test_lookup_iterables(self): - self.assertEqual(self.vocab.lookup(["a", "b"]), ("a", "b")) - self.assertEqual(self.vocab.lookup(("a", "b")), ("a", "b")) - self.assertEqual(self.vocab.lookup(("a", "c")), ("a", "")) - self.assertEqual( - self.vocab.lookup(map(str, range(3))), ("", "", "") - ) - - def test_lookup_empty_iterables(self): - self.assertEqual(self.vocab.lookup(()), ()) - self.assertEqual(self.vocab.lookup([]), ()) - self.assertEqual(self.vocab.lookup(iter([])), ()) - self.assertEqual(self.vocab.lookup(n for n in range(0, 0)), ()) - - def test_lookup_recursive(self): - self.assertEqual( - self.vocab.lookup([["a", "b"], ["a", "c"]]), (("a", "b"), ("a", "")) - ) - self.assertEqual(self.vocab.lookup([["a", "b"], "c"]), (("a", "b"), "")) - self.assertEqual(self.vocab.lookup([[[[["a", "b"]]]]]), ((((("a", "b"),),),),)) - - def test_lookup_None(self): - with self.assertRaises(TypeError): - self.vocab.lookup(None) - with self.assertRaises(TypeError): - list(self.vocab.lookup([None, None])) - - def test_lookup_int(self): - with self.assertRaises(TypeError): - self.vocab.lookup(1) - with self.assertRaises(TypeError): - list(self.vocab.lookup([1, 2])) - - def test_lookup_empty_str(self): - self.assertEqual(self.vocab.lookup(""), "") - - def test_eqality(self): - v1 = Vocabulary(["a", "b", "c"], unk_cutoff=1) - v2 = Vocabulary(["a", "b", "c"], unk_cutoff=1) - v3 = Vocabulary(["a", "b", "c"], unk_cutoff=1, unk_label="blah") - v4 = Vocabulary(["a", "b"], unk_cutoff=1) - - self.assertEqual(v1, v2) - self.assertNotEqual(v1, v3) - self.assertNotEqual(v1, v4) - - def test_str(self): - self.assertEqual( - str(self.vocab), "" - ) - - def test_creation_with_counter(self): - self.assertEqual( - self.vocab, - Vocabulary( - Counter( - ["z", "a", "b", "c", "f", "d", "e", "g", "a", "d", "b", "e", "w"] - ), - unk_cutoff=2, - ), - ) - - @unittest.skip( - reason="Test is known to be flaky as it compares (runtime) performance." - ) - def test_len_is_constant(self): - # Given an obviously small and an obviously large vocabulary. 
- small_vocab = Vocabulary("abcde") - from nltk.corpus.europarl_raw import english - - large_vocab = Vocabulary(english.words()) - - # If we time calling `len` on them. - small_vocab_len_time = timeit("len(small_vocab)", globals=locals()) - large_vocab_len_time = timeit("len(large_vocab)", globals=locals()) - - # The timing should be the same order of magnitude. - self.assertAlmostEqual(small_vocab_len_time, large_vocab_len_time, places=1) diff --git a/pipeline/nltk/test/unit/test_aline.py b/pipeline/nltk/test/unit/test_aline.py deleted file mode 100644 index 68cb55f74809ac3cb53e8dfd56be8706a656f0fb..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_aline.py +++ /dev/null @@ -1,48 +0,0 @@ -""" -Test Aline algorithm for aligning phonetic sequences -""" -from nltk.metrics import aline - - -def test_aline(): - result = aline.align("θin", "tenwis") - expected = [[("θ", "t"), ("i", "e"), ("n", "n")]] - - assert result == expected - - result = aline.align("jo", "ʒə") - expected = [[("j", "ʒ"), ("o", "ə")]] - - assert result == expected - - result = aline.align("pematesiweni", "pematesewen") - expected = [ - [ - ("p", "p"), - ("e", "e"), - ("m", "m"), - ("a", "a"), - ("t", "t"), - ("e", "e"), - ("s", "s"), - ("i", "e"), - ("w", "w"), - ("e", "e"), - ("n", "n"), - ] - ] - - assert result == expected - - result = aline.align("tuwθ", "dentis") - expected = [[("t", "t"), ("u", "i"), ("w", "-"), ("θ", "s")]] - - assert result == expected - - -def test_aline_delta(): - """ - Test aline for computing the difference between two segments - """ - assert aline.delta("p", "q") == 20.0 - assert aline.delta("a", "A") == 0.0 diff --git a/pipeline/nltk/test/unit/test_bllip.py b/pipeline/nltk/test/unit/test_bllip.py deleted file mode 100644 index b134dd0dd1a217ecff53309614cd96529cc407e9..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_bllip.py +++ /dev/null @@ -1,42 +0,0 @@ -import pytest - -from nltk.data import find -from nltk.parse.bllip import BllipParser -from nltk.tree import Tree - - -@pytest.fixture(scope="module") -def parser(): - model_dir = find("models/bllip_wsj_no_aux").path - return BllipParser.from_unified_model_dir(model_dir) - - -def setup_module(): - pytest.importorskip("bllipparser") - - -class TestBllipParser: - def test_parser_loads_a_valid_tree(self, parser): - parsed = parser.parse("I saw the man with the telescope") - tree = next(parsed) - - assert isinstance(tree, Tree) - assert ( - tree.pformat() - == """ -(S1 - (S - (NP (PRP I)) - (VP - (VBD saw) - (NP (DT the) (NN man)) - (PP (IN with) (NP (DT the) (NN telescope)))))) -""".strip() - ) - - def test_tagged_parse_finds_matching_element(self, parser): - parsed = parser.parse("I saw the man with the telescope") - tagged_tree = next(parser.tagged_parse([("telescope", "NN")])) - - assert isinstance(tagged_tree, Tree) - assert tagged_tree.pformat() == "(S1 (NP (NN telescope)))" diff --git a/pipeline/nltk/test/unit/test_brill.py b/pipeline/nltk/test/unit/test_brill.py deleted file mode 100644 index cea8a854ea27b37bd9cadb4493e4dfc4ddb46cf5..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_brill.py +++ /dev/null @@ -1,34 +0,0 @@ -""" -Tests for Brill tagger. 
-""" - -import unittest - -from nltk.corpus import treebank -from nltk.tag import UnigramTagger, brill, brill_trainer -from nltk.tbl import demo - - -class TestBrill(unittest.TestCase): - def test_pos_template(self): - train_sents = treebank.tagged_sents()[:1000] - tagger = UnigramTagger(train_sents) - trainer = brill_trainer.BrillTaggerTrainer( - tagger, [brill.Template(brill.Pos([-1]))] - ) - brill_tagger = trainer.train(train_sents) - # Example from https://github.com/nltk/nltk/issues/769 - result = brill_tagger.tag("This is a foo bar sentence".split()) - expected = [ - ("This", "DT"), - ("is", "VBZ"), - ("a", "DT"), - ("foo", None), - ("bar", "NN"), - ("sentence", None), - ] - self.assertEqual(result, expected) - - @unittest.skip("Should be tested in __main__ of nltk.tbl.demo") - def test_brill_demo(self): - demo() diff --git a/pipeline/nltk/test/unit/test_cfd_mutation.py b/pipeline/nltk/test/unit/test_cfd_mutation.py deleted file mode 100644 index 8952f1f35fe7f78d96b92cc4c377dbd1d387aaf0..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_cfd_mutation.py +++ /dev/null @@ -1,39 +0,0 @@ -import unittest - -import pytest - -from nltk import ConditionalFreqDist, tokenize - - -class TestEmptyCondFreq(unittest.TestCase): - def test_tabulate(self): - empty = ConditionalFreqDist() - self.assertEqual(empty.conditions(), []) - with pytest.raises(ValueError): - empty.tabulate(conditions="BUG") # nonexistent keys shouldn't be added - self.assertEqual(empty.conditions(), []) - - def test_plot(self): - empty = ConditionalFreqDist() - self.assertEqual(empty.conditions(), []) - empty.plot(conditions=["BUG"]) # nonexistent keys shouldn't be added - self.assertEqual(empty.conditions(), []) - - def test_increment(self): - # make sure that we can still mutate cfd normally - text = "cow cat mouse cat tiger" - cfd = ConditionalFreqDist() - - # create cfd with word length as condition - for word in tokenize.word_tokenize(text): - condition = len(word) - cfd[condition][word] += 1 - - self.assertEqual(cfd.conditions(), [3, 5]) - - # incrementing previously unseen key is still possible - cfd[2]["hi"] += 1 - self.assertCountEqual(cfd.conditions(), [3, 5, 2]) # new condition added - self.assertEqual( - cfd[2]["hi"], 1 - ) # key's frequency incremented from 0 (unseen) to 1 diff --git a/pipeline/nltk/test/unit/test_cfg2chomsky.py b/pipeline/nltk/test/unit/test_cfg2chomsky.py deleted file mode 100644 index 1a9f24d245d5c9dfd4c4d507237651407d2cc444..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_cfg2chomsky.py +++ /dev/null @@ -1,49 +0,0 @@ -import unittest - -import nltk -from nltk.grammar import CFG - - -class ChomskyNormalFormForCFGTest(unittest.TestCase): - def test_simple(self): - grammar = CFG.fromstring( - """ - S -> NP VP - PP -> P NP - NP -> Det N | NP PP P - VP -> V NP | VP PP - VP -> Det - Det -> 'a' | 'the' - N -> 'dog' | 'cat' - V -> 'chased' | 'sat' - P -> 'on' | 'in' - """ - ) - self.assertFalse(grammar.is_flexible_chomsky_normal_form()) - self.assertFalse(grammar.is_chomsky_normal_form()) - grammar = grammar.chomsky_normal_form(flexible=True) - self.assertTrue(grammar.is_flexible_chomsky_normal_form()) - self.assertFalse(grammar.is_chomsky_normal_form()) - - grammar2 = CFG.fromstring( - """ - S -> NP VP - NP -> VP N P - VP -> P - N -> 'dog' | 'cat' - P -> 'on' | 'in' - """ - ) - self.assertFalse(grammar2.is_flexible_chomsky_normal_form()) - self.assertFalse(grammar2.is_chomsky_normal_form()) - grammar2 = grammar2.chomsky_normal_form() - 
self.assertTrue(grammar2.is_flexible_chomsky_normal_form()) - self.assertTrue(grammar2.is_chomsky_normal_form()) - - def test_complex(self): - grammar = nltk.data.load("grammars/large_grammars/atis.cfg") - self.assertFalse(grammar.is_flexible_chomsky_normal_form()) - self.assertFalse(grammar.is_chomsky_normal_form()) - grammar = grammar.chomsky_normal_form(flexible=True) - self.assertTrue(grammar.is_flexible_chomsky_normal_form()) - self.assertFalse(grammar.is_chomsky_normal_form()) diff --git a/pipeline/nltk/test/unit/test_chunk.py b/pipeline/nltk/test/unit/test_chunk.py deleted file mode 100644 index 60b56317f2b5cae224b906f0c71458144a74f6a8..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_chunk.py +++ /dev/null @@ -1,85 +0,0 @@ -import unittest - -from nltk import RegexpParser - - -class TestChunkRule(unittest.TestCase): - def test_tag_pattern2re_pattern_quantifier(self): - """Test for bug https://github.com/nltk/nltk/issues/1597 - - Ensures that curly bracket quantifiers can be used inside a chunk rule. - This type of quantifier has been used for the supplementary example - in https://www.nltk.org/book/ch07.html#exploring-text-corpora. - """ - sent = [ - ("The", "AT"), - ("September-October", "NP"), - ("term", "NN"), - ("jury", "NN"), - ("had", "HVD"), - ("been", "BEN"), - ("charged", "VBN"), - ("by", "IN"), - ("Fulton", "NP-TL"), - ("Superior", "JJ-TL"), - ("Court", "NN-TL"), - ("Judge", "NN-TL"), - ("Durwood", "NP"), - ("Pye", "NP"), - ("to", "TO"), - ("investigate", "VB"), - ("reports", "NNS"), - ("of", "IN"), - ("possible", "JJ"), - ("``", "``"), - ("irregularities", "NNS"), - ("''", "''"), - ("in", "IN"), - ("the", "AT"), - ("hard-fought", "JJ"), - ("primary", "NN"), - ("which", "WDT"), - ("was", "BEDZ"), - ("won", "VBN"), - ("by", "IN"), - ("Mayor-nominate", "NN-TL"), - ("Ivan", "NP"), - ("Allen", "NP"), - ("Jr.", "NP"), - (".", "."), - ] # source: brown corpus - cp = RegexpParser("CHUNK: {{4,}}") - tree = cp.parse(sent) - assert ( - tree.pformat() - == """(S - The/AT - September-October/NP - term/NN - jury/NN - had/HVD - been/BEN - charged/VBN - by/IN - Fulton/NP-TL - Superior/JJ-TL - (CHUNK Court/NN-TL Judge/NN-TL Durwood/NP Pye/NP) - to/TO - investigate/VB - reports/NNS - of/IN - possible/JJ - ``/`` - irregularities/NNS - ''/'' - in/IN - the/AT - hard-fought/JJ - primary/NN - which/WDT - was/BEDZ - won/VBN - by/IN - (CHUNK Mayor-nominate/NN-TL Ivan/NP Allen/NP Jr./NP) - ./.)""" - ) diff --git a/pipeline/nltk/test/unit/test_classify.py b/pipeline/nltk/test/unit/test_classify.py deleted file mode 100644 index 4e21a6cf4aa119e6696a9d2ba618ff08220b0fac..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_classify.py +++ /dev/null @@ -1,49 +0,0 @@ -""" -Unit tests for nltk.classify. 
See also: nltk/test/classify.doctest -""" -import pytest - -from nltk import classify - -TRAIN = [ - (dict(a=1, b=1, c=1), "y"), - (dict(a=1, b=1, c=1), "x"), - (dict(a=1, b=1, c=0), "y"), - (dict(a=0, b=1, c=1), "x"), - (dict(a=0, b=1, c=1), "y"), - (dict(a=0, b=0, c=1), "y"), - (dict(a=0, b=1, c=0), "x"), - (dict(a=0, b=0, c=0), "x"), - (dict(a=0, b=1, c=1), "y"), -] - -TEST = [ - (dict(a=1, b=0, c=1)), # unseen - (dict(a=1, b=0, c=0)), # unseen - (dict(a=0, b=1, c=1)), # seen 3 times, labels=y,y,x - (dict(a=0, b=1, c=0)), # seen 1 time, label=x -] - -RESULTS = [(0.16, 0.84), (0.46, 0.54), (0.41, 0.59), (0.76, 0.24)] - - -def assert_classifier_correct(algorithm): - try: - classifier = classify.MaxentClassifier.train( - TRAIN, algorithm, trace=0, max_iter=1000 - ) - except (LookupError, AttributeError) as e: - pytest.skip(str(e)) - - for (px, py), featureset in zip(RESULTS, TEST): - pdist = classifier.prob_classify(featureset) - assert abs(pdist.prob("x") - px) < 1e-2, (pdist.prob("x"), px) - assert abs(pdist.prob("y") - py) < 1e-2, (pdist.prob("y"), py) - - -def test_megam(): - assert_classifier_correct("MEGAM") - - -def test_tadm(): - assert_classifier_correct("TADM") diff --git a/pipeline/nltk/test/unit/test_collocations.py b/pipeline/nltk/test/unit/test_collocations.py deleted file mode 100644 index 2351c61f42942f497d66cbcd4ac2fa845433c2db..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_collocations.py +++ /dev/null @@ -1,120 +0,0 @@ -from nltk.collocations import BigramCollocationFinder -from nltk.metrics import BigramAssocMeasures - -## Test bigram counters with discontinuous bigrams and repeated words - -_EPSILON = 1e-8 -SENT = "this this is is a a test test".split() - - -def close_enough(x, y): - """Verify that two sequences of n-gram association values are within - _EPSILON of each other. 
- """ - - return all(abs(x1[1] - y1[1]) <= _EPSILON for x1, y1 in zip(x, y)) - - -def test_bigram2(): - b = BigramCollocationFinder.from_words(SENT) - - assert sorted(b.ngram_fd.items()) == [ - (("a", "a"), 1), - (("a", "test"), 1), - (("is", "a"), 1), - (("is", "is"), 1), - (("test", "test"), 1), - (("this", "is"), 1), - (("this", "this"), 1), - ] - assert sorted(b.word_fd.items()) == [("a", 2), ("is", 2), ("test", 2), ("this", 2)] - - assert len(SENT) == sum(b.word_fd.values()) == sum(b.ngram_fd.values()) + 1 - assert close_enough( - sorted(b.score_ngrams(BigramAssocMeasures.pmi)), - [ - (("a", "a"), 1.0), - (("a", "test"), 1.0), - (("is", "a"), 1.0), - (("is", "is"), 1.0), - (("test", "test"), 1.0), - (("this", "is"), 1.0), - (("this", "this"), 1.0), - ], - ) - - -def test_bigram3(): - b = BigramCollocationFinder.from_words(SENT, window_size=3) - assert sorted(b.ngram_fd.items()) == sorted( - [ - (("a", "test"), 3), - (("is", "a"), 3), - (("this", "is"), 3), - (("a", "a"), 1), - (("is", "is"), 1), - (("test", "test"), 1), - (("this", "this"), 1), - ] - ) - - assert sorted(b.word_fd.items()) == sorted( - [("a", 2), ("is", 2), ("test", 2), ("this", 2)] - ) - - assert ( - len(SENT) == sum(b.word_fd.values()) == (sum(b.ngram_fd.values()) + 2 + 1) / 2.0 - ) - assert close_enough( - sorted(b.score_ngrams(BigramAssocMeasures.pmi)), - sorted( - [ - (("a", "test"), 1.584962500721156), - (("is", "a"), 1.584962500721156), - (("this", "is"), 1.584962500721156), - (("a", "a"), 0.0), - (("is", "is"), 0.0), - (("test", "test"), 0.0), - (("this", "this"), 0.0), - ] - ), - ) - - -def test_bigram5(): - b = BigramCollocationFinder.from_words(SENT, window_size=5) - assert sorted(b.ngram_fd.items()) == sorted( - [ - (("a", "test"), 4), - (("is", "a"), 4), - (("this", "is"), 4), - (("is", "test"), 3), - (("this", "a"), 3), - (("a", "a"), 1), - (("is", "is"), 1), - (("test", "test"), 1), - (("this", "this"), 1), - ] - ) - assert sorted(b.word_fd.items()) == sorted( - [("a", 2), ("is", 2), ("test", 2), ("this", 2)] - ) - n_word_fd = sum(b.word_fd.values()) - n_ngram_fd = (sum(b.ngram_fd.values()) + 4 + 3 + 2 + 1) / 4.0 - assert len(SENT) == n_word_fd == n_ngram_fd - assert close_enough( - sorted(b.score_ngrams(BigramAssocMeasures.pmi)), - sorted( - [ - (("a", "test"), 1.0), - (("is", "a"), 1.0), - (("this", "is"), 1.0), - (("is", "test"), 0.5849625007211562), - (("this", "a"), 0.5849625007211562), - (("a", "a"), -1.0), - (("is", "is"), -1.0), - (("test", "test"), -1.0), - (("this", "this"), -1.0), - ] - ), - ) diff --git a/pipeline/nltk/test/unit/test_concordance.py b/pipeline/nltk/test/unit/test_concordance.py deleted file mode 100644 index 02fc5f35a4901598617a71c266ef0448c27694c6..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_concordance.py +++ /dev/null @@ -1,98 +0,0 @@ -import contextlib -import sys -import unittest -from io import StringIO - -from nltk.corpus import gutenberg -from nltk.text import Text - - -@contextlib.contextmanager -def stdout_redirect(where): - sys.stdout = where - try: - yield where - finally: - sys.stdout = sys.__stdout__ - - -class TestConcordance(unittest.TestCase): - """Text constructed using: https://www.nltk.org/book/ch01.html""" - - @classmethod - def setUpClass(cls): - cls.corpus = gutenberg.words("melville-moby_dick.txt") - - @classmethod - def tearDownClass(cls): - pass - - def setUp(self): - self.text = Text(TestConcordance.corpus) - self.query = "monstrous" - self.maxDiff = None - self.list_out = [ - "ong the former , one was of a most 
monstrous size . ... This came towards us , ", - 'ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r', - "ll over with a heathenish array of monstrous clubs and spears . Some were thick", - "d as you gazed , and wondered what monstrous cannibal and savage could ever hav", - "that has survived the flood ; most monstrous and most mountainous ! That Himmal", - "they might scout at Moby Dick as a monstrous fable , or still worse and more de", - "th of Radney .'\" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l", - "ing Scenes . In connexion with the monstrous pictures of whales , I am strongly", - "ere to enter upon those still more monstrous stories of them which are to be fo", - "ght have been rummaged out of this monstrous cabinet there is no telling . But ", - "of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u", - ] - - def tearDown(self): - pass - - def test_concordance_list(self): - concordance_out = self.text.concordance_list(self.query) - self.assertEqual(self.list_out, [c.line for c in concordance_out]) - - def test_concordance_width(self): - list_out = [ - "monstrous", - "monstrous", - "monstrous", - "monstrous", - "monstrous", - "monstrous", - "Monstrous", - "monstrous", - "monstrous", - "monstrous", - "monstrous", - ] - - concordance_out = self.text.concordance_list(self.query, width=0) - self.assertEqual(list_out, [c.query for c in concordance_out]) - - def test_concordance_lines(self): - concordance_out = self.text.concordance_list(self.query, lines=3) - self.assertEqual(self.list_out[:3], [c.line for c in concordance_out]) - - def test_concordance_print(self): - print_out = """Displaying 11 of 11 matches: - ong the former , one was of a most monstrous size . ... This came towards us , - ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r - ll over with a heathenish array of monstrous clubs and spears . Some were thick - d as you gazed , and wondered what monstrous cannibal and savage could ever hav - that has survived the flood ; most monstrous and most mountainous ! That Himmal - they might scout at Moby Dick as a monstrous fable , or still worse and more de - th of Radney .'" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l - ing Scenes . In connexion with the monstrous pictures of whales , I am strongly - ere to enter upon those still more monstrous stories of them which are to be fo - ght have been rummaged out of this monstrous cabinet there is no telling . But - of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u - """ - - with stdout_redirect(StringIO()) as stdout: - self.text.concordance(self.query) - - def strip_space(raw_str): - return raw_str.replace(" ", "") - - self.assertEqual(strip_space(print_out), strip_space(stdout.getvalue())) diff --git a/pipeline/nltk/test/unit/test_corenlp.py b/pipeline/nltk/test/unit/test_corenlp.py deleted file mode 100644 index 8b0024b11470c1bee501c898ff508622e783287f..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_corenlp.py +++ /dev/null @@ -1,1436 +0,0 @@ -""" -Mock test for Stanford CoreNLP wrappers. 
-""" - -from unittest import TestCase -from unittest.mock import MagicMock - -import pytest - -from nltk.parse import corenlp -from nltk.tree import Tree - - -def setup_module(module): - global server - - try: - server = corenlp.CoreNLPServer(port=9000) - except LookupError: - pytest.skip("Could not instantiate CoreNLPServer.") - - try: - server.start() - except corenlp.CoreNLPServerError as e: - pytest.skip( - "Skipping CoreNLP tests because the server could not be started. " - "Make sure that the 9000 port is free. " - "{}".format(e.strerror) - ) - - -def teardown_module(module): - server.stop() - - -class TestTokenizerAPI(TestCase): - def test_tokenize(self): - corenlp_tokenizer = corenlp.CoreNLPParser() - - api_return_value = { - "sentences": [ - { - "index": 0, - "tokens": [ - { - "after": " ", - "before": "", - "characterOffsetBegin": 0, - "characterOffsetEnd": 4, - "index": 1, - "originalText": "Good", - "word": "Good", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 5, - "characterOffsetEnd": 12, - "index": 2, - "originalText": "muffins", - "word": "muffins", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 13, - "characterOffsetEnd": 17, - "index": 3, - "originalText": "cost", - "word": "cost", - }, - { - "after": "", - "before": " ", - "characterOffsetBegin": 18, - "characterOffsetEnd": 19, - "index": 4, - "originalText": "$", - "word": "$", - }, - { - "after": "\n", - "before": "", - "characterOffsetBegin": 19, - "characterOffsetEnd": 23, - "index": 5, - "originalText": "3.88", - "word": "3.88", - }, - { - "after": " ", - "before": "\n", - "characterOffsetBegin": 24, - "characterOffsetEnd": 26, - "index": 6, - "originalText": "in", - "word": "in", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 27, - "characterOffsetEnd": 30, - "index": 7, - "originalText": "New", - "word": "New", - }, - { - "after": "", - "before": " ", - "characterOffsetBegin": 31, - "characterOffsetEnd": 35, - "index": 8, - "originalText": "York", - "word": "York", - }, - { - "after": " ", - "before": "", - "characterOffsetBegin": 35, - "characterOffsetEnd": 36, - "index": 9, - "originalText": ".", - "word": ".", - }, - ], - }, - { - "index": 1, - "tokens": [ - { - "after": " ", - "before": " ", - "characterOffsetBegin": 38, - "characterOffsetEnd": 44, - "index": 1, - "originalText": "Please", - "word": "Please", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 45, - "characterOffsetEnd": 48, - "index": 2, - "originalText": "buy", - "word": "buy", - }, - { - "after": "\n", - "before": " ", - "characterOffsetBegin": 49, - "characterOffsetEnd": 51, - "index": 3, - "originalText": "me", - "word": "me", - }, - { - "after": " ", - "before": "\n", - "characterOffsetBegin": 52, - "characterOffsetEnd": 55, - "index": 4, - "originalText": "two", - "word": "two", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 56, - "characterOffsetEnd": 58, - "index": 5, - "originalText": "of", - "word": "of", - }, - { - "after": "", - "before": " ", - "characterOffsetBegin": 59, - "characterOffsetEnd": 63, - "index": 6, - "originalText": "them", - "word": "them", - }, - { - "after": "\n", - "before": "", - "characterOffsetBegin": 63, - "characterOffsetEnd": 64, - "index": 7, - "originalText": ".", - "word": ".", - }, - ], - }, - { - "index": 2, - "tokens": [ - { - "after": "", - "before": "\n", - "characterOffsetBegin": 65, - "characterOffsetEnd": 71, - "index": 1, - "originalText": "Thanks", - "word": "Thanks", - }, - { - "after": "", - 
"before": "", - "characterOffsetBegin": 71, - "characterOffsetEnd": 72, - "index": 2, - "originalText": ".", - "word": ".", - }, - ], - }, - ] - } - corenlp_tokenizer.api_call = MagicMock(return_value=api_return_value) - - input_string = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks." - - expected_output = [ - "Good", - "muffins", - "cost", - "$", - "3.88", - "in", - "New", - "York", - ".", - "Please", - "buy", - "me", - "two", - "of", - "them", - ".", - "Thanks", - ".", - ] - - tokenized_output = list(corenlp_tokenizer.tokenize(input_string)) - - corenlp_tokenizer.api_call.assert_called_once_with( - "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.", - properties={"annotators": "tokenize,ssplit"}, - ) - self.assertEqual(expected_output, tokenized_output) - - -class TestTaggerAPI(TestCase): - def test_pos_tagger(self): - corenlp_tagger = corenlp.CoreNLPParser(tagtype="pos") - - api_return_value = { - "sentences": [ - { - "basicDependencies": [ - { - "dep": "ROOT", - "dependent": 1, - "dependentGloss": "What", - "governor": 0, - "governorGloss": "ROOT", - }, - { - "dep": "cop", - "dependent": 2, - "dependentGloss": "is", - "governor": 1, - "governorGloss": "What", - }, - { - "dep": "det", - "dependent": 3, - "dependentGloss": "the", - "governor": 4, - "governorGloss": "airspeed", - }, - { - "dep": "nsubj", - "dependent": 4, - "dependentGloss": "airspeed", - "governor": 1, - "governorGloss": "What", - }, - { - "dep": "case", - "dependent": 5, - "dependentGloss": "of", - "governor": 8, - "governorGloss": "swallow", - }, - { - "dep": "det", - "dependent": 6, - "dependentGloss": "an", - "governor": 8, - "governorGloss": "swallow", - }, - { - "dep": "compound", - "dependent": 7, - "dependentGloss": "unladen", - "governor": 8, - "governorGloss": "swallow", - }, - { - "dep": "nmod", - "dependent": 8, - "dependentGloss": "swallow", - "governor": 4, - "governorGloss": "airspeed", - }, - { - "dep": "punct", - "dependent": 9, - "dependentGloss": "?", - "governor": 1, - "governorGloss": "What", - }, - ], - "enhancedDependencies": [ - { - "dep": "ROOT", - "dependent": 1, - "dependentGloss": "What", - "governor": 0, - "governorGloss": "ROOT", - }, - { - "dep": "cop", - "dependent": 2, - "dependentGloss": "is", - "governor": 1, - "governorGloss": "What", - }, - { - "dep": "det", - "dependent": 3, - "dependentGloss": "the", - "governor": 4, - "governorGloss": "airspeed", - }, - { - "dep": "nsubj", - "dependent": 4, - "dependentGloss": "airspeed", - "governor": 1, - "governorGloss": "What", - }, - { - "dep": "case", - "dependent": 5, - "dependentGloss": "of", - "governor": 8, - "governorGloss": "swallow", - }, - { - "dep": "det", - "dependent": 6, - "dependentGloss": "an", - "governor": 8, - "governorGloss": "swallow", - }, - { - "dep": "compound", - "dependent": 7, - "dependentGloss": "unladen", - "governor": 8, - "governorGloss": "swallow", - }, - { - "dep": "nmod:of", - "dependent": 8, - "dependentGloss": "swallow", - "governor": 4, - "governorGloss": "airspeed", - }, - { - "dep": "punct", - "dependent": 9, - "dependentGloss": "?", - "governor": 1, - "governorGloss": "What", - }, - ], - "enhancedPlusPlusDependencies": [ - { - "dep": "ROOT", - "dependent": 1, - "dependentGloss": "What", - "governor": 0, - "governorGloss": "ROOT", - }, - { - "dep": "cop", - "dependent": 2, - "dependentGloss": "is", - "governor": 1, - "governorGloss": "What", - }, - { - "dep": "det", - "dependent": 3, - "dependentGloss": "the", - "governor": 4, - "governorGloss": 
"airspeed", - }, - { - "dep": "nsubj", - "dependent": 4, - "dependentGloss": "airspeed", - "governor": 1, - "governorGloss": "What", - }, - { - "dep": "case", - "dependent": 5, - "dependentGloss": "of", - "governor": 8, - "governorGloss": "swallow", - }, - { - "dep": "det", - "dependent": 6, - "dependentGloss": "an", - "governor": 8, - "governorGloss": "swallow", - }, - { - "dep": "compound", - "dependent": 7, - "dependentGloss": "unladen", - "governor": 8, - "governorGloss": "swallow", - }, - { - "dep": "nmod:of", - "dependent": 8, - "dependentGloss": "swallow", - "governor": 4, - "governorGloss": "airspeed", - }, - { - "dep": "punct", - "dependent": 9, - "dependentGloss": "?", - "governor": 1, - "governorGloss": "What", - }, - ], - "index": 0, - "parse": "(ROOT\n (SBARQ\n (WHNP (WP What))\n (SQ (VBZ is)\n (NP\n (NP (DT the) (NN airspeed))\n (PP (IN of)\n (NP (DT an) (NN unladen) (NN swallow)))))\n (. ?)))", - "tokens": [ - { - "after": " ", - "before": "", - "characterOffsetBegin": 0, - "characterOffsetEnd": 4, - "index": 1, - "lemma": "what", - "originalText": "What", - "pos": "WP", - "word": "What", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 5, - "characterOffsetEnd": 7, - "index": 2, - "lemma": "be", - "originalText": "is", - "pos": "VBZ", - "word": "is", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 8, - "characterOffsetEnd": 11, - "index": 3, - "lemma": "the", - "originalText": "the", - "pos": "DT", - "word": "the", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 12, - "characterOffsetEnd": 20, - "index": 4, - "lemma": "airspeed", - "originalText": "airspeed", - "pos": "NN", - "word": "airspeed", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 21, - "characterOffsetEnd": 23, - "index": 5, - "lemma": "of", - "originalText": "of", - "pos": "IN", - "word": "of", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 24, - "characterOffsetEnd": 26, - "index": 6, - "lemma": "a", - "originalText": "an", - "pos": "DT", - "word": "an", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 27, - "characterOffsetEnd": 34, - "index": 7, - "lemma": "unladen", - "originalText": "unladen", - "pos": "JJ", - "word": "unladen", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 35, - "characterOffsetEnd": 42, - "index": 8, - "lemma": "swallow", - "originalText": "swallow", - "pos": "VB", - "word": "swallow", - }, - { - "after": "", - "before": " ", - "characterOffsetBegin": 43, - "characterOffsetEnd": 44, - "index": 9, - "lemma": "?", - "originalText": "?", - "pos": ".", - "word": "?", - }, - ], - } - ] - } - corenlp_tagger.api_call = MagicMock(return_value=api_return_value) - - input_tokens = "What is the airspeed of an unladen swallow ?".split() - expected_output = [ - ("What", "WP"), - ("is", "VBZ"), - ("the", "DT"), - ("airspeed", "NN"), - ("of", "IN"), - ("an", "DT"), - ("unladen", "JJ"), - ("swallow", "VB"), - ("?", "."), - ] - tagged_output = corenlp_tagger.tag(input_tokens) - - corenlp_tagger.api_call.assert_called_once_with( - "What is the airspeed of an unladen swallow ?", - properties={ - "ssplit.isOneSentence": "true", - "annotators": "tokenize,ssplit,pos", - }, - ) - self.assertEqual(expected_output, tagged_output) - - def test_ner_tagger(self): - corenlp_tagger = corenlp.CoreNLPParser(tagtype="ner") - - api_return_value = { - "sentences": [ - { - "index": 0, - "tokens": [ - { - "after": " ", - "before": "", - "characterOffsetBegin": 0, - 
"characterOffsetEnd": 4, - "index": 1, - "lemma": "Rami", - "ner": "PERSON", - "originalText": "Rami", - "pos": "NNP", - "word": "Rami", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 5, - "characterOffsetEnd": 8, - "index": 2, - "lemma": "Eid", - "ner": "PERSON", - "originalText": "Eid", - "pos": "NNP", - "word": "Eid", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 9, - "characterOffsetEnd": 11, - "index": 3, - "lemma": "be", - "ner": "O", - "originalText": "is", - "pos": "VBZ", - "word": "is", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 12, - "characterOffsetEnd": 20, - "index": 4, - "lemma": "study", - "ner": "O", - "originalText": "studying", - "pos": "VBG", - "word": "studying", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 21, - "characterOffsetEnd": 23, - "index": 5, - "lemma": "at", - "ner": "O", - "originalText": "at", - "pos": "IN", - "word": "at", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 24, - "characterOffsetEnd": 29, - "index": 6, - "lemma": "Stony", - "ner": "ORGANIZATION", - "originalText": "Stony", - "pos": "NNP", - "word": "Stony", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 30, - "characterOffsetEnd": 35, - "index": 7, - "lemma": "Brook", - "ner": "ORGANIZATION", - "originalText": "Brook", - "pos": "NNP", - "word": "Brook", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 36, - "characterOffsetEnd": 46, - "index": 8, - "lemma": "University", - "ner": "ORGANIZATION", - "originalText": "University", - "pos": "NNP", - "word": "University", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 47, - "characterOffsetEnd": 49, - "index": 9, - "lemma": "in", - "ner": "O", - "originalText": "in", - "pos": "IN", - "word": "in", - }, - { - "after": "", - "before": " ", - "characterOffsetBegin": 50, - "characterOffsetEnd": 52, - "index": 10, - "lemma": "NY", - "ner": "O", - "originalText": "NY", - "pos": "NNP", - "word": "NY", - }, - ], - } - ] - } - - corenlp_tagger.api_call = MagicMock(return_value=api_return_value) - - input_tokens = "Rami Eid is studying at Stony Brook University in NY".split() - expected_output = [ - ("Rami", "PERSON"), - ("Eid", "PERSON"), - ("is", "O"), - ("studying", "O"), - ("at", "O"), - ("Stony", "ORGANIZATION"), - ("Brook", "ORGANIZATION"), - ("University", "ORGANIZATION"), - ("in", "O"), - ("NY", "O"), - ] - tagged_output = corenlp_tagger.tag(input_tokens) - - corenlp_tagger.api_call.assert_called_once_with( - "Rami Eid is studying at Stony Brook University in NY", - properties={ - "ssplit.isOneSentence": "true", - "annotators": "tokenize,ssplit,ner", - }, - ) - self.assertEqual(expected_output, tagged_output) - - def test_unexpected_tagtype(self): - with self.assertRaises(ValueError): - corenlp_tagger = corenlp.CoreNLPParser(tagtype="test") - - -class TestParserAPI(TestCase): - def test_parse(self): - corenlp_parser = corenlp.CoreNLPParser() - - api_return_value = { - "sentences": [ - { - "basicDependencies": [ - { - "dep": "ROOT", - "dependent": 4, - "dependentGloss": "fox", - "governor": 0, - "governorGloss": "ROOT", - }, - { - "dep": "det", - "dependent": 1, - "dependentGloss": "The", - "governor": 4, - "governorGloss": "fox", - }, - { - "dep": "amod", - "dependent": 2, - "dependentGloss": "quick", - "governor": 4, - "governorGloss": "fox", - }, - { - "dep": "amod", - "dependent": 3, - "dependentGloss": "brown", - "governor": 4, - "governorGloss": "fox", - }, - { - "dep": "dep", 
- "dependent": 5, - "dependentGloss": "jumps", - "governor": 4, - "governorGloss": "fox", - }, - { - "dep": "case", - "dependent": 6, - "dependentGloss": "over", - "governor": 9, - "governorGloss": "dog", - }, - { - "dep": "det", - "dependent": 7, - "dependentGloss": "the", - "governor": 9, - "governorGloss": "dog", - }, - { - "dep": "amod", - "dependent": 8, - "dependentGloss": "lazy", - "governor": 9, - "governorGloss": "dog", - }, - { - "dep": "nmod", - "dependent": 9, - "dependentGloss": "dog", - "governor": 5, - "governorGloss": "jumps", - }, - ], - "enhancedDependencies": [ - { - "dep": "ROOT", - "dependent": 4, - "dependentGloss": "fox", - "governor": 0, - "governorGloss": "ROOT", - }, - { - "dep": "det", - "dependent": 1, - "dependentGloss": "The", - "governor": 4, - "governorGloss": "fox", - }, - { - "dep": "amod", - "dependent": 2, - "dependentGloss": "quick", - "governor": 4, - "governorGloss": "fox", - }, - { - "dep": "amod", - "dependent": 3, - "dependentGloss": "brown", - "governor": 4, - "governorGloss": "fox", - }, - { - "dep": "dep", - "dependent": 5, - "dependentGloss": "jumps", - "governor": 4, - "governorGloss": "fox", - }, - { - "dep": "case", - "dependent": 6, - "dependentGloss": "over", - "governor": 9, - "governorGloss": "dog", - }, - { - "dep": "det", - "dependent": 7, - "dependentGloss": "the", - "governor": 9, - "governorGloss": "dog", - }, - { - "dep": "amod", - "dependent": 8, - "dependentGloss": "lazy", - "governor": 9, - "governorGloss": "dog", - }, - { - "dep": "nmod:over", - "dependent": 9, - "dependentGloss": "dog", - "governor": 5, - "governorGloss": "jumps", - }, - ], - "enhancedPlusPlusDependencies": [ - { - "dep": "ROOT", - "dependent": 4, - "dependentGloss": "fox", - "governor": 0, - "governorGloss": "ROOT", - }, - { - "dep": "det", - "dependent": 1, - "dependentGloss": "The", - "governor": 4, - "governorGloss": "fox", - }, - { - "dep": "amod", - "dependent": 2, - "dependentGloss": "quick", - "governor": 4, - "governorGloss": "fox", - }, - { - "dep": "amod", - "dependent": 3, - "dependentGloss": "brown", - "governor": 4, - "governorGloss": "fox", - }, - { - "dep": "dep", - "dependent": 5, - "dependentGloss": "jumps", - "governor": 4, - "governorGloss": "fox", - }, - { - "dep": "case", - "dependent": 6, - "dependentGloss": "over", - "governor": 9, - "governorGloss": "dog", - }, - { - "dep": "det", - "dependent": 7, - "dependentGloss": "the", - "governor": 9, - "governorGloss": "dog", - }, - { - "dep": "amod", - "dependent": 8, - "dependentGloss": "lazy", - "governor": 9, - "governorGloss": "dog", - }, - { - "dep": "nmod:over", - "dependent": 9, - "dependentGloss": "dog", - "governor": 5, - "governorGloss": "jumps", - }, - ], - "index": 0, - "parse": "(ROOT\n (NP\n (NP (DT The) (JJ quick) (JJ brown) (NN fox))\n (NP\n (NP (NNS jumps))\n (PP (IN over)\n (NP (DT the) (JJ lazy) (NN dog))))))", - "tokens": [ - { - "after": " ", - "before": "", - "characterOffsetBegin": 0, - "characterOffsetEnd": 3, - "index": 1, - "lemma": "the", - "originalText": "The", - "pos": "DT", - "word": "The", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 4, - "characterOffsetEnd": 9, - "index": 2, - "lemma": "quick", - "originalText": "quick", - "pos": "JJ", - "word": "quick", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 10, - "characterOffsetEnd": 15, - "index": 3, - "lemma": "brown", - "originalText": "brown", - "pos": "JJ", - "word": "brown", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 16, - 
"characterOffsetEnd": 19, - "index": 4, - "lemma": "fox", - "originalText": "fox", - "pos": "NN", - "word": "fox", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 20, - "characterOffsetEnd": 25, - "index": 5, - "lemma": "jump", - "originalText": "jumps", - "pos": "VBZ", - "word": "jumps", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 26, - "characterOffsetEnd": 30, - "index": 6, - "lemma": "over", - "originalText": "over", - "pos": "IN", - "word": "over", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 31, - "characterOffsetEnd": 34, - "index": 7, - "lemma": "the", - "originalText": "the", - "pos": "DT", - "word": "the", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 35, - "characterOffsetEnd": 39, - "index": 8, - "lemma": "lazy", - "originalText": "lazy", - "pos": "JJ", - "word": "lazy", - }, - { - "after": "", - "before": " ", - "characterOffsetBegin": 40, - "characterOffsetEnd": 43, - "index": 9, - "lemma": "dog", - "originalText": "dog", - "pos": "NN", - "word": "dog", - }, - ], - } - ] - } - - corenlp_parser.api_call = MagicMock(return_value=api_return_value) - - input_string = "The quick brown fox jumps over the lazy dog".split() - expected_output = Tree( - "ROOT", - [ - Tree( - "NP", - [ - Tree( - "NP", - [ - Tree("DT", ["The"]), - Tree("JJ", ["quick"]), - Tree("JJ", ["brown"]), - Tree("NN", ["fox"]), - ], - ), - Tree( - "NP", - [ - Tree("NP", [Tree("NNS", ["jumps"])]), - Tree( - "PP", - [ - Tree("IN", ["over"]), - Tree( - "NP", - [ - Tree("DT", ["the"]), - Tree("JJ", ["lazy"]), - Tree("NN", ["dog"]), - ], - ), - ], - ), - ], - ), - ], - ) - ], - ) - - parsed_data = next(corenlp_parser.parse(input_string)) - - corenlp_parser.api_call.assert_called_once_with( - "The quick brown fox jumps over the lazy dog", - properties={"ssplit.eolonly": "true"}, - ) - self.assertEqual(expected_output, parsed_data) - - def test_dependency_parser(self): - corenlp_parser = corenlp.CoreNLPDependencyParser() - - api_return_value = { - "sentences": [ - { - "basicDependencies": [ - { - "dep": "ROOT", - "dependent": 5, - "dependentGloss": "jumps", - "governor": 0, - "governorGloss": "ROOT", - }, - { - "dep": "det", - "dependent": 1, - "dependentGloss": "The", - "governor": 4, - "governorGloss": "fox", - }, - { - "dep": "amod", - "dependent": 2, - "dependentGloss": "quick", - "governor": 4, - "governorGloss": "fox", - }, - { - "dep": "amod", - "dependent": 3, - "dependentGloss": "brown", - "governor": 4, - "governorGloss": "fox", - }, - { - "dep": "nsubj", - "dependent": 4, - "dependentGloss": "fox", - "governor": 5, - "governorGloss": "jumps", - }, - { - "dep": "case", - "dependent": 6, - "dependentGloss": "over", - "governor": 9, - "governorGloss": "dog", - }, - { - "dep": "det", - "dependent": 7, - "dependentGloss": "the", - "governor": 9, - "governorGloss": "dog", - }, - { - "dep": "amod", - "dependent": 8, - "dependentGloss": "lazy", - "governor": 9, - "governorGloss": "dog", - }, - { - "dep": "nmod", - "dependent": 9, - "dependentGloss": "dog", - "governor": 5, - "governorGloss": "jumps", - }, - ], - "enhancedDependencies": [ - { - "dep": "ROOT", - "dependent": 5, - "dependentGloss": "jumps", - "governor": 0, - "governorGloss": "ROOT", - }, - { - "dep": "det", - "dependent": 1, - "dependentGloss": "The", - "governor": 4, - "governorGloss": "fox", - }, - { - "dep": "amod", - "dependent": 2, - "dependentGloss": "quick", - "governor": 4, - "governorGloss": "fox", - }, - { - "dep": "amod", - "dependent": 3, - 
"dependentGloss": "brown", - "governor": 4, - "governorGloss": "fox", - }, - { - "dep": "nsubj", - "dependent": 4, - "dependentGloss": "fox", - "governor": 5, - "governorGloss": "jumps", - }, - { - "dep": "case", - "dependent": 6, - "dependentGloss": "over", - "governor": 9, - "governorGloss": "dog", - }, - { - "dep": "det", - "dependent": 7, - "dependentGloss": "the", - "governor": 9, - "governorGloss": "dog", - }, - { - "dep": "amod", - "dependent": 8, - "dependentGloss": "lazy", - "governor": 9, - "governorGloss": "dog", - }, - { - "dep": "nmod:over", - "dependent": 9, - "dependentGloss": "dog", - "governor": 5, - "governorGloss": "jumps", - }, - ], - "enhancedPlusPlusDependencies": [ - { - "dep": "ROOT", - "dependent": 5, - "dependentGloss": "jumps", - "governor": 0, - "governorGloss": "ROOT", - }, - { - "dep": "det", - "dependent": 1, - "dependentGloss": "The", - "governor": 4, - "governorGloss": "fox", - }, - { - "dep": "amod", - "dependent": 2, - "dependentGloss": "quick", - "governor": 4, - "governorGloss": "fox", - }, - { - "dep": "amod", - "dependent": 3, - "dependentGloss": "brown", - "governor": 4, - "governorGloss": "fox", - }, - { - "dep": "nsubj", - "dependent": 4, - "dependentGloss": "fox", - "governor": 5, - "governorGloss": "jumps", - }, - { - "dep": "case", - "dependent": 6, - "dependentGloss": "over", - "governor": 9, - "governorGloss": "dog", - }, - { - "dep": "det", - "dependent": 7, - "dependentGloss": "the", - "governor": 9, - "governorGloss": "dog", - }, - { - "dep": "amod", - "dependent": 8, - "dependentGloss": "lazy", - "governor": 9, - "governorGloss": "dog", - }, - { - "dep": "nmod:over", - "dependent": 9, - "dependentGloss": "dog", - "governor": 5, - "governorGloss": "jumps", - }, - ], - "index": 0, - "tokens": [ - { - "after": " ", - "before": "", - "characterOffsetBegin": 0, - "characterOffsetEnd": 3, - "index": 1, - "lemma": "the", - "originalText": "The", - "pos": "DT", - "word": "The", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 4, - "characterOffsetEnd": 9, - "index": 2, - "lemma": "quick", - "originalText": "quick", - "pos": "JJ", - "word": "quick", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 10, - "characterOffsetEnd": 15, - "index": 3, - "lemma": "brown", - "originalText": "brown", - "pos": "JJ", - "word": "brown", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 16, - "characterOffsetEnd": 19, - "index": 4, - "lemma": "fox", - "originalText": "fox", - "pos": "NN", - "word": "fox", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 20, - "characterOffsetEnd": 25, - "index": 5, - "lemma": "jump", - "originalText": "jumps", - "pos": "VBZ", - "word": "jumps", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 26, - "characterOffsetEnd": 30, - "index": 6, - "lemma": "over", - "originalText": "over", - "pos": "IN", - "word": "over", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 31, - "characterOffsetEnd": 34, - "index": 7, - "lemma": "the", - "originalText": "the", - "pos": "DT", - "word": "the", - }, - { - "after": " ", - "before": " ", - "characterOffsetBegin": 35, - "characterOffsetEnd": 39, - "index": 8, - "lemma": "lazy", - "originalText": "lazy", - "pos": "JJ", - "word": "lazy", - }, - { - "after": "", - "before": " ", - "characterOffsetBegin": 40, - "characterOffsetEnd": 43, - "index": 9, - "lemma": "dog", - "originalText": "dog", - "pos": "NN", - "word": "dog", - }, - ], - } - ] - } - - corenlp_parser.api_call = 
MagicMock(return_value=api_return_value) - - input_string = "The quick brown fox jumps over the lazy dog".split() - expected_output = Tree( - "jumps", - [ - Tree("fox", ["The", "quick", "brown"]), - Tree("dog", ["over", "the", "lazy"]), - ], - ) - - parsed_data = next(corenlp_parser.parse(input_string)) - - corenlp_parser.api_call.assert_called_once_with( - "The quick brown fox jumps over the lazy dog", - properties={"ssplit.eolonly": "true"}, - ) - self.assertEqual(expected_output, parsed_data.tree()) diff --git a/pipeline/nltk/test/unit/test_corpora.py b/pipeline/nltk/test/unit/test_corpora.py deleted file mode 100644 index 888dd20b5af2f798d966d0a138d4c453675ae0f6..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_corpora.py +++ /dev/null @@ -1,274 +0,0 @@ -import unittest - -import pytest - -from nltk.corpus import ( # mwa_ppdb - cess_cat, - cess_esp, - conll2007, - floresta, - indian, - ptb, - sinica_treebank, - udhr, -) -from nltk.tree import Tree - - -class TestUdhr(unittest.TestCase): - def test_words(self): - for name in udhr.fileids(): - words = list(udhr.words(name)) - self.assertTrue(words) - - def test_raw_unicode(self): - for name in udhr.fileids(): - txt = udhr.raw(name) - assert not isinstance(txt, bytes), name - - def test_polish_encoding(self): - text_pl = udhr.raw("Polish-Latin2")[:164] - text_ppl = udhr.raw("Polish_Polski-Latin2")[:164] - expected = """POWSZECHNA DEKLARACJA PRAW CZŁOWIEKA -[Preamble] -Trzecia Sesja Ogólnego Zgromadzenia ONZ, obradująca w Paryżu, \ -uchwaliła 10 grudnia 1948 roku jednomyślnie Powszechną""" - assert text_pl == expected, "Polish-Latin2" - assert text_ppl == expected, "Polish_Polski-Latin2" - - -class TestIndian(unittest.TestCase): - def test_words(self): - words = indian.words()[:3] - self.assertEqual(words, ["মহিষের", "সন্তান", ":"]) - - def test_tagged_words(self): - tagged_words = indian.tagged_words()[:3] - self.assertEqual( - tagged_words, [("মহিষের", "NN"), ("সন্তান", "NN"), (":", "SYM")] - ) - - -class TestCess(unittest.TestCase): - def test_catalan(self): - words = cess_cat.words()[:15] - txt = "El Tribunal_Suprem -Fpa- TS -Fpt- ha confirmat la condemna a quatre anys d' inhabilitació especial" - self.assertEqual(words, txt.split()) - self.assertEqual(cess_cat.tagged_sents()[0][34][0], "càrrecs") - - def test_esp(self): - words = cess_esp.words()[:15] - txt = "El grupo estatal Electricité_de_France -Fpa- EDF -Fpt- anunció hoy , jueves , la compra del" - self.assertEqual(words, txt.split()) - self.assertEqual(cess_esp.words()[115], "años") - - -class TestFloresta(unittest.TestCase): - def test_words(self): - words = floresta.words()[:10] - txt = "Um revivalismo refrescante O 7_e_Meio é um ex-libris de a" - self.assertEqual(words, txt.split()) - - -class TestSinicaTreebank(unittest.TestCase): - def test_sents(self): - first_3_sents = sinica_treebank.sents()[:3] - self.assertEqual( - first_3_sents, [["一"], ["友情"], ["嘉珍", "和", "我", "住在", "同一條", "巷子"]] - ) - - def test_parsed_sents(self): - parsed_sents = sinica_treebank.parsed_sents()[25] - self.assertEqual( - parsed_sents, - Tree( - "S", - [ - Tree("NP", [Tree("Nba", ["嘉珍"])]), - Tree("V‧地", [Tree("VA11", ["不停"]), Tree("DE", ["的"])]), - Tree("VA4", ["哭泣"]), - ], - ), - ) - - -class TestCoNLL2007(unittest.TestCase): - # Reading the CoNLL 2007 Dependency Treebanks - - def test_sents(self): - sents = conll2007.sents("esp.train")[0] - self.assertEqual( - sents[:6], ["El", "aumento", "del", "índice", "de", "desempleo"] - ) - - def test_parsed_sents(self): - - 
parsed_sents = conll2007.parsed_sents("esp.train")[0] - - self.assertEqual( - parsed_sents.tree(), - Tree( - "fortaleció", - [ - Tree( - "aumento", - [ - "El", - Tree( - "del", - [ - Tree( - "índice", - [ - Tree( - "de", - [Tree("desempleo", ["estadounidense"])], - ) - ], - ) - ], - ), - ], - ), - "hoy", - "considerablemente", - Tree( - "al", - [ - Tree( - "euro", - [ - Tree( - "cotizaba", - [ - ",", - "que", - Tree("a", [Tree("15.35", ["las", "GMT"])]), - "se", - Tree( - "en", - [ - Tree( - "mercado", - [ - "el", - Tree("de", ["divisas"]), - Tree("de", ["Fráncfort"]), - ], - ) - ], - ), - Tree("a", ["0,9452_dólares"]), - Tree( - "frente_a", - [ - ",", - Tree( - "0,9349_dólares", - [ - "los", - Tree( - "de", - [ - Tree( - "mañana", - ["esta"], - ) - ], - ), - ], - ), - ], - ), - ], - ) - ], - ) - ], - ), - ".", - ], - ), - ) - - -@pytest.mark.skipif( - not ptb.fileids(), - reason="A full installation of the Penn Treebank is not available", -) -class TestPTB(unittest.TestCase): - def test_fileids(self): - self.assertEqual( - ptb.fileids()[:4], - [ - "BROWN/CF/CF01.MRG", - "BROWN/CF/CF02.MRG", - "BROWN/CF/CF03.MRG", - "BROWN/CF/CF04.MRG", - ], - ) - - def test_words(self): - self.assertEqual( - ptb.words("WSJ/00/WSJ_0003.MRG")[:7], - ["A", "form", "of", "asbestos", "once", "used", "*"], - ) - - def test_tagged_words(self): - self.assertEqual( - ptb.tagged_words("WSJ/00/WSJ_0003.MRG")[:3], - [("A", "DT"), ("form", "NN"), ("of", "IN")], - ) - - def test_categories(self): - self.assertEqual( - ptb.categories(), - [ - "adventure", - "belles_lettres", - "fiction", - "humor", - "lore", - "mystery", - "news", - "romance", - "science_fiction", - ], - ) - - def test_news_fileids(self): - self.assertEqual( - ptb.fileids("news")[:3], - ["WSJ/00/WSJ_0001.MRG", "WSJ/00/WSJ_0002.MRG", "WSJ/00/WSJ_0003.MRG"], - ) - - def test_category_words(self): - self.assertEqual( - ptb.words(categories=["humor", "fiction"])[:6], - ["Thirty-three", "Scotty", "did", "not", "go", "back"], - ) - - -@pytest.mark.skip("Skipping test for mwa_ppdb.") -class TestMWAPPDB(unittest.TestCase): - def test_fileids(self): - self.assertEqual( - mwa_ppdb.fileids(), ["ppdb-1.0-xxxl-lexical.extended.synonyms.uniquepairs"] - ) - - def test_entries(self): - self.assertEqual( - mwa_ppdb.entries()[:10], - [ - ("10/17/01", "17/10/2001"), - ("102,70", "102.70"), - ("13,53", "13.53"), - ("3.2.5.3.2.1", "3.2.5.3.2.1."), - ("53,76", "53.76"), - ("6.9.5", "6.9.5."), - ("7.7.6.3", "7.7.6.3."), - ("76,20", "76.20"), - ("79,85", "79.85"), - ("93,65", "93.65"), - ], - ) diff --git a/pipeline/nltk/test/unit/test_corpus_views.py b/pipeline/nltk/test/unit/test_corpus_views.py deleted file mode 100644 index 890825fa8d65b761f661417656f3ae37075cdc6f..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_corpus_views.py +++ /dev/null @@ -1,48 +0,0 @@ -""" -Corpus View Regression Tests -""" -import unittest - -import nltk.data -from nltk.corpus.reader.util import ( - StreamBackedCorpusView, - read_line_block, - read_whitespace_block, -) - - -class TestCorpusViews(unittest.TestCase): - - linetok = nltk.LineTokenizer(blanklines="keep") - names = [ - "corpora/inaugural/README", # A very short file (160 chars) - "corpora/inaugural/1793-Washington.txt", # A relatively short file (791 chars) - "corpora/inaugural/1909-Taft.txt", # A longer file (32k chars) - ] - - def data(self): - for name in self.names: - f = nltk.data.find(name) - with f.open() as fp: - file_data = fp.read().decode("utf8") - yield f, file_data - - def 
test_correct_values(self): - # Check that corpus views produce the correct sequence of values. - - for f, file_data in self.data(): - v = StreamBackedCorpusView(f, read_whitespace_block) - self.assertEqual(list(v), file_data.split()) - - v = StreamBackedCorpusView(f, read_line_block) - self.assertEqual(list(v), self.linetok.tokenize(file_data)) - - def test_correct_length(self): - # Check that the corpus views report the correct lengths: - - for f, file_data in self.data(): - v = StreamBackedCorpusView(f, read_whitespace_block) - self.assertEqual(len(v), len(file_data.split())) - - v = StreamBackedCorpusView(f, read_line_block) - self.assertEqual(len(v), len(self.linetok.tokenize(file_data))) diff --git a/pipeline/nltk/test/unit/test_data.py b/pipeline/nltk/test/unit/test_data.py deleted file mode 100644 index b05eea84bfaaca4e439f319057d56821764f75c7..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_data.py +++ /dev/null @@ -1,15 +0,0 @@ -import pytest - -import nltk.data - - -def test_find_raises_exception(): - with pytest.raises(LookupError): - nltk.data.find("no_such_resource/foo") - - -def test_find_raises_exception_with_full_resource_name(): - no_such_thing = "no_such_thing/bar" - with pytest.raises(LookupError) as exc: - nltk.data.find(no_such_thing) - assert no_such_thing in str(exc) diff --git a/pipeline/nltk/test/unit/test_disagreement.py b/pipeline/nltk/test/unit/test_disagreement.py deleted file mode 100644 index 1f29add9058e25b2a2736ce513734655c85d4abf..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_disagreement.py +++ /dev/null @@ -1,144 +0,0 @@ -import unittest - -from nltk.metrics.agreement import AnnotationTask - - -class TestDisagreement(unittest.TestCase): - - """ - Class containing unit tests for nltk.metrics.agreement.Disagreement. - """ - - def test_easy(self): - """ - Simple test, based on - https://github.com/foolswood/krippendorffs_alpha/raw/master/krippendorff.pdf. - """ - data = [ - ("coder1", "dress1", "YES"), - ("coder2", "dress1", "NO"), - ("coder3", "dress1", "NO"), - ("coder1", "dress2", "YES"), - ("coder2", "dress2", "NO"), - ("coder3", "dress3", "NO"), - ] - annotation_task = AnnotationTask(data) - self.assertAlmostEqual(annotation_task.alpha(), -0.3333333) - - def test_easy2(self): - """ - Same simple test with 1 rating removed. - Removal of that rating should not matter: K-Apha ignores items with - only 1 rating. 
- """ - data = [ - ("coder1", "dress1", "YES"), - ("coder2", "dress1", "NO"), - ("coder3", "dress1", "NO"), - ("coder1", "dress2", "YES"), - ("coder2", "dress2", "NO"), - ] - annotation_task = AnnotationTask(data) - self.assertAlmostEqual(annotation_task.alpha(), -0.3333333) - - def test_advanced(self): - """ - More advanced test, based on - http://www.agreestat.com/research_papers/onkrippendorffalpha.pdf - """ - data = [ - ("A", "1", "1"), - ("B", "1", "1"), - ("D", "1", "1"), - ("A", "2", "2"), - ("B", "2", "2"), - ("C", "2", "3"), - ("D", "2", "2"), - ("A", "3", "3"), - ("B", "3", "3"), - ("C", "3", "3"), - ("D", "3", "3"), - ("A", "4", "3"), - ("B", "4", "3"), - ("C", "4", "3"), - ("D", "4", "3"), - ("A", "5", "2"), - ("B", "5", "2"), - ("C", "5", "2"), - ("D", "5", "2"), - ("A", "6", "1"), - ("B", "6", "2"), - ("C", "6", "3"), - ("D", "6", "4"), - ("A", "7", "4"), - ("B", "7", "4"), - ("C", "7", "4"), - ("D", "7", "4"), - ("A", "8", "1"), - ("B", "8", "1"), - ("C", "8", "2"), - ("D", "8", "1"), - ("A", "9", "2"), - ("B", "9", "2"), - ("C", "9", "2"), - ("D", "9", "2"), - ("B", "10", "5"), - ("C", "10", "5"), - ("D", "10", "5"), - ("C", "11", "1"), - ("D", "11", "1"), - ("C", "12", "3"), - ] - annotation_task = AnnotationTask(data) - self.assertAlmostEqual(annotation_task.alpha(), 0.743421052632) - - def test_advanced2(self): - """ - Same more advanced example, but with 1 rating removed. - Again, removal of that 1 rating should not matter. - """ - data = [ - ("A", "1", "1"), - ("B", "1", "1"), - ("D", "1", "1"), - ("A", "2", "2"), - ("B", "2", "2"), - ("C", "2", "3"), - ("D", "2", "2"), - ("A", "3", "3"), - ("B", "3", "3"), - ("C", "3", "3"), - ("D", "3", "3"), - ("A", "4", "3"), - ("B", "4", "3"), - ("C", "4", "3"), - ("D", "4", "3"), - ("A", "5", "2"), - ("B", "5", "2"), - ("C", "5", "2"), - ("D", "5", "2"), - ("A", "6", "1"), - ("B", "6", "2"), - ("C", "6", "3"), - ("D", "6", "4"), - ("A", "7", "4"), - ("B", "7", "4"), - ("C", "7", "4"), - ("D", "7", "4"), - ("A", "8", "1"), - ("B", "8", "1"), - ("C", "8", "2"), - ("D", "8", "1"), - ("A", "9", "2"), - ("B", "9", "2"), - ("C", "9", "2"), - ("D", "9", "2"), - ("B", "10", "5"), - ("C", "10", "5"), - ("D", "10", "5"), - ("C", "11", "1"), - ("D", "11", "1"), - ("C", "12", "3"), - ] - annotation_task = AnnotationTask(data) - self.assertAlmostEqual(annotation_task.alpha(), 0.743421052632) diff --git a/pipeline/nltk/test/unit/test_distance.py b/pipeline/nltk/test/unit/test_distance.py deleted file mode 100644 index 71e0bf06a6a5d3e75ceefa06670542303803d3eb..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_distance.py +++ /dev/null @@ -1,129 +0,0 @@ -from typing import Tuple - -import pytest - -from nltk.metrics.distance import edit_distance - - -class TestEditDistance: - @pytest.mark.parametrize( - "left,right,substitution_cost,expecteds", - [ - # Allowing transpositions reduces the number of edits required. - # with transpositions: - # e.g. "abc" -T-> "cba" -D-> "ca": 2 steps - # - # without transpositions: - # e.g. "abc" -D-> "ab" -D-> "a" -I-> "ca": 3 steps - ("abc", "ca", 1, (2, 3)), - ("abc", "ca", 5, (2, 3)), # Doesn't *require* substitutions - # Note, a substition_cost of higher than 2 doesn't make much - # sense, as a deletion + insertion is identical, and always - # costs 2. - # - # - # Transpositions don't always reduce the number of edits required: - # with or without transpositions: - # e.g. 
"wants" -D-> "wats" -D-> "was" -I-> "wasp": 3 steps - ("wants", "wasp", 1, (3, 3)), - ("wants", "wasp", 5, (3, 3)), # Doesn't *require* substitutions - # - # - # Ought to have the same results with and without transpositions - # with or without transpositions: - # e.g. "rain" -S-> "sain" -S-> "shin" -I-> "shine": 3 steps - # (but cost 5 if substitution_cost=2) - ("rain", "shine", 1, (3, 3)), - ("rain", "shine", 2, (5, 5)), # Does *require* substitutions - # - # - # Several potentially interesting typos - # with transpositions: - # e.g. "acbdef" -T-> "abcdef": 1 step - # - # without transpositions: - # e.g. "acbdef" -D-> "abdef" -I-> "abcdef": 2 steps - ("acbdef", "abcdef", 1, (1, 2)), - ("acbdef", "abcdef", 2, (1, 2)), # Doesn't *require* substitutions - # - # - # with transpositions: - # e.g. "lnaguaeg" -T-> "languaeg" -T-> "language": 2 steps - # - # without transpositions: - # e.g. "lnaguaeg" -D-> "laguaeg" -I-> "languaeg" -D-> "languag" -I-> "language": 4 steps - ("lnaguaeg", "language", 1, (2, 4)), - ("lnaguaeg", "language", 2, (2, 4)), # Doesn't *require* substitutions - # - # - # with transpositions: - # e.g. "lnaugage" -T-> "lanugage" -T-> "language": 2 steps - # - # without transpositions: - # e.g. "lnaugage" -S-> "lnangage" -D-> "langage" -I-> "language": 3 steps - # (but one substitution, so a cost of 4 if substition_cost = 2) - ("lnaugage", "language", 1, (2, 3)), - ("lnaugage", "language", 2, (2, 4)), - # Does *require* substitutions if no transpositions - # - # - # with transpositions: - # e.g. "lngauage" -T-> "lnaguage" -T-> "language": 2 steps - # without transpositions: - # e.g. "lngauage" -I-> "lanaguage" -D-> "language": 2 steps - ("lngauage", "language", 1, (2, 2)), - ("lngauage", "language", 2, (2, 2)), # Doesn't *require* substitutions - # - # - # with or without transpositions: - # e.g. "wants" -S-> "sants" -S-> "swnts" -S-> "swits" -S-> "swims" -D-> "swim": 5 steps - # - # with substitution_cost=2 and transpositions: - # e.g. "wants" -T-> "santw" -D-> "sntw" -D-> "stw" -D-> "sw" - # -I-> "swi" -I-> "swim": 6 steps - # - # with substitution_cost=2 and no transpositions: - # e.g. "wants" -I-> "swants" -D-> "swant" -D-> "swan" -D-> "swa" -D-> "sw" - # -I-> "swi" -I-> "swim": 7 steps - ("wants", "swim", 1, (5, 5)), - ("wants", "swim", 2, (6, 7)), - # - # - # with or without transpositions: - # e.g. "kitten" -S-> "sitten" -s-> "sittin" -I-> "sitting": 3 steps - # (but cost 5 if substitution_cost=2) - ("kitten", "sitting", 1, (3, 3)), - ("kitten", "sitting", 2, (5, 5)), - # - # duplicated letter - # e.g. "duplicated" -D-> "duplicated" - ("duplicated", "duuplicated", 1, (1, 1)), - ("duplicated", "duuplicated", 2, (1, 1)), - ("very duplicated", "very duuplicateed", 2, (2, 2)), - ], - ) - def test_with_transpositions( - self, left: str, right: str, substitution_cost: int, expecteds: Tuple[int, int] - ): - """ - Test `edit_distance` between two strings, given some `substitution_cost`, - and whether transpositions are allowed. - - :param str left: First input string to `edit_distance`. - :param str right: Second input string to `edit_distance`. - :param int substitution_cost: The cost of a substitution action in `edit_distance`. - :param Tuple[int, int] expecteds: A tuple of expected outputs, such that `expecteds[0]` is - the expected output with `transpositions=True`, and `expecteds[1]` is - the expected output with `transpositions=False`. 
- """ - # Test the input strings in both orderings - for s1, s2 in ((left, right), (right, left)): - # zip with [True, False] to get the transpositions value - for expected, transpositions in zip(expecteds, [True, False]): - predicted = edit_distance( - s1, - s2, - substitution_cost=substitution_cost, - transpositions=transpositions, - ) - assert predicted == expected diff --git a/pipeline/nltk/test/unit/test_downloader.py b/pipeline/nltk/test/unit/test_downloader.py deleted file mode 100644 index 408372259142592003a3689cb35a0430e0bec190..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_downloader.py +++ /dev/null @@ -1,19 +0,0 @@ -from nltk import download - - -def test_downloader_using_existing_parent_download_dir(tmp_path): - """Test that download works properly when the parent folder of the download_dir exists""" - - download_dir = str(tmp_path.joinpath("another_dir")) - download_status = download("mwa_ppdb", download_dir) - assert download_status is True - - -def test_downloader_using_non_existing_parent_download_dir(tmp_path): - """Test that download works properly when the parent folder of the download_dir does not exist""" - - download_dir = str( - tmp_path.joinpath("non-existing-parent-folder", "another-non-existing-folder") - ) - download_status = download("mwa_ppdb", download_dir) - assert download_status is True diff --git a/pipeline/nltk/test/unit/test_freqdist.py b/pipeline/nltk/test/unit/test_freqdist.py deleted file mode 100644 index ace95421dcca1ded7e403f7f9b5db7d0276e983f..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_freqdist.py +++ /dev/null @@ -1,7 +0,0 @@ -import nltk - - -def test_iterating_returns_an_iterator_ordered_by_frequency(): - samples = ["one", "two", "two"] - distribution = nltk.FreqDist(samples) - assert list(distribution) == ["two", "one"] diff --git a/pipeline/nltk/test/unit/test_hmm.py b/pipeline/nltk/test/unit/test_hmm.py deleted file mode 100644 index 2ce5213230ae4c58ba58ff94872b7240f5884d79..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_hmm.py +++ /dev/null @@ -1,82 +0,0 @@ -import pytest - -from nltk.tag import hmm - - -def _wikipedia_example_hmm(): - # Example from wikipedia - # (https://en.wikipedia.org/wiki/Forward%E2%80%93backward_algorithm) - - states = ["rain", "no rain"] - symbols = ["umbrella", "no umbrella"] - - A = [[0.7, 0.3], [0.3, 0.7]] # transition probabilities - B = [[0.9, 0.1], [0.2, 0.8]] # emission probabilities - pi = [0.5, 0.5] # initial probabilities - - seq = ["umbrella", "umbrella", "no umbrella", "umbrella", "umbrella"] - seq = list(zip(seq, [None] * len(seq))) - - model = hmm._create_hmm_tagger(states, symbols, A, B, pi) - return model, states, symbols, seq - - -def test_forward_probability(): - from numpy.testing import assert_array_almost_equal - - # example from p. 
385, Huang et al - model, states, symbols = hmm._market_hmm_example() - seq = [("up", None), ("up", None)] - expected = [[0.35, 0.02, 0.09], [0.1792, 0.0085, 0.0357]] - - fp = 2 ** model._forward_probability(seq) - - assert_array_almost_equal(fp, expected) - - -def test_forward_probability2(): - from numpy.testing import assert_array_almost_equal - - model, states, symbols, seq = _wikipedia_example_hmm() - fp = 2 ** model._forward_probability(seq) - - # examples in wikipedia are normalized - fp = (fp.T / fp.sum(axis=1)).T - - wikipedia_results = [ - [0.8182, 0.1818], - [0.8834, 0.1166], - [0.1907, 0.8093], - [0.7308, 0.2692], - [0.8673, 0.1327], - ] - - assert_array_almost_equal(wikipedia_results, fp, 4) - - -def test_backward_probability(): - from numpy.testing import assert_array_almost_equal - - model, states, symbols, seq = _wikipedia_example_hmm() - - bp = 2 ** model._backward_probability(seq) - # examples in wikipedia are normalized - - bp = (bp.T / bp.sum(axis=1)).T - - wikipedia_results = [ - # Forward-backward algorithm doesn't need b0_5, - # so .backward_probability doesn't compute it. - # [0.6469, 0.3531], - [0.5923, 0.4077], - [0.3763, 0.6237], - [0.6533, 0.3467], - [0.6273, 0.3727], - [0.5, 0.5], - ] - - assert_array_almost_equal(wikipedia_results, bp, 4) - - -def setup_module(module): - pytest.importorskip("numpy") diff --git a/pipeline/nltk/test/unit/test_json2csv_corpus.py b/pipeline/nltk/test/unit/test_json2csv_corpus.py deleted file mode 100644 index f54ee94053b636225c0b7008625f9dfd3643508b..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_json2csv_corpus.py +++ /dev/null @@ -1,210 +0,0 @@ -# Natural Language Toolkit: Twitter client -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Lorenzo Rubio -# URL: -# For license information, see LICENSE.TXT - -""" -Regression tests for `json2csv()` and `json2csv_entities()` in Twitter -package. 
-""" -from pathlib import Path - -import pytest - -from nltk.corpus import twitter_samples -from nltk.twitter.common import json2csv, json2csv_entities - - -def files_are_identical(pathA, pathB): - """ - Compare two files, ignoring carriage returns, - leading whitespace, and trailing whitespace - """ - f1 = [l.strip() for l in pathA.read_bytes().splitlines()] - f2 = [l.strip() for l in pathB.read_bytes().splitlines()] - return f1 == f2 - - -subdir = Path(__file__).parent / "files" - - -@pytest.fixture -def infile(): - with open(twitter_samples.abspath("tweets.20150430-223406.json")) as infile: - return [next(infile) for x in range(100)] - - -def test_textoutput(tmp_path, infile): - ref_fn = subdir / "tweets.20150430-223406.text.csv.ref" - outfn = tmp_path / "tweets.20150430-223406.text.csv" - json2csv(infile, outfn, ["text"], gzip_compress=False) - assert files_are_identical(outfn, ref_fn) - - -def test_tweet_metadata(tmp_path, infile): - ref_fn = subdir / "tweets.20150430-223406.tweet.csv.ref" - fields = [ - "created_at", - "favorite_count", - "id", - "in_reply_to_status_id", - "in_reply_to_user_id", - "retweet_count", - "retweeted", - "text", - "truncated", - "user.id", - ] - - outfn = tmp_path / "tweets.20150430-223406.tweet.csv" - json2csv(infile, outfn, fields, gzip_compress=False) - assert files_are_identical(outfn, ref_fn) - - -def test_user_metadata(tmp_path, infile): - ref_fn = subdir / "tweets.20150430-223406.user.csv.ref" - fields = ["id", "text", "user.id", "user.followers_count", "user.friends_count"] - - outfn = tmp_path / "tweets.20150430-223406.user.csv" - json2csv(infile, outfn, fields, gzip_compress=False) - assert files_are_identical(outfn, ref_fn) - - -def test_tweet_hashtag(tmp_path, infile): - ref_fn = subdir / "tweets.20150430-223406.hashtag.csv.ref" - outfn = tmp_path / "tweets.20150430-223406.hashtag.csv" - json2csv_entities( - infile, - outfn, - ["id", "text"], - "hashtags", - ["text"], - gzip_compress=False, - ) - assert files_are_identical(outfn, ref_fn) - - -def test_tweet_usermention(tmp_path, infile): - ref_fn = subdir / "tweets.20150430-223406.usermention.csv.ref" - outfn = tmp_path / "tweets.20150430-223406.usermention.csv" - json2csv_entities( - infile, - outfn, - ["id", "text"], - "user_mentions", - ["id", "screen_name"], - gzip_compress=False, - ) - assert files_are_identical(outfn, ref_fn) - - -def test_tweet_media(tmp_path, infile): - ref_fn = subdir / "tweets.20150430-223406.media.csv.ref" - outfn = tmp_path / "tweets.20150430-223406.media.csv" - json2csv_entities( - infile, - outfn, - ["id"], - "media", - ["media_url", "url"], - gzip_compress=False, - ) - - assert files_are_identical(outfn, ref_fn) - - -def test_tweet_url(tmp_path, infile): - ref_fn = subdir / "tweets.20150430-223406.url.csv.ref" - outfn = tmp_path / "tweets.20150430-223406.url.csv" - json2csv_entities( - infile, - outfn, - ["id"], - "urls", - ["url", "expanded_url"], - gzip_compress=False, - ) - - assert files_are_identical(outfn, ref_fn) - - -def test_userurl(tmp_path, infile): - ref_fn = subdir / "tweets.20150430-223406.userurl.csv.ref" - outfn = tmp_path / "tweets.20150430-223406.userurl.csv" - json2csv_entities( - infile, - outfn, - ["id", "screen_name"], - "user.urls", - ["url", "expanded_url"], - gzip_compress=False, - ) - - assert files_are_identical(outfn, ref_fn) - - -def test_tweet_place(tmp_path, infile): - ref_fn = subdir / "tweets.20150430-223406.place.csv.ref" - outfn = tmp_path / "tweets.20150430-223406.place.csv" - json2csv_entities( - infile, - outfn, - ["id", 
"text"], - "place", - ["name", "country"], - gzip_compress=False, - ) - - assert files_are_identical(outfn, ref_fn) - - -def test_tweet_place_boundingbox(tmp_path, infile): - ref_fn = subdir / "tweets.20150430-223406.placeboundingbox.csv.ref" - outfn = tmp_path / "tweets.20150430-223406.placeboundingbox.csv" - json2csv_entities( - infile, - outfn, - ["id", "name"], - "place.bounding_box", - ["coordinates"], - gzip_compress=False, - ) - - assert files_are_identical(outfn, ref_fn) - - -def test_retweet_original_tweet(tmp_path, infile): - ref_fn = subdir / "tweets.20150430-223406.retweet.csv.ref" - outfn = tmp_path / "tweets.20150430-223406.retweet.csv" - json2csv_entities( - infile, - outfn, - ["id"], - "retweeted_status", - [ - "created_at", - "favorite_count", - "id", - "in_reply_to_status_id", - "in_reply_to_user_id", - "retweet_count", - "text", - "truncated", - "user.id", - ], - gzip_compress=False, - ) - - assert files_are_identical(outfn, ref_fn) - - -def test_file_is_wrong(tmp_path, infile): - """ - Sanity check that file comparison is not giving false positives. - """ - ref_fn = subdir / "tweets.20150430-223406.retweet.csv.ref" - outfn = tmp_path / "tweets.20150430-223406.text.csv" - json2csv(infile, outfn, ["text"], gzip_compress=False) - assert not files_are_identical(outfn, ref_fn) diff --git a/pipeline/nltk/test/unit/test_json_serialization.py b/pipeline/nltk/test/unit/test_json_serialization.py deleted file mode 100644 index 1b9dc11b6eec30fe14e6cd419a853e8ce516cd62..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_json_serialization.py +++ /dev/null @@ -1,95 +0,0 @@ -import unittest - -from nltk.corpus import brown -from nltk.jsontags import JSONTaggedDecoder, JSONTaggedEncoder -from nltk.tag import ( - AffixTagger, - BigramTagger, - BrillTagger, - BrillTaggerTrainer, - DefaultTagger, - NgramTagger, - PerceptronTagger, - RegexpTagger, - TrigramTagger, - UnigramTagger, -) -from nltk.tag.brill import nltkdemo18 - - -class TestJSONSerialization(unittest.TestCase): - def setUp(self): - self.corpus = brown.tagged_sents()[:35] - self.decoder = JSONTaggedDecoder() - self.encoder = JSONTaggedEncoder() - self.default_tagger = DefaultTagger("NN") - - def test_default_tagger(self): - encoded = self.encoder.encode(self.default_tagger) - decoded = self.decoder.decode(encoded) - - self.assertEqual(repr(self.default_tagger), repr(decoded)) - self.assertEqual(self.default_tagger._tag, decoded._tag) - - def test_regexp_tagger(self): - tagger = RegexpTagger([(r".*", "NN")], backoff=self.default_tagger) - - encoded = self.encoder.encode(tagger) - decoded = self.decoder.decode(encoded) - - self.assertEqual(repr(tagger), repr(decoded)) - self.assertEqual(repr(tagger.backoff), repr(decoded.backoff)) - self.assertEqual(tagger._regexps, decoded._regexps) - - def test_affix_tagger(self): - tagger = AffixTagger(self.corpus, backoff=self.default_tagger) - - encoded = self.encoder.encode(tagger) - decoded = self.decoder.decode(encoded) - - self.assertEqual(repr(tagger), repr(decoded)) - self.assertEqual(repr(tagger.backoff), repr(decoded.backoff)) - self.assertEqual(tagger._affix_length, decoded._affix_length) - self.assertEqual(tagger._min_word_length, decoded._min_word_length) - self.assertEqual(tagger._context_to_tag, decoded._context_to_tag) - - def test_ngram_taggers(self): - unitagger = UnigramTagger(self.corpus, backoff=self.default_tagger) - bitagger = BigramTagger(self.corpus, backoff=unitagger) - tritagger = TrigramTagger(self.corpus, backoff=bitagger) - ntagger = 
NgramTagger(4, self.corpus, backoff=tritagger) - - encoded = self.encoder.encode(ntagger) - decoded = self.decoder.decode(encoded) - - self.assertEqual(repr(ntagger), repr(decoded)) - self.assertEqual(repr(tritagger), repr(decoded.backoff)) - self.assertEqual(repr(bitagger), repr(decoded.backoff.backoff)) - self.assertEqual(repr(unitagger), repr(decoded.backoff.backoff.backoff)) - self.assertEqual( - repr(self.default_tagger), repr(decoded.backoff.backoff.backoff.backoff) - ) - - def test_perceptron_tagger(self): - tagger = PerceptronTagger(load=False) - tagger.train(self.corpus) - - encoded = self.encoder.encode(tagger) - decoded = self.decoder.decode(encoded) - - self.assertEqual(tagger.model.weights, decoded.model.weights) - self.assertEqual(tagger.tagdict, decoded.tagdict) - self.assertEqual(tagger.classes, decoded.classes) - - def test_brill_tagger(self): - trainer = BrillTaggerTrainer( - self.default_tagger, nltkdemo18(), deterministic=True - ) - tagger = trainer.train(self.corpus, max_rules=30) - - encoded = self.encoder.encode(tagger) - decoded = self.decoder.decode(encoded) - - self.assertEqual(repr(tagger._initial_tagger), repr(decoded._initial_tagger)) - self.assertEqual(tagger._rules, decoded._rules) - self.assertEqual(tagger._training_stats, decoded._training_stats) diff --git a/pipeline/nltk/test/unit/test_metrics.py b/pipeline/nltk/test/unit/test_metrics.py deleted file mode 100644 index ab99d31d6a3255a388747487c969ece568324f52..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_metrics.py +++ /dev/null @@ -1,66 +0,0 @@ -import unittest - -from nltk.metrics import ( - BigramAssocMeasures, - QuadgramAssocMeasures, - TrigramAssocMeasures, -) - -## Test the likelihood ratio metric - -_DELTA = 1e-8 - - -class TestLikelihoodRatio(unittest.TestCase): - def test_lr_bigram(self): - self.assertAlmostEqual( - BigramAssocMeasures.likelihood_ratio(2, (4, 4), 20), - 2.4142743368419755, - delta=_DELTA, - ) - self.assertAlmostEqual( - BigramAssocMeasures.likelihood_ratio(1, (1, 1), 1), 0.0, delta=_DELTA - ) - self.assertRaises( - ValueError, - BigramAssocMeasures.likelihood_ratio, - *(0, (2, 2), 2), - ) - - def test_lr_trigram(self): - self.assertAlmostEqual( - TrigramAssocMeasures.likelihood_ratio(1, (1, 1, 1), (1, 1, 1), 2), - 5.545177444479562, - delta=_DELTA, - ) - self.assertAlmostEqual( - TrigramAssocMeasures.likelihood_ratio(1, (1, 1, 1), (1, 1, 1), 1), - 0.0, - delta=_DELTA, - ) - self.assertRaises( - ValueError, - TrigramAssocMeasures.likelihood_ratio, - *(1, (1, 1, 2), (1, 1, 2), 2), - ) - - def test_lr_quadgram(self): - self.assertAlmostEqual( - QuadgramAssocMeasures.likelihood_ratio( - 1, (1, 1, 1, 1), (1, 1, 1, 1, 1, 1), (1, 1, 1, 1), 2 - ), - 8.317766166719343, - delta=_DELTA, - ) - self.assertAlmostEqual( - QuadgramAssocMeasures.likelihood_ratio( - 1, (1, 1, 1, 1), (1, 1, 1, 1, 1, 1), (1, 1, 1, 1), 1 - ), - 0.0, - delta=_DELTA, - ) - self.assertRaises( - ValueError, - QuadgramAssocMeasures.likelihood_ratio, - *(1, (1, 1, 1, 1), (1, 1, 1, 1, 1, 2), (1, 1, 1, 1), 1), - ) diff --git a/pipeline/nltk/test/unit/test_naivebayes.py b/pipeline/nltk/test/unit/test_naivebayes.py deleted file mode 100644 index a5acf29ba05ed17da4179f6991256199dd739f63..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_naivebayes.py +++ /dev/null @@ -1,21 +0,0 @@ -import unittest - -from nltk.classify.naivebayes import NaiveBayesClassifier - - -class NaiveBayesClassifierTest(unittest.TestCase): - def test_simple(self): - training_features = [ - ({"nice": 
True, "good": True}, "positive"), - ({"bad": True, "mean": True}, "negative"), - ] - - classifier = NaiveBayesClassifier.train(training_features) - - result = classifier.prob_classify({"nice": True}) - self.assertTrue(result.prob("positive") > result.prob("negative")) - self.assertEqual(result.max(), "positive") - - result = classifier.prob_classify({"bad": True}) - self.assertTrue(result.prob("positive") < result.prob("negative")) - self.assertEqual(result.max(), "negative") diff --git a/pipeline/nltk/test/unit/test_nombank.py b/pipeline/nltk/test/unit/test_nombank.py deleted file mode 100644 index 395d7bb3cab90c00ae3775faa15092a343d929a2..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_nombank.py +++ /dev/null @@ -1,27 +0,0 @@ -""" -Unit tests for nltk.corpus.nombank -""" - -import unittest - -from nltk.corpus import nombank - -# Load the nombank once. -nombank.nouns() - - -class NombankDemo(unittest.TestCase): - def test_numbers(self): - # No. of instances. - self.assertEqual(len(nombank.instances()), 114574) - # No. of rolesets - self.assertEqual(len(nombank.rolesets()), 5577) - # No. of nouns. - self.assertEqual(len(nombank.nouns()), 4704) - - def test_instance(self): - self.assertEqual(nombank.instances()[0].roleset, "perc-sign.01") - - def test_framefiles_fileids(self): - self.assertEqual(len(nombank.fileids()), 4705) - self.assertTrue(all(fileid.endswith(".xml") for fileid in nombank.fileids())) diff --git a/pipeline/nltk/test/unit/test_pl196x.py b/pipeline/nltk/test/unit/test_pl196x.py deleted file mode 100644 index e175f8dc0061a983af011010e48fe9567c9d314a..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_pl196x.py +++ /dev/null @@ -1,13 +0,0 @@ -import unittest - -import nltk -from nltk.corpus.reader import pl196x - - -class TestCorpusViews(unittest.TestCase): - def test_corpus_reader(self): - pl196x_dir = nltk.data.find("corpora/pl196x") - pl = pl196x.Pl196xCorpusReader( - pl196x_dir, r".*\.xml", textids="textids.txt", cat_file="cats.txt" - ) - pl.tagged_words(fileids=pl.fileids(), categories="cats.txt") diff --git a/pipeline/nltk/test/unit/test_pos_tag.py b/pipeline/nltk/test/unit/test_pos_tag.py deleted file mode 100644 index 4e2dc20967969ec2cb38ea925906886ef50a7a69..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_pos_tag.py +++ /dev/null @@ -1,83 +0,0 @@ -""" -Tests for nltk.pos_tag -""" - - -import unittest - -from nltk import pos_tag, word_tokenize - - -class TestPosTag(unittest.TestCase): - def test_pos_tag_eng(self): - text = "John's big idea isn't all that bad." - expected_tagged = [ - ("John", "NNP"), - ("'s", "POS"), - ("big", "JJ"), - ("idea", "NN"), - ("is", "VBZ"), - ("n't", "RB"), - ("all", "PDT"), - ("that", "DT"), - ("bad", "JJ"), - (".", "."), - ] - assert pos_tag(word_tokenize(text)) == expected_tagged - - def test_pos_tag_eng_universal(self): - text = "John's big idea isn't all that bad." - expected_tagged = [ - ("John", "NOUN"), - ("'s", "PRT"), - ("big", "ADJ"), - ("idea", "NOUN"), - ("is", "VERB"), - ("n't", "ADV"), - ("all", "DET"), - ("that", "DET"), - ("bad", "ADJ"), - (".", "."), - ] - assert pos_tag(word_tokenize(text), tagset="universal") == expected_tagged - - def test_pos_tag_rus(self): - text = "Илья оторопел и дважды перечитал бумажку." 
- expected_tagged = [ - ("Илья", "S"), - ("оторопел", "V"), - ("и", "CONJ"), - ("дважды", "ADV"), - ("перечитал", "V"), - ("бумажку", "S"), - (".", "NONLEX"), - ] - assert pos_tag(word_tokenize(text), lang="rus") == expected_tagged - - def test_pos_tag_rus_universal(self): - text = "Илья оторопел и дважды перечитал бумажку." - expected_tagged = [ - ("Илья", "NOUN"), - ("оторопел", "VERB"), - ("и", "CONJ"), - ("дважды", "ADV"), - ("перечитал", "VERB"), - ("бумажку", "NOUN"), - (".", "."), - ] - assert ( - pos_tag(word_tokenize(text), tagset="universal", lang="rus") - == expected_tagged - ) - - def test_pos_tag_unknown_lang(self): - text = "모르겠 습니 다" - self.assertRaises(NotImplementedError, pos_tag, word_tokenize(text), lang="kor") - # Test for default kwarg, `lang=None` - self.assertRaises(NotImplementedError, pos_tag, word_tokenize(text), lang=None) - - def test_unspecified_lang(self): - # Tries to force the lang='eng' option. - text = "모르겠 습니 다" - expected_but_wrong = [("모르겠", "JJ"), ("습니", "NNP"), ("다", "NN")] - assert pos_tag(word_tokenize(text)) == expected_but_wrong diff --git a/pipeline/nltk/test/unit/test_ribes.py b/pipeline/nltk/test/unit/test_ribes.py deleted file mode 100644 index f1efcdad195766451423e721e2f09242e8bf7de5..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_ribes.py +++ /dev/null @@ -1,246 +0,0 @@ -from nltk.translate.ribes_score import corpus_ribes, word_rank_alignment - - -def test_ribes_empty_worder(): # worder as in word order - # Verifies that these two sentences have no alignment, - # and hence have the lowest possible RIBES score. - hyp = "This is a nice sentence which I quite like".split() - ref = "Okay well that's neat and all but the reference's different".split() - - assert word_rank_alignment(ref, hyp) == [] - - list_of_refs = [[ref]] - hypotheses = [hyp] - assert corpus_ribes(list_of_refs, hypotheses) == 0.0 - - -def test_ribes_one_worder(): - # Verifies that these two sentences have just one match, - # and the RIBES score for this sentence with very little - # correspondence is 0. - hyp = "This is a nice sentence which I quite like".split() - ref = "Okay well that's nice and all but the reference's different".split() - - assert word_rank_alignment(ref, hyp) == [3] - - list_of_refs = [[ref]] - hypotheses = [hyp] - assert corpus_ribes(list_of_refs, hypotheses) == 0.0 - - -def test_ribes_two_worder(): - # Verifies that these two sentences have two matches, - # but still get the lowest possible RIBES score due - # to the lack of similarity. 
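# --- Editor's illustrative sketch (not part of the original patch) ---
# The deleted RIBES tests nearby build a word-order alignment with
# word_rank_alignment() and then score nested reference/hypothesis lists with
# corpus_ribes(). A minimal standalone usage, assuming nltk is installed
# (the sentences are reused from the deleted test_ribes case below):
from nltk.translate.ribes_score import corpus_ribes, word_rank_alignment

ref = "he was interested in world history because he read the book".split()
hyp = "he read the book because he was interested in world history".split()
print(word_rank_alignment(ref, hyp))  # positions of each hypothesis word in the reference
print(corpus_ribes([[ref]], [hyp]))   # corpus-level RIBES score in [0, 1]
# --- end editor's sketch ---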
- hyp = "This is a nice sentence which I quite like".split() - ref = "Okay well that's nice and all but the reference is different".split() - - assert word_rank_alignment(ref, hyp) == [9, 3] - - list_of_refs = [[ref]] - hypotheses = [hyp] - assert corpus_ribes(list_of_refs, hypotheses) == 0.0 - - -def test_ribes(): - # Based on the doctest of the corpus_ribes function - hyp1 = [ - "It", - "is", - "a", - "guide", - "to", - "action", - "which", - "ensures", - "that", - "the", - "military", - "always", - "obeys", - "the", - "commands", - "of", - "the", - "party", - ] - ref1a = [ - "It", - "is", - "a", - "guide", - "to", - "action", - "that", - "ensures", - "that", - "the", - "military", - "will", - "forever", - "heed", - "Party", - "commands", - ] - ref1b = [ - "It", - "is", - "the", - "guiding", - "principle", - "which", - "guarantees", - "the", - "military", - "forces", - "always", - "being", - "under", - "the", - "command", - "of", - "the", - "Party", - ] - ref1c = [ - "It", - "is", - "the", - "practical", - "guide", - "for", - "the", - "army", - "always", - "to", - "heed", - "the", - "directions", - "of", - "the", - "party", - ] - - hyp2 = [ - "he", - "read", - "the", - "book", - "because", - "he", - "was", - "interested", - "in", - "world", - "history", - ] - ref2a = [ - "he", - "was", - "interested", - "in", - "world", - "history", - "because", - "he", - "read", - "the", - "book", - ] - - list_of_refs = [[ref1a, ref1b, ref1c], [ref2a]] - hypotheses = [hyp1, hyp2] - - score = corpus_ribes(list_of_refs, hypotheses) - - assert round(score, 4) == 0.3597 - - -def test_no_zero_div(): - # Regression test for Issue 2529, assure that no ZeroDivisionError is thrown. - hyp1 = [ - "It", - "is", - "a", - "guide", - "to", - "action", - "which", - "ensures", - "that", - "the", - "military", - "always", - "obeys", - "the", - "commands", - "of", - "the", - "party", - ] - ref1a = [ - "It", - "is", - "a", - "guide", - "to", - "action", - "that", - "ensures", - "that", - "the", - "military", - "will", - "forever", - "heed", - "Party", - "commands", - ] - ref1b = [ - "It", - "is", - "the", - "guiding", - "principle", - "which", - "guarantees", - "the", - "military", - "forces", - "always", - "being", - "under", - "the", - "command", - "of", - "the", - "Party", - ] - ref1c = [ - "It", - "is", - "the", - "practical", - "guide", - "for", - "the", - "army", - "always", - "to", - "heed", - "the", - "directions", - "of", - "the", - "party", - ] - - hyp2 = ["he", "read", "the"] - ref2a = ["he", "was", "interested", "in", "world", "history", "because", "he"] - - list_of_refs = [[ref1a, ref1b, ref1c], [ref2a]] - hypotheses = [hyp1, hyp2] - - score = corpus_ribes(list_of_refs, hypotheses) - - assert round(score, 4) == 0.1688 diff --git a/pipeline/nltk/test/unit/test_rte_classify.py b/pipeline/nltk/test/unit/test_rte_classify.py deleted file mode 100644 index 9bda6cb0b7abf95b252f3abb6f4ad534857b70b2..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_rte_classify.py +++ /dev/null @@ -1,94 +0,0 @@ -import pytest - -from nltk import config_megam -from nltk.classify.rte_classify import RTEFeatureExtractor, rte_classifier, rte_features -from nltk.corpus import rte as rte_corpus - -expected_from_rte_feature_extration = """ -alwayson => True -ne_hyp_extra => 0 -ne_overlap => 1 -neg_hyp => 0 -neg_txt => 0 -word_hyp_extra => 3 -word_overlap => 3 - -alwayson => True -ne_hyp_extra => 0 -ne_overlap => 1 -neg_hyp => 0 -neg_txt => 0 -word_hyp_extra => 2 -word_overlap => 1 - -alwayson => True -ne_hyp_extra 
=> 1 -ne_overlap => 1 -neg_hyp => 0 -neg_txt => 0 -word_hyp_extra => 1 -word_overlap => 2 - -alwayson => True -ne_hyp_extra => 1 -ne_overlap => 0 -neg_hyp => 0 -neg_txt => 0 -word_hyp_extra => 6 -word_overlap => 2 - -alwayson => True -ne_hyp_extra => 1 -ne_overlap => 0 -neg_hyp => 0 -neg_txt => 0 -word_hyp_extra => 4 -word_overlap => 0 - -alwayson => True -ne_hyp_extra => 1 -ne_overlap => 0 -neg_hyp => 0 -neg_txt => 0 -word_hyp_extra => 3 -word_overlap => 1 -""" - - -class TestRTEClassifier: - # Test the feature extraction method. - def test_rte_feature_extraction(self): - pairs = rte_corpus.pairs(["rte1_dev.xml"])[:6] - test_output = [ - f"{key:<15} => {rte_features(pair)[key]}" - for pair in pairs - for key in sorted(rte_features(pair)) - ] - expected_output = expected_from_rte_feature_extration.strip().split("\n") - # Remove null strings. - expected_output = list(filter(None, expected_output)) - assert test_output == expected_output - - # Test the RTEFeatureExtractor object. - def test_feature_extractor_object(self): - rtepair = rte_corpus.pairs(["rte3_dev.xml"])[33] - extractor = RTEFeatureExtractor(rtepair) - - assert extractor.hyp_words == {"member", "China", "SCO."} - assert extractor.overlap("word") == set() - assert extractor.overlap("ne") == {"China"} - assert extractor.hyp_extra("word") == {"member"} - - # Test the RTE classifier training. - def test_rte_classification_without_megam(self): - # Use a sample size for unit testing, since we - # don't need to fully train these classifiers - clf = rte_classifier("IIS", sample_N=100) - clf = rte_classifier("GIS", sample_N=100) - - def test_rte_classification_with_megam(self): - try: - config_megam() - except (LookupError, AttributeError) as e: - pytest.skip("Skipping tests with dependencies on MEGAM") - clf = rte_classifier("megam", sample_N=100) diff --git a/pipeline/nltk/test/unit/test_seekable_unicode_stream_reader.py b/pipeline/nltk/test/unit/test_seekable_unicode_stream_reader.py deleted file mode 100644 index e4a51e46d107a8c3e467ec2d2e88c964f7778883..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_seekable_unicode_stream_reader.py +++ /dev/null @@ -1,86 +0,0 @@ -import os -from io import BytesIO - -import pytest - -from nltk.corpus.reader import SeekableUnicodeStreamReader - - -def check_reader(unicode_string, encoding): - bytestr = unicode_string.encode(encoding) - stream = BytesIO(bytestr) - reader = SeekableUnicodeStreamReader(stream, encoding) - - # Should open at the start of the file - assert reader.tell() == 0 - - # Compare original string to contents from `.readlines()` - assert unicode_string == "".join(reader.readlines()) - - # Should be at the end of the file now - stream.seek(0, os.SEEK_END) - assert reader.tell() == stream.tell() - - reader.seek(0) # go back to start - - # Compare original string to contents from `.read()` - contents = "" - char = None - while char != "": - char = reader.read(1) - contents += char - assert unicode_string == contents - - -# Call `check_reader` with a variety of input strings and encodings. -ENCODINGS = ["ascii", "latin1", "greek", "hebrew", "utf-16", "utf-8"] - -STRINGS = [ - """ - This is a test file. - It is fairly short. - """, - "This file can be encoded with latin1. \x83", - """\ - This is a test file. - Here's a blank line: - - And here's some unicode: \xee \u0123 \uffe3 - """, - """\ - This is a test file. - Unicode characters: \xf3 \u2222 \u3333\u4444 \u5555 - """, - """\ - This is a larger file. 
It has some lines that are longer \ - than 72 characters. It's got lots of repetition. Here's \ - some unicode chars: \xee \u0123 \uffe3 \ueeee \u2345 - - How fun! Let's repeat it twenty times. - """ - * 20, -] - - -@pytest.mark.parametrize("string", STRINGS) -def test_reader(string): - for encoding in ENCODINGS: - # skip strings that can't be encoded with the current encoding - try: - string.encode(encoding) - except UnicodeEncodeError: - continue - check_reader(string, encoding) - - -def test_reader_stream_closes_when_deleted(): - reader = SeekableUnicodeStreamReader(BytesIO(b""), "ascii") - assert not reader.stream.closed - reader.__del__() - assert reader.stream.closed - - -def teardown_module(module=None): - import gc - - gc.collect() diff --git a/pipeline/nltk/test/unit/test_senna.py b/pipeline/nltk/test/unit/test_senna.py deleted file mode 100644 index 067f9e30c09a4b963b01cb0c825741a208005874..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_senna.py +++ /dev/null @@ -1,112 +0,0 @@ -""" -Unit tests for Senna -""" - -import unittest -from os import environ, path, sep - -from nltk.classify import Senna -from nltk.tag import SennaChunkTagger, SennaNERTagger, SennaTagger - -# Set Senna executable path for tests if it is not specified as an environment variable -if "SENNA" in environ: - SENNA_EXECUTABLE_PATH = path.normpath(environ["SENNA"]) + sep -else: - SENNA_EXECUTABLE_PATH = "/usr/share/senna-v3.0" - -senna_is_installed = path.exists(SENNA_EXECUTABLE_PATH) - - -@unittest.skipUnless(senna_is_installed, "Requires Senna executable") -class TestSennaPipeline(unittest.TestCase): - """Unittest for nltk.classify.senna""" - - def test_senna_pipeline(self): - """Senna pipeline interface""" - - pipeline = Senna(SENNA_EXECUTABLE_PATH, ["pos", "chk", "ner"]) - sent = "Dusseldorf is an international business center".split() - result = [ - (token["word"], token["chk"], token["ner"], token["pos"]) - for token in pipeline.tag(sent) - ] - expected = [ - ("Dusseldorf", "B-NP", "B-LOC", "NNP"), - ("is", "B-VP", "O", "VBZ"), - ("an", "B-NP", "O", "DT"), - ("international", "I-NP", "O", "JJ"), - ("business", "I-NP", "O", "NN"), - ("center", "I-NP", "O", "NN"), - ] - self.assertEqual(result, expected) - - -@unittest.skipUnless(senna_is_installed, "Requires Senna executable") -class TestSennaTagger(unittest.TestCase): - """Unittest for nltk.tag.senna""" - - def test_senna_tagger(self): - tagger = SennaTagger(SENNA_EXECUTABLE_PATH) - result = tagger.tag("What is the airspeed of an unladen swallow ?".split()) - expected = [ - ("What", "WP"), - ("is", "VBZ"), - ("the", "DT"), - ("airspeed", "NN"), - ("of", "IN"), - ("an", "DT"), - ("unladen", "NN"), - ("swallow", "NN"), - ("?", "."), - ] - self.assertEqual(result, expected) - - def test_senna_chunk_tagger(self): - chktagger = SennaChunkTagger(SENNA_EXECUTABLE_PATH) - result_1 = chktagger.tag("What is the airspeed of an unladen swallow ?".split()) - expected_1 = [ - ("What", "B-NP"), - ("is", "B-VP"), - ("the", "B-NP"), - ("airspeed", "I-NP"), - ("of", "B-PP"), - ("an", "B-NP"), - ("unladen", "I-NP"), - ("swallow", "I-NP"), - ("?", "O"), - ] - - result_2 = list(chktagger.bio_to_chunks(result_1, chunk_type="NP")) - expected_2 = [ - ("What", "0"), - ("the airspeed", "2-3"), - ("an unladen swallow", "5-6-7"), - ] - self.assertEqual(result_1, expected_1) - self.assertEqual(result_2, expected_2) - - def test_senna_ner_tagger(self): - nertagger = SennaNERTagger(SENNA_EXECUTABLE_PATH) - result_1 = nertagger.tag("Shakespeare theatre was in 
London .".split()) - expected_1 = [ - ("Shakespeare", "B-PER"), - ("theatre", "O"), - ("was", "O"), - ("in", "O"), - ("London", "B-LOC"), - (".", "O"), - ] - - result_2 = nertagger.tag("UN headquarters are in NY , USA .".split()) - expected_2 = [ - ("UN", "B-ORG"), - ("headquarters", "O"), - ("are", "O"), - ("in", "O"), - ("NY", "B-LOC"), - (",", "O"), - ("USA", "B-LOC"), - (".", "O"), - ] - self.assertEqual(result_1, expected_1) - self.assertEqual(result_2, expected_2) diff --git a/pipeline/nltk/test/unit/test_stem.py b/pipeline/nltk/test/unit/test_stem.py deleted file mode 100644 index 0b0b0404ece1cd64a7539967a2d36e8d80ab5cea..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_stem.py +++ /dev/null @@ -1,157 +0,0 @@ -import unittest -from contextlib import closing - -from nltk import data -from nltk.stem.porter import PorterStemmer -from nltk.stem.snowball import SnowballStemmer - - -class SnowballTest(unittest.TestCase): - def test_arabic(self): - """ - this unit testing for test the snowball arabic light stemmer - this stemmer deals with prefixes and suffixes - """ - # Test where the ignore_stopwords=True. - ar_stemmer = SnowballStemmer("arabic", True) - assert ar_stemmer.stem("الْعَرَبِــــــيَّة") == "عرب" - assert ar_stemmer.stem("العربية") == "عرب" - assert ar_stemmer.stem("فقالوا") == "قال" - assert ar_stemmer.stem("الطالبات") == "طالب" - assert ar_stemmer.stem("فالطالبات") == "طالب" - assert ar_stemmer.stem("والطالبات") == "طالب" - assert ar_stemmer.stem("الطالبون") == "طالب" - assert ar_stemmer.stem("اللذان") == "اللذان" - assert ar_stemmer.stem("من") == "من" - # Test where the ignore_stopwords=False. - ar_stemmer = SnowballStemmer("arabic", False) - assert ar_stemmer.stem("اللذان") == "اللذ" # this is a stop word - assert ar_stemmer.stem("الطالبات") == "طالب" - assert ar_stemmer.stem("الكلمات") == "كلم" - # test where create the arabic stemmer without given init value to ignore_stopwords - ar_stemmer = SnowballStemmer("arabic") - assert ar_stemmer.stem("الْعَرَبِــــــيَّة") == "عرب" - assert ar_stemmer.stem("العربية") == "عرب" - assert ar_stemmer.stem("فقالوا") == "قال" - assert ar_stemmer.stem("الطالبات") == "طالب" - assert ar_stemmer.stem("الكلمات") == "كلم" - - def test_russian(self): - stemmer_russian = SnowballStemmer("russian") - assert stemmer_russian.stem("авантненькая") == "авантненьк" - - def test_german(self): - stemmer_german = SnowballStemmer("german") - stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True) - - assert stemmer_german.stem("Schr\xe4nke") == "schrank" - assert stemmer_german2.stem("Schr\xe4nke") == "schrank" - - assert stemmer_german.stem("keinen") == "kein" - assert stemmer_german2.stem("keinen") == "keinen" - - def test_spanish(self): - stemmer = SnowballStemmer("spanish") - - assert stemmer.stem("Visionado") == "vision" - - # The word 'algue' was raising an IndexError - assert stemmer.stem("algue") == "algu" - - def test_short_strings_bug(self): - stemmer = SnowballStemmer("english") - assert stemmer.stem("y's") == "y" - - -class PorterTest(unittest.TestCase): - def _vocabulary(self): - with closing( - data.find("stemmers/porter_test/porter_vocabulary.txt").open( - encoding="utf-8" - ) - ) as fp: - return fp.read().splitlines() - - def _test_against_expected_output(self, stemmer_mode, expected_stems): - stemmer = PorterStemmer(mode=stemmer_mode) - for word, true_stem in zip(self._vocabulary(), expected_stems): - our_stem = stemmer.stem(word) - assert ( - our_stem == true_stem - ), "{} should stem to {} in {} 
mode but got {}".format( - word, - true_stem, - stemmer_mode, - our_stem, - ) - - def test_vocabulary_martin_mode(self): - """Tests all words from the test vocabulary provided by M Porter - - The sample vocabulary and output were sourced from - https://tartarus.org/martin/PorterStemmer/voc.txt and - https://tartarus.org/martin/PorterStemmer/output.txt - and are linked to from the Porter Stemmer algorithm's homepage - at https://tartarus.org/martin/PorterStemmer/ - """ - with closing( - data.find("stemmers/porter_test/porter_martin_output.txt").open( - encoding="utf-8" - ) - ) as fp: - self._test_against_expected_output( - PorterStemmer.MARTIN_EXTENSIONS, fp.read().splitlines() - ) - - def test_vocabulary_nltk_mode(self): - with closing( - data.find("stemmers/porter_test/porter_nltk_output.txt").open( - encoding="utf-8" - ) - ) as fp: - self._test_against_expected_output( - PorterStemmer.NLTK_EXTENSIONS, fp.read().splitlines() - ) - - def test_vocabulary_original_mode(self): - # The list of stems for this test was generated by taking the - # Martin-blessed stemmer from - # https://tartarus.org/martin/PorterStemmer/c.txt - # and removing all the --DEPARTURE-- sections from it and - # running it against Martin's test vocabulary. - - with closing( - data.find("stemmers/porter_test/porter_original_output.txt").open( - encoding="utf-8" - ) - ) as fp: - self._test_against_expected_output( - PorterStemmer.ORIGINAL_ALGORITHM, fp.read().splitlines() - ) - - self._test_against_expected_output( - PorterStemmer.ORIGINAL_ALGORITHM, - data.find("stemmers/porter_test/porter_original_output.txt") - .open(encoding="utf-8") - .read() - .splitlines(), - ) - - def test_oed_bug(self): - """Test for bug https://github.com/nltk/nltk/issues/1581 - - Ensures that 'oed' can be stemmed without throwing an error. - """ - assert PorterStemmer().stem("oed") == "o" - - def test_lowercase_option(self): - """Test for improvement on https://github.com/nltk/nltk/issues/2507 - - Ensures that stems are lowercased when `to_lowercase=True` - """ - porter = PorterStemmer() - assert porter.stem("On") == "on" - assert porter.stem("I") == "i" - assert porter.stem("I", to_lowercase=False) == "I" - assert porter.stem("Github") == "github" - assert porter.stem("Github", to_lowercase=False) == "Github" diff --git a/pipeline/nltk/test/unit/test_tag.py b/pipeline/nltk/test/unit/test_tag.py deleted file mode 100644 index 2074b1bbc5f11e06d6a30e741e6618888b7b7511..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_tag.py +++ /dev/null @@ -1,23 +0,0 @@ -def test_basic(): - from nltk.tag import pos_tag - from nltk.tokenize import word_tokenize - - result = pos_tag(word_tokenize("John's big idea isn't all that bad.")) - assert result == [ - ("John", "NNP"), - ("'s", "POS"), - ("big", "JJ"), - ("idea", "NN"), - ("is", "VBZ"), - ("n't", "RB"), - ("all", "PDT"), - ("that", "DT"), - ("bad", "JJ"), - (".", "."), - ] - - -def setup_module(module): - import pytest - - pytest.importorskip("numpy") diff --git a/pipeline/nltk/test/unit/test_tgrep.py b/pipeline/nltk/test/unit/test_tgrep.py deleted file mode 100644 index bf3c08bb7a034748f3e4b70273e8c171b45f4183..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_tgrep.py +++ /dev/null @@ -1,780 +0,0 @@ -#!/usr/bin/env python -# -# Natural Language Toolkit: TGrep search -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Will Roberts -# URL: -# For license information, see LICENSE.TXT - -""" -Unit tests for nltk.tgrep. 
-""" - - -import unittest - -from nltk import tgrep -from nltk.tree import ParentedTree - - -class TestSequenceFunctions(unittest.TestCase): - - """ - Class containing unit tests for nltk.tgrep. - """ - - def test_tokenize_simple(self): - """ - Simple test of tokenization. - """ - tokens = tgrep.tgrep_tokenize("A .. (B !< C . D) | ![<< (E , F) $ G]") - self.assertEqual( - tokens, - [ - "A", - "..", - "(", - "B", - "!", - "<", - "C", - ".", - "D", - ")", - "|", - "!", - "[", - "<<", - "(", - "E", - ",", - "F", - ")", - "$", - "G", - "]", - ], - ) - - def test_tokenize_encoding(self): - """ - Test that tokenization handles bytes and strs the same way. - """ - self.assertEqual( - tgrep.tgrep_tokenize(b"A .. (B !< C . D) | ![<< (E , F) $ G]"), - tgrep.tgrep_tokenize("A .. (B !< C . D) | ![<< (E , F) $ G]"), - ) - - def test_tokenize_link_types(self): - """ - Test tokenization of basic link types. - """ - self.assertEqual(tgrep.tgrep_tokenize("A<B"), ["A", "<", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A>B"), ["A", ">", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A<3B"), ["A", "<3", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A>3B"), ["A", ">3", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A<,B"), ["A", "<,", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A>,B"), ["A", ">,", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A<-3B"), ["A", "<-3", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A>-3B"), ["A", ">-3", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A<-B"), ["A", "<-", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A>-B"), ["A", ">-", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A<'B"), ["A", "<'", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A>'B"), ["A", ">'", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A<:B"), ["A", "<:", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A>:B"), ["A", ">:", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A<<B"), ["A", "<<", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A>>B"), ["A", ">>", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A<<,B"), ["A", "<<,", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A>>,B"), ["A", ">>,", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A<<'B"), ["A", "<<'", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A>>'B"), ["A", ">>'", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A<<:B"), ["A", "<<:", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A>>:B"), ["A", ">>:", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A.B"), ["A", ".", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A,B"), ["A", ",", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A..B"), ["A", "..", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A,,B"), ["A", ",,", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A$B"), ["A", "$", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A$.B"), ["A", "$.", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A$,B"), ["A", "$,", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A$..B"), ["A", "$..", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A$,,B"), ["A", "$,,", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!<B"), ["A", "!", "<", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!>B"), ["A", "!", ">", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!<3B"), ["A", "!", "<3", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!>3B"), ["A", "!", ">3", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!<,B"), ["A", "!", "<,", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!>,B"), ["A", "!", ">,", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!<-3B"), ["A", "!", "<-3", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!>-3B"), ["A", "!", ">-3", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!<-B"), ["A", "!", "<-", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!>-B"), ["A", "!", ">-", "B"]) -
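# --- Editor's illustrative sketch (not part of the original patch) ---
# The link-type tokens asserted above come from nltk.tgrep.tgrep_tokenize();
# tgrep_positions() evaluates a full pattern against ParentedTree objects and
# yields the matching tree positions for each tree. A minimal standalone usage,
# assuming nltk is installed:
from nltk import tgrep
from nltk.tree import ParentedTree

tree = ParentedTree.fromstring("(S (NP (DT the) (NN dog)) (VP barked))")
print(tgrep.tgrep_tokenize("NP < NN"))                 # ['NP', '<', 'NN']
print(list(tgrep.tgrep_positions("NP < NN", [tree])))  # [[(0,)]] -- the NP node
# --- end editor's sketch ---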
self.assertEqual(tgrep.tgrep_tokenize("A!<'B"), ["A", "!", "<'", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!>'B"), ["A", "!", ">'", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!<:B"), ["A", "!", "<:", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!>:B"), ["A", "!", ">:", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!<<B"), ["A", "!", "<<", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!>>B"), ["A", "!", ">>", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!<<,B"), ["A", "!", "<<,", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!>>,B"), ["A", "!", ">>,", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!<<'B"), ["A", "!", "<<'", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!>>'B"), ["A", "!", ">>'", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!<<:B"), ["A", "!", "<<:", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!>>:B"), ["A", "!", ">>:", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!.B"), ["A", "!", ".", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!,B"), ["A", "!", ",", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!..B"), ["A", "!", "..", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!,,B"), ["A", "!", ",,", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!$B"), ["A", "!", "$", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!$.B"), ["A", "!", "$.", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!$,B"), ["A", "!", "$,", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!$..B"), ["A", "!", "$..", "B"]) - self.assertEqual(tgrep.tgrep_tokenize("A!$,,B"), ["A", "!", "$,,", "B"]) - - def test_tokenize_examples(self): - """ - Test tokenization of the TGrep2 manual example patterns. - """ - self.assertEqual(tgrep.tgrep_tokenize("NP < PP"), ["NP", "<", "PP"]) - self.assertEqual(tgrep.tgrep_tokenize("/^NP/"), ["/^NP/"]) - self.assertEqual( - tgrep.tgrep_tokenize("NP << PP . VP"), ["NP", "<<", "PP", ".", "VP"] - ) - self.assertEqual( - tgrep.tgrep_tokenize("NP << PP | . VP"), ["NP", "<<", "PP", "|", ".", "VP"] - ) - self.assertEqual( - tgrep.tgrep_tokenize("NP !<< PP [> NP | >> VP]"), - ["NP", "!", "<<", "PP", "[", ">", "NP", "|", ">>", "VP", "]"], - ) - self.assertEqual( - tgrep.tgrep_tokenize("NP << (PP . VP)"), - ["NP", "<<", "(", "PP", ".", "VP", ")"], - ) - self.assertEqual( - tgrep.tgrep_tokenize("NP <' (PP <, (IN < on))"), - ["NP", "<'", "(", "PP", "<,", "(", "IN", "<", "on", ")", ")"], - ) - self.assertEqual( - tgrep.tgrep_tokenize("S < (A < B) < C"), - ["S", "<", "(", "A", "<", "B", ")", "<", "C"], - ) - self.assertEqual( - tgrep.tgrep_tokenize("S < ((A < B) < C)"), - ["S", "<", "(", "(", "A", "<", "B", ")", "<", "C", ")"], - ) - self.assertEqual( - tgrep.tgrep_tokenize("S < (A < B < C)"), - ["S", "<", "(", "A", "<", "B", "<", "C", ")"], - ) - self.assertEqual(tgrep.tgrep_tokenize("A3B"3B"', "<", "C"], - ) - - def test_tokenize_nodenames(self): - """ - Test tokenization of node names. - """ - self.assertEqual(tgrep.tgrep_tokenize("Robert"), ["Robert"]) - self.assertEqual(tgrep.tgrep_tokenize("/^[Bb]ob/"), ["/^[Bb]ob/"]) - self.assertEqual(tgrep.tgrep_tokenize("*"), ["*"]) - self.assertEqual(tgrep.tgrep_tokenize("__"), ["__"]) - # test tokenization of NLTK tree position syntax - self.assertEqual(tgrep.tgrep_tokenize("N()"), ["N(", ")"]) - self.assertEqual(tgrep.tgrep_tokenize("N(0,)"), ["N(", "0", ",", ")"]) - self.assertEqual(tgrep.tgrep_tokenize("N(0,0)"), ["N(", "0", ",", "0", ")"]) - self.assertEqual( - tgrep.tgrep_tokenize("N(0,0,)"), ["N(", "0", ",", "0", ",", ")"] - ) - - def test_tokenize_macros(self): - """ - Test tokenization of macro definitions.
- """ - self.assertEqual( - tgrep.tgrep_tokenize( - "@ NP /^NP/;\n@ NN /^NN/;\n@NP [!< NP | < @NN] !$.. @NN" - ), - [ - "@", - "NP", - "/^NP/", - ";", - "@", - "NN", - "/^NN/", - ";", - "@NP", - "[", - "!", - "<", - "NP", - "|", - "<", - "@NN", - "]", - "!", - "$..", - "@NN", - ], - ) - - def test_node_simple(self): - """ - Test a simple use of tgrep for finding nodes matching a given - pattern. - """ - tree = ParentedTree.fromstring( - "(S (NP (DT the) (JJ big) (NN dog)) " "(VP bit) (NP (DT a) (NN cat)))" - ) - self.assertEqual(list(tgrep.tgrep_positions("NN", [tree])), [[(0, 2), (2, 1)]]) - self.assertEqual( - list(tgrep.tgrep_nodes("NN", [tree])), [[tree[0, 2], tree[2, 1]]] - ) - self.assertEqual( - list(tgrep.tgrep_positions("NN|JJ", [tree])), [[(0, 1), (0, 2), (2, 1)]] - ) - - def test_node_printing(self): - """Test that the tgrep print operator ' is properly ignored.""" - tree = ParentedTree.fromstring("(S (n x) (N x))") - self.assertEqual( - list(tgrep.tgrep_positions("N", [tree])), - list(tgrep.tgrep_positions("'N", [tree])), - ) - self.assertEqual( - list(tgrep.tgrep_positions("/[Nn]/", [tree])), - list(tgrep.tgrep_positions("'/[Nn]/", [tree])), - ) - - def test_node_encoding(self): - """ - Test that tgrep search strings handles bytes and strs the same - way. - """ - tree = ParentedTree.fromstring( - "(S (NP (DT the) (JJ big) (NN dog)) " "(VP bit) (NP (DT a) (NN cat)))" - ) - self.assertEqual( - list(tgrep.tgrep_positions(b"NN", [tree])), - list(tgrep.tgrep_positions(b"NN", [tree])), - ) - self.assertEqual( - list(tgrep.tgrep_nodes(b"NN", [tree])), - list(tgrep.tgrep_nodes("NN", [tree])), - ) - self.assertEqual( - list(tgrep.tgrep_positions(b"NN|JJ", [tree])), - list(tgrep.tgrep_positions("NN|JJ", [tree])), - ) - - def test_node_nocase(self): - """ - Test selecting nodes using case insensitive node names. - """ - tree = ParentedTree.fromstring("(S (n x) (N x))") - self.assertEqual(list(tgrep.tgrep_positions('"N"', [tree])), [[(1,)]]) - self.assertEqual(list(tgrep.tgrep_positions('i@"N"', [tree])), [[(0,), (1,)]]) - - def test_node_quoted(self): - """ - Test selecting nodes using quoted node names. - """ - tree = ParentedTree.fromstring('(N ("N" x) (N" x) ("\\" x))') - self.assertEqual(list(tgrep.tgrep_positions('"N"', [tree])), [[()]]) - self.assertEqual(list(tgrep.tgrep_positions('"\\"N\\""', [tree])), [[(0,)]]) - self.assertEqual(list(tgrep.tgrep_positions('"N\\""', [tree])), [[(1,)]]) - self.assertEqual(list(tgrep.tgrep_positions('"\\"\\\\\\""', [tree])), [[(2,)]]) - - def test_node_regex(self): - """ - Test regex matching on nodes. - """ - tree = ParentedTree.fromstring("(S (NP-SBJ x) (NP x) (NNP x) (VP x))") - # This is a regular expression that matches any node whose - # name starts with NP, including NP-SBJ: - self.assertEqual(list(tgrep.tgrep_positions("/^NP/", [tree])), [[(0,), (1,)]]) - - def test_node_regex_2(self): - """ - Test regex matching on nodes. - """ - tree = ParentedTree.fromstring("(S (SBJ x) (SBJ1 x) (NP-SBJ x))") - self.assertEqual(list(tgrep.tgrep_positions("/^SBJ/", [tree])), [[(0,), (1,)]]) - # This is a regular expression that matches any node whose - # name includes SBJ, including NP-SBJ: - self.assertEqual( - list(tgrep.tgrep_positions("/SBJ/", [tree])), [[(0,), (1,), (2,)]] - ) - - def test_node_tree_position(self): - """ - Test matching on nodes based on NLTK tree position. 
- """ - tree = ParentedTree.fromstring("(S (NP-SBJ x) (NP x) (NNP x) (VP x))") - # test all tree positions that are not leaves - leaf_positions = {tree.leaf_treeposition(x) for x in range(len(tree.leaves()))} - tree_positions = [x for x in tree.treepositions() if x not in leaf_positions] - for position in tree_positions: - node_id = f"N{position}" - tgrep_positions = list(tgrep.tgrep_positions(node_id, [tree])) - self.assertEqual(len(tgrep_positions[0]), 1) - self.assertEqual(tgrep_positions[0][0], position) - - def test_node_noleaves(self): - """ - Test node name matching with the search_leaves flag set to False. - """ - tree = ParentedTree.fromstring("(S (A (T x)) (B (N x)))") - self.assertEqual( - list(tgrep.tgrep_positions("x", [tree])), [[(0, 0, 0), (1, 0, 0)]] - ) - self.assertEqual(list(tgrep.tgrep_positions("x", [tree], False)), [[]]) - - def tests_rel_dominance(self): - """ - Test matching nodes based on dominance relations. - """ - tree = ParentedTree.fromstring("(S (A (T x)) (B (N x)))") - self.assertEqual(list(tgrep.tgrep_positions("* < T", [tree])), [[(0,)]]) - self.assertEqual(list(tgrep.tgrep_positions("* < T > S", [tree])), [[(0,)]]) - self.assertEqual( - list(tgrep.tgrep_positions("* !< T", [tree])), - [[(), (0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]], - ) - self.assertEqual(list(tgrep.tgrep_positions("* !< T > S", [tree])), [[(1,)]]) - self.assertEqual(list(tgrep.tgrep_positions("* > A", [tree])), [[(0, 0)]]) - self.assertEqual(list(tgrep.tgrep_positions("* > B", [tree])), [[(1, 0)]]) - self.assertEqual( - list(tgrep.tgrep_positions("* !> B", [tree])), - [[(), (0,), (0, 0), (0, 0, 0), (1,), (1, 0, 0)]], - ) - self.assertEqual( - list(tgrep.tgrep_positions("* !> B >> S", [tree])), [[(0,), (0, 0), (1,)]] - ) - self.assertEqual( - list(tgrep.tgrep_positions("* >> S", [tree])), - [[(0,), (0, 0), (1,), (1, 0)]], - ) - self.assertEqual( - list(tgrep.tgrep_positions("* >>, S", [tree])), [[(0,), (0, 0)]] - ) - self.assertEqual( - list(tgrep.tgrep_positions("* >>' S", [tree])), [[(1,), (1, 0)]] - ) - # Known issue: - # self.assertEqual(list(tgrep.tgrep_positions('* !>> S', [tree])), - # [[()]]) - self.assertEqual(list(tgrep.tgrep_positions("* << T", [tree])), [[(), (0,)]]) - self.assertEqual(list(tgrep.tgrep_positions("* <<' T", [tree])), [[(0,)]]) - self.assertEqual(list(tgrep.tgrep_positions("* <<1 N", [tree])), [[(1,)]]) - self.assertEqual( - list(tgrep.tgrep_positions("* !<< T", [tree])), - [[(0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]], - ) - tree = ParentedTree.fromstring("(S (A (T x)) (B (T x) (N x )))") - self.assertEqual(list(tgrep.tgrep_positions("* <: T", [tree])), [[(0,)]]) - self.assertEqual(list(tgrep.tgrep_positions("* < T", [tree])), [[(0,), (1,)]]) - self.assertEqual( - list(tgrep.tgrep_positions("* !<: T", [tree])), - [[(), (0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0), (1, 1), (1, 1, 0)]], - ) - self.assertEqual(list(tgrep.tgrep_positions("* !<: T > S", [tree])), [[(1,)]]) - tree = ParentedTree.fromstring("(S (T (A x) (B x)) (T (C x)))") - self.assertEqual(list(tgrep.tgrep_positions("* >: T", [tree])), [[(1, 0)]]) - self.assertEqual( - list(tgrep.tgrep_positions("* !>: T", [tree])), - [[(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1,), (1, 0, 0)]], - ) - tree = ParentedTree.fromstring( - "(S (A (B (C (D (E (T x))))))" " (A (B (C (D (E (T x))) (N x)))))" - ) - self.assertEqual( - list(tgrep.tgrep_positions("* <<: T", [tree])), - [ - [ - (0,), - (0, 0), - (0, 0, 0), - (0, 0, 0, 0), - (0, 0, 0, 0, 0), - (1, 0, 0, 0), - (1, 0, 0, 0, 0), - ] - ], - ) - 
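# --- Editor's illustrative sketch (not part of the original patch) ---
# The deleted dominance tests above distinguish immediate dominance (<) from
# general dominance (<<). Reusing the first tree from that test and the expected
# values it records, assuming nltk is installed:
from nltk import tgrep
from nltk.tree import ParentedTree

tree = ParentedTree.fromstring("(S (A (T x)) (B (N x)))")
print(list(tgrep.tgrep_positions("* < T", [tree])))   # [[(0,)]]      only A immediately dominates T
print(list(tgrep.tgrep_positions("* << T", [tree])))  # [[(), (0,)]]  S and A both dominate T
# --- end editor's sketch ---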
self.assertEqual( - list(tgrep.tgrep_positions("* >>: A", [tree])), - [ - [ - (0, 0), - (0, 0, 0), - (0, 0, 0, 0), - (0, 0, 0, 0, 0), - (0, 0, 0, 0, 0, 0), - (1, 0), - (1, 0, 0), - ] - ], - ) - - def test_bad_operator(self): - """ - Test error handling of undefined tgrep operators. - """ - tree = ParentedTree.fromstring("(S (A (T x)) (B (N x)))") - self.assertRaises( - tgrep.TgrepException, list, tgrep.tgrep_positions("* >>> S", [tree]) - ) - - def test_comments(self): - """ - Test that comments are correctly filtered out of tgrep search - strings. - """ - tree = ParentedTree.fromstring("(S (NN x) (NP x) (NN x))") - search1 = """ - @ NP /^NP/; - @ NN /^NN/; - @NN - """ - self.assertEqual(list(tgrep.tgrep_positions(search1, [tree])), [[(0,), (2,)]]) - search2 = """ - # macros - @ NP /^NP/; - @ NN /^NN/; - - # search string - @NN - """ - self.assertEqual(list(tgrep.tgrep_positions(search2, [tree])), [[(0,), (2,)]]) - - def test_rel_sister_nodes(self): - """ - Test matching sister nodes in a tree. - """ - tree = ParentedTree.fromstring("(S (A x) (B x) (C x))") - self.assertEqual(list(tgrep.tgrep_positions("* $. B", [tree])), [[(0,)]]) - self.assertEqual(list(tgrep.tgrep_positions("* $.. B", [tree])), [[(0,)]]) - self.assertEqual(list(tgrep.tgrep_positions("* $, B", [tree])), [[(2,)]]) - self.assertEqual(list(tgrep.tgrep_positions("* $,, B", [tree])), [[(2,)]]) - self.assertEqual(list(tgrep.tgrep_positions("* $ B", [tree])), [[(0,), (2,)]]) - - def tests_rel_indexed_children(self): - """ - Test matching nodes based on their index in their parent node. - """ - tree = ParentedTree.fromstring("(S (A x) (B x) (C x))") - self.assertEqual(list(tgrep.tgrep_positions("* >, S", [tree])), [[(0,)]]) - self.assertEqual(list(tgrep.tgrep_positions("* >1 S", [tree])), [[(0,)]]) - self.assertEqual(list(tgrep.tgrep_positions("* >2 S", [tree])), [[(1,)]]) - self.assertEqual(list(tgrep.tgrep_positions("* >3 S", [tree])), [[(2,)]]) - self.assertEqual(list(tgrep.tgrep_positions("* >' S", [tree])), [[(2,)]]) - self.assertEqual(list(tgrep.tgrep_positions("* >-1 S", [tree])), [[(2,)]]) - self.assertEqual(list(tgrep.tgrep_positions("* >-2 S", [tree])), [[(1,)]]) - self.assertEqual(list(tgrep.tgrep_positions("* >-3 S", [tree])), [[(0,)]]) - tree = ParentedTree.fromstring( - "(S (D (A x) (B x) (C x)) (E (B x) (C x) (A x)) " "(F (C x) (A x) (B x)))" - ) - self.assertEqual(list(tgrep.tgrep_positions("* <, A", [tree])), [[(0,)]]) - self.assertEqual(list(tgrep.tgrep_positions("* <1 A", [tree])), [[(0,)]]) - self.assertEqual(list(tgrep.tgrep_positions("* <2 A", [tree])), [[(2,)]]) - self.assertEqual(list(tgrep.tgrep_positions("* <3 A", [tree])), [[(1,)]]) - self.assertEqual(list(tgrep.tgrep_positions("* <' A", [tree])), [[(1,)]]) - self.assertEqual(list(tgrep.tgrep_positions("* <-1 A", [tree])), [[(1,)]]) - self.assertEqual(list(tgrep.tgrep_positions("* <-2 A", [tree])), [[(2,)]]) - self.assertEqual(list(tgrep.tgrep_positions("* <-3 A", [tree])), [[(0,)]]) - - def test_rel_precedence(self): - """ - Test matching nodes based on precedence relations. - """ - tree = ParentedTree.fromstring( - "(S (NP (NP (PP x)) (NP (AP x)))" - " (VP (AP (X (PP x)) (Y (AP x))))" - " (NP (RC (NP (AP x)))))" - ) - self.assertEqual( - list(tgrep.tgrep_positions("* . X", [tree])), [[(0,), (0, 1), (0, 1, 0)]] - ) - self.assertEqual( - list(tgrep.tgrep_positions("* . Y", [tree])), [[(1, 0, 0), (1, 0, 0, 0)]] - ) - self.assertEqual( - list(tgrep.tgrep_positions("* .. 
X", [tree])), - [[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0)]], - ) - self.assertEqual( - list(tgrep.tgrep_positions("* .. Y", [tree])), - [[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1, 0, 0), (1, 0, 0, 0)]], - ) - self.assertEqual( - list(tgrep.tgrep_positions("* , X", [tree])), [[(1, 0, 1), (1, 0, 1, 0)]] - ) - self.assertEqual( - list(tgrep.tgrep_positions("* , Y", [tree])), - [[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]], - ) - self.assertEqual( - list(tgrep.tgrep_positions("* ,, X", [tree])), - [[(1, 0, 1), (1, 0, 1, 0), (2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]], - ) - self.assertEqual( - list(tgrep.tgrep_positions("* ,, Y", [tree])), - [[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]], - ) - - def test_examples(self): - """ - Test the Basic Examples from the TGrep2 manual. - """ - tree = ParentedTree.fromstring("(S (NP (AP x)) (NP (PP x)))") - # This matches any NP node that immediately dominates a PP: - self.assertEqual(list(tgrep.tgrep_positions("NP < PP", [tree])), [[(1,)]]) - - tree = ParentedTree.fromstring("(S (NP x) (VP x) (NP (PP x)) (VP x))") - # This matches an NP that dominates a PP and is immediately - # followed by a VP: - self.assertEqual(list(tgrep.tgrep_positions("NP << PP . VP", [tree])), [[(2,)]]) - - tree = ParentedTree.fromstring( - "(S (NP (AP x)) (NP (PP x)) " "(NP (DET x) (NN x)) (VP x))" - ) - # This matches an NP that dominates a PP or is immediately - # followed by a VP: - self.assertEqual( - list(tgrep.tgrep_positions("NP << PP | . VP", [tree])), [[(1,), (2,)]] - ) - - tree = ParentedTree.fromstring( - "(S (NP (NP (PP x)) (NP (AP x)))" - " (VP (AP (NP (PP x)) (NP (AP x))))" - " (NP (RC (NP (AP x)))))" - ) - # This matches an NP that does not dominate a PP. Also, the NP - # must either have a parent that is an NP or be dominated by a - # VP: - self.assertEqual( - list(tgrep.tgrep_positions("NP !<< PP [> NP | >> VP]", [tree])), - [[(0, 1), (1, 0, 1)]], - ) - - tree = ParentedTree.fromstring( - "(S (NP (AP (PP x) (VP x))) " "(NP (AP (PP x) (NP x))) (NP x))" - ) - # This matches an NP that dominates a PP which itself is - # immediately followed by a VP. Note the use of parentheses to - # group ". VP" with the PP rather than with the NP: - self.assertEqual( - list(tgrep.tgrep_positions("NP << (PP . VP)", [tree])), [[(0,)]] - ) - - tree = ParentedTree.fromstring( - "(S (NP (DET a) (NN cat) (PP (IN on) (NP x)))" - " (NP (DET a) (NN cat) (PP (IN on) (NP x)) (PP x))" - " (NP x))" - ) - # This matches an NP whose last child is a PP that begins with - # the preposition "on": - self.assertEqual( - list(tgrep.tgrep_positions("NP <' (PP <, (IN < on))", [tree])), [[(0,)]] - ) - - tree = ParentedTree.fromstring( - "(S (S (C x) (A (B x))) (S (C x) (A x)) " "(S (D x) (A (B x))))" - ) - # The following pattern matches an S which has a child A and - # another child that is a C and that the A has a child B: - self.assertEqual( - list(tgrep.tgrep_positions("S < (A < B) < C", [tree])), [[(0,)]] - ) - - tree = ParentedTree.fromstring( - "(S (S (A (B x) (C x))) (S (S (C x) (A (B x)))))" - ) - # However, this pattern means that S has child A and that A - # has children B and C: - self.assertEqual( - list(tgrep.tgrep_positions("S < ((A < B) < C)", [tree])), [[(0,)]] - ) - - # It is equivalent to this: - self.assertEqual( - list(tgrep.tgrep_positions("S < (A < B < C)", [tree])), [[(0,)]] - ) - - def test_use_macros(self): - """ - Test defining and using tgrep2 macros. 
- """ - tree = ParentedTree.fromstring( - "(VP (VB sold) (NP (DET the) " - "(NN heiress)) (NP (NN deed) (PREP to) " - "(NP (DET the) (NN school) (NN house))))" - ) - self.assertEqual( - list( - tgrep.tgrep_positions( - "@ NP /^NP/;\n@ NN /^NN/;\n@NP !< @NP !$.. @NN", [tree] - ) - ), - [[(1,), (2, 2)]], - ) - # use undefined macro @CNP - self.assertRaises( - tgrep.TgrepException, - list, - tgrep.tgrep_positions( - "@ NP /^NP/;\n@ NN /^NN/;\n@CNP !< @NP !$.. @NN", [tree] - ), - ) - - def test_tokenize_node_labels(self): - """Test tokenization of labeled nodes.""" - self.assertEqual( - tgrep.tgrep_tokenize("S < @SBJ < (@VP < (@VB $.. @OBJ))"), - [ - "S", - "<", - "@SBJ", - "<", - "(", - "@VP", - "<", - "(", - "@VB", - "$..", - "@OBJ", - ")", - ")", - ], - ) - self.assertEqual( - tgrep.tgrep_tokenize("S < @SBJ=s < (@VP=v < (@VB $.. @OBJ))"), - [ - "S", - "<", - "@SBJ", - "=", - "s", - "<", - "(", - "@VP", - "=", - "v", - "<", - "(", - "@VB", - "$..", - "@OBJ", - ")", - ")", - ], - ) - - def test_tokenize_segmented_patterns(self): - """Test tokenization of segmented patterns.""" - self.assertEqual( - tgrep.tgrep_tokenize("S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v"), - [ - "S", - "<", - "@SBJ", - "=", - "s", - "<", - "(", - "@VP", - "=", - "v", - "<", - "(", - "@VB", - "$..", - "@OBJ", - ")", - ")", - ":", - "=s", - "..", - "=v", - ], - ) - - def test_labeled_nodes(self): - """ - Test labeled nodes. - - Test case from Emily M. Bender. - """ - search = """ - # macros - @ SBJ /SBJ/; - @ VP /VP/; - @ VB /VB/; - @ VPoB /V[PB]/; - @ OBJ /OBJ/; - - # 1 svo - S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v""" - sent1 = ParentedTree.fromstring( - "(S (NP-SBJ I) (VP (VB eat) (NP-OBJ (NNS apples))))" - ) - sent2 = ParentedTree.fromstring( - "(S (VP (VB eat) (NP-OBJ (NNS apples))) (NP-SBJ I))" - ) - search_firsthalf = search.split("\n\n")[0] + "S < @SBJ < (@VP < (@VB $.. @OBJ))" - search_rewrite = "S < (/.*SBJ/ $.. (/VP/ < (/VB/ $.. /.*OBJ/)))" - - self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent1]))[0]) - self.assertTrue(list(tgrep.tgrep_positions(search, [sent1]))[0]) - self.assertTrue(list(tgrep.tgrep_positions(search_rewrite, [sent1]))[0]) - self.assertEqual( - list(tgrep.tgrep_positions(search, [sent1])), - list(tgrep.tgrep_positions(search_rewrite, [sent1])), - ) - self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent2]))[0]) - self.assertFalse(list(tgrep.tgrep_positions(search, [sent2]))[0]) - self.assertFalse(list(tgrep.tgrep_positions(search_rewrite, [sent2]))[0]) - self.assertEqual( - list(tgrep.tgrep_positions(search, [sent2])), - list(tgrep.tgrep_positions(search_rewrite, [sent2])), - ) - - def test_multiple_conjs(self): - """ - Test that multiple (3 or more) conjunctions of node relations are - handled properly. - """ - sent = ParentedTree.fromstring("((A (B b) (C c)) (A (B b) (C c) (D d)))") - # search = '(A < B < C < D)' - # search_tworels = '(A < B < C)' - self.assertEqual( - list(tgrep.tgrep_positions("(A < B < C < D)", [sent])), [[(1,)]] - ) - self.assertEqual( - list(tgrep.tgrep_positions("(A < B < C)", [sent])), [[(0,), (1,)]] - ) - - def test_trailing_semicolon(self): - """ - Test that semicolons at the end of a tgrep2 search string won't - cause a parse failure. 
- """ - tree = ParentedTree.fromstring( - "(S (NP (DT the) (JJ big) (NN dog)) " "(VP bit) (NP (DT a) (NN cat)))" - ) - self.assertEqual(list(tgrep.tgrep_positions("NN", [tree])), [[(0, 2), (2, 1)]]) - self.assertEqual(list(tgrep.tgrep_positions("NN;", [tree])), [[(0, 2), (2, 1)]]) - self.assertEqual( - list(tgrep.tgrep_positions("NN;;", [tree])), [[(0, 2), (2, 1)]] - ) diff --git a/pipeline/nltk/test/unit/test_tokenize.py b/pipeline/nltk/test/unit/test_tokenize.py deleted file mode 100644 index c88ee788565fce19f25697aeffec67896fada573..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_tokenize.py +++ /dev/null @@ -1,867 +0,0 @@ -""" -Unit tests for nltk.tokenize. -See also nltk/test/tokenize.doctest -""" -from typing import List, Tuple - -import pytest - -from nltk.tokenize import ( - LegalitySyllableTokenizer, - StanfordSegmenter, - SyllableTokenizer, - TreebankWordTokenizer, - TweetTokenizer, - punkt, - sent_tokenize, - word_tokenize, -) - - -def load_stanford_segmenter(): - try: - seg = StanfordSegmenter() - seg.default_config("ar") - seg.default_config("zh") - return True - except LookupError: - return False - - -check_stanford_segmenter = pytest.mark.skipif( - not load_stanford_segmenter(), - reason="NLTK was unable to find stanford-segmenter.jar.", -) - - -class TestTokenize: - def test_tweet_tokenizer(self): - """ - Test TweetTokenizer using words with special and accented characters. - """ - - tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True) - s9 = "@myke: Let's test these words: resumé España München français" - tokens = tokenizer.tokenize(s9) - expected = [ - ":", - "Let's", - "test", - "these", - "words", - ":", - "resumé", - "España", - "München", - "français", - ] - assert tokens == expected - - @pytest.mark.parametrize( - "test_input, expecteds", - [ - ( - "My text 0106404243030 is great text", - ( - ["My", "text", "01064042430", "30", "is", "great", "text"], - ["My", "text", "0106404243030", "is", "great", "text"], - ), - ), - ( - "My ticket id is 1234543124123", - ( - ["My", "ticket", "id", "is", "12345431241", "23"], - ["My", "ticket", "id", "is", "1234543124123"], - ), - ), - ( - "@remy: This is waaaaayyyy too much for you!!!!!! 01064042430", - ( - [ - ":", - "This", - "is", - "waaayyy", - "too", - "much", - "for", - "you", - "!", - "!", - "!", - "01064042430", - ], - [ - ":", - "This", - "is", - "waaayyy", - "too", - "much", - "for", - "you", - "!", - "!", - "!", - "01064042430", - ], - ), - ), - # Further tests from https://github.com/nltk/nltk/pull/2798#issuecomment-922533085, - # showing the TweetTokenizer performance for `match_phone_numbers=True` and - # `match_phone_numbers=False`. 
- ( - # Some phone numbers are always tokenized, even with `match_phone_numbers=`False` - "My number is 06-46124080, except it's not.", - ( - [ - "My", - "number", - "is", - "06-46124080", - ",", - "except", - "it's", - "not", - ".", - ], - [ - "My", - "number", - "is", - "06-46124080", - ",", - "except", - "it's", - "not", - ".", - ], - ), - ), - ( - # Phone number here is only tokenized correctly if `match_phone_numbers=True` - "My number is 601-984-4813, except it's not.", - ( - [ - "My", - "number", - "is", - "601-984-4813", - ",", - "except", - "it's", - "not", - ".", - ], - [ - "My", - "number", - "is", - "601-984-", - "4813", - ",", - "except", - "it's", - "not", - ".", - ], - ), - ), - ( - # Phone number here is only tokenized correctly if `match_phone_numbers=True` - "My number is (393) 928 -3010, except it's not.", - ( - [ - "My", - "number", - "is", - "(393) 928 -3010", - ",", - "except", - "it's", - "not", - ".", - ], - [ - "My", - "number", - "is", - "(", - "393", - ")", - "928", - "-", - "3010", - ",", - "except", - "it's", - "not", - ".", - ], - ), - ), - ( - # A long number is tokenized correctly only if `match_phone_numbers=False` - "The product identification number is 48103284512.", - ( - [ - "The", - "product", - "identification", - "number", - "is", - "4810328451", - "2", - ".", - ], - [ - "The", - "product", - "identification", - "number", - "is", - "48103284512", - ".", - ], - ), - ), - ( - # `match_phone_numbers=True` can have some unforeseen - "My favourite substraction is 240 - 1353.", - ( - ["My", "favourite", "substraction", "is", "240 - 1353", "."], - ["My", "favourite", "substraction", "is", "240", "-", "1353", "."], - ), - ), - ], - ) - def test_tweet_tokenizer_expanded( - self, test_input: str, expecteds: Tuple[List[str], List[str]] - ): - """ - Test `match_phone_numbers` in TweetTokenizer. - - Note that TweetTokenizer is also passed the following for these tests: - * strip_handles=True - * reduce_len=True - - :param test_input: The input string to tokenize using TweetTokenizer. - :type test_input: str - :param expecteds: A 2-tuple of tokenized sentences. The first of the two - tokenized is the expected output of tokenization with `match_phone_numbers=True`. - The second of the two tokenized lists is the expected output of tokenization - with `match_phone_numbers=False`. - :type expecteds: Tuple[List[str], List[str]] - """ - for match_phone_numbers, expected in zip([True, False], expecteds): - tokenizer = TweetTokenizer( - strip_handles=True, - reduce_len=True, - match_phone_numbers=match_phone_numbers, - ) - predicted = tokenizer.tokenize(test_input) - assert predicted == expected - - def test_sonority_sequencing_syllable_tokenizer(self): - """ - Test SyllableTokenizer tokenizer. - """ - tokenizer = SyllableTokenizer() - tokens = tokenizer.tokenize("justification") - assert tokens == ["jus", "ti", "fi", "ca", "tion"] - - def test_syllable_tokenizer_numbers(self): - """ - Test SyllableTokenizer tokenizer. - """ - tokenizer = SyllableTokenizer() - text = "9" * 10000 - tokens = tokenizer.tokenize(text) - assert tokens == [text] - - def test_legality_principle_syllable_tokenizer(self): - """ - Test LegalitySyllableTokenizer tokenizer. 
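(As a quick reference for the `TweetTokenizer` options exercised by the parametrized cases above, the following sketch reuses an input/expected pair from the test data; it assumes a standard `nltk` install rather than the vendored copy under `pipeline/nltk`.)

```python
from nltk.tokenize import TweetTokenizer

# Handles are stripped and character runs longer than three are reduced.
tok = TweetTokenizer(strip_handles=True, reduce_len=True)
text = "@remy: This is waaaaayyyy too much for you!!!!!! 01064042430"
print(tok.tokenize(text))
# [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!', '01064042430']
```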
- """ - from nltk.corpus import words - - test_word = "wonderful" - tokenizer = LegalitySyllableTokenizer(words.words()) - tokens = tokenizer.tokenize(test_word) - assert tokens == ["won", "der", "ful"] - - @check_stanford_segmenter - def test_stanford_segmenter_arabic(self): - """ - Test the Stanford Word Segmenter for Arabic (default config) - """ - seg = StanfordSegmenter() - seg.default_config("ar") - sent = "يبحث علم الحاسوب استخدام الحوسبة بجميع اشكالها لحل المشكلات" - segmented_sent = seg.segment(sent.split()) - assert segmented_sent.split() == [ - "يبحث", - "علم", - "الحاسوب", - "استخدام", - "الحوسبة", - "ب", - "جميع", - "اشكال", - "ها", - "ل", - "حل", - "المشكلات", - ] - - @check_stanford_segmenter - def test_stanford_segmenter_chinese(self): - """ - Test the Stanford Word Segmenter for Chinese (default config) - """ - seg = StanfordSegmenter() - seg.default_config("zh") - sent = "这是斯坦福中文分词器测试" - segmented_sent = seg.segment(sent.split()) - assert segmented_sent.split() == ["这", "是", "斯坦福", "中文", "分词器", "测试"] - - def test_phone_tokenizer(self): - """ - Test a string that resembles a phone number but contains a newline - """ - - # Should be recognized as a phone number, albeit one with multiple spaces - tokenizer = TweetTokenizer() - test1 = "(393) 928 -3010" - expected = ["(393) 928 -3010"] - result = tokenizer.tokenize(test1) - assert result == expected - - # Due to newline, first three elements aren't part of a phone number; - # fourth is - test2 = "(393)\n928 -3010" - expected = ["(", "393", ")", "928 -3010"] - result = tokenizer.tokenize(test2) - assert result == expected - - def test_emoji_tokenizer(self): - """ - Test a string that contains Emoji ZWJ Sequences and skin tone modifier - """ - tokenizer = TweetTokenizer() - - # A Emoji ZWJ Sequences, they together build as a single emoji, should not be split. - test1 = "👨‍👩‍👧‍👧" - expected = ["👨‍👩‍👧‍👧"] - result = tokenizer.tokenize(test1) - assert result == expected - - # A Emoji with skin tone modifier, the two characters build a single emoji, should not be split. - test2 = "👨🏿" - expected = ["👨🏿"] - result = tokenizer.tokenize(test2) - assert result == expected - - # A string containing both skin tone modifier and ZWJ Sequences - test3 = "🤔 🙈 me así, se😌 ds 💕👭👙 hello 👩🏾‍🎓 emoji hello 👨‍👩‍👦‍👦 how are 😊 you today🙅🏽🙅🏽" - expected = [ - "🤔", - "🙈", - "me", - "así", - ",", - "se", - "😌", - "ds", - "💕", - "👭", - "👙", - "hello", - "👩🏾\u200d🎓", - "emoji", - "hello", - "👨\u200d👩\u200d👦\u200d👦", - "how", - "are", - "😊", - "you", - "today", - "🙅🏽", - "🙅🏽", - ] - result = tokenizer.tokenize(test3) - assert result == expected - - # emoji flag sequences, including enclosed letter pairs - # Expected behavior from #3034 - test4 = "🇦🇵🇵🇱🇪" - expected = ["🇦🇵", "🇵🇱", "🇪"] - result = tokenizer.tokenize(test4) - assert result == expected - - test5 = "Hi 🇨🇦, 😍!!" - expected = ["Hi", "🇨🇦", ",", "😍", "!", "!"] - result = tokenizer.tokenize(test5) - assert result == expected - - test6 = "<3 🇨🇦 🤝 🇵🇱 <3" - expected = ["<3", "🇨🇦", "🤝", "🇵🇱", "<3"] - result = tokenizer.tokenize(test6) - assert result == expected - - def test_pad_asterisk(self): - """ - Test padding of asterisk for word tokenization. - """ - text = "This is a, *weird sentence with *asterisks in it." - expected = [ - "This", - "is", - "a", - ",", - "*", - "weird", - "sentence", - "with", - "*", - "asterisks", - "in", - "it", - ".", - ] - assert word_tokenize(text) == expected - - def test_pad_dotdot(self): - """ - Test padding of dotdot* for word tokenization. 
- """ - text = "Why did dotdot.. not get tokenized but dotdotdot... did? How about manydots....." - expected = [ - "Why", - "did", - "dotdot", - "..", - "not", - "get", - "tokenized", - "but", - "dotdotdot", - "...", - "did", - "?", - "How", - "about", - "manydots", - ".....", - ] - assert word_tokenize(text) == expected - - def test_remove_handle(self): - """ - Test remove_handle() from casual.py with specially crafted edge cases - """ - - tokenizer = TweetTokenizer(strip_handles=True) - - # Simple example. Handles with just numbers should be allowed - test1 = "@twitter hello @twi_tter_. hi @12345 @123news" - expected = ["hello", ".", "hi"] - result = tokenizer.tokenize(test1) - assert result == expected - - # Handles are allowed to follow any of the following characters - test2 = "@n`@n~@n(@n)@n-@n=@n+@n\\@n|@n[@n]@n{@n}@n;@n:@n'@n\"@n/@n?@n.@n,@n<@n>@n @n\n@n ñ@n.ü@n.ç@n." - expected = [ - "`", - "~", - "(", - ")", - "-", - "=", - "+", - "\\", - "|", - "[", - "]", - "{", - "}", - ";", - ":", - "'", - '"', - "/", - "?", - ".", - ",", - "<", - ">", - "ñ", - ".", - "ü", - ".", - "ç", - ".", - ] - result = tokenizer.tokenize(test2) - assert result == expected - - # Handles are NOT allowed to follow any of the following characters - test3 = "a@n j@n z@n A@n L@n Z@n 1@n 4@n 7@n 9@n 0@n _@n !@n @@n #@n $@n %@n &@n *@n" - expected = [ - "a", - "@n", - "j", - "@n", - "z", - "@n", - "A", - "@n", - "L", - "@n", - "Z", - "@n", - "1", - "@n", - "4", - "@n", - "7", - "@n", - "9", - "@n", - "0", - "@n", - "_", - "@n", - "!", - "@n", - "@", - "@n", - "#", - "@n", - "$", - "@n", - "%", - "@n", - "&", - "@n", - "*", - "@n", - ] - result = tokenizer.tokenize(test3) - assert result == expected - - # Handles are allowed to precede the following characters - test4 = "@n!a @n#a @n$a @n%a @n&a @n*a" - expected = ["!", "a", "#", "a", "$", "a", "%", "a", "&", "a", "*", "a"] - result = tokenizer.tokenize(test4) - assert result == expected - - # Tests interactions with special symbols and multiple @ - test5 = "@n!@n @n#@n @n$@n @n%@n @n&@n @n*@n @n@n @@n @n@@n @n_@n @n7@n @nj@n" - expected = [ - "!", - "@n", - "#", - "@n", - "$", - "@n", - "%", - "@n", - "&", - "@n", - "*", - "@n", - "@n", - "@n", - "@", - "@n", - "@n", - "@", - "@n", - "@n_", - "@n", - "@n7", - "@n", - "@nj", - "@n", - ] - result = tokenizer.tokenize(test5) - assert result == expected - - # Tests that handles can have a max length of 15 - test6 = "@abcdefghijklmnopqrstuvwxyz @abcdefghijklmno1234 @abcdefghijklmno_ @abcdefghijklmnoendofhandle" - expected = ["pqrstuvwxyz", "1234", "_", "endofhandle"] - result = tokenizer.tokenize(test6) - assert result == expected - - # Edge case where an @ comes directly after a long handle - test7 = "@abcdefghijklmnop@abcde @abcdefghijklmno@abcde @abcdefghijklmno_@abcde @abcdefghijklmno5@abcde" - expected = [ - "p", - "@abcde", - "@abcdefghijklmno", - "@abcde", - "_", - "@abcde", - "5", - "@abcde", - ] - result = tokenizer.tokenize(test7) - assert result == expected - - def test_treebank_span_tokenizer(self): - """ - Test TreebankWordTokenizer.span_tokenize function - """ - - tokenizer = TreebankWordTokenizer() - - # Test case in the docstring - test1 = "Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks)." 
- expected = [ - (0, 4), - (5, 12), - (13, 17), - (18, 19), - (19, 23), - (24, 26), - (27, 30), - (31, 32), - (32, 36), - (36, 37), - (37, 38), - (40, 46), - (47, 48), - (48, 51), - (51, 52), - (53, 55), - (56, 59), - (60, 62), - (63, 68), - (69, 70), - (70, 76), - (76, 77), - (77, 78), - ] - result = list(tokenizer.span_tokenize(test1)) - assert result == expected - - # Test case with double quotation - test2 = 'The DUP is similar to the "religious right" in the United States and takes a hardline stance on social issues' - expected = [ - (0, 3), - (4, 7), - (8, 10), - (11, 18), - (19, 21), - (22, 25), - (26, 27), - (27, 36), - (37, 42), - (42, 43), - (44, 46), - (47, 50), - (51, 57), - (58, 64), - (65, 68), - (69, 74), - (75, 76), - (77, 85), - (86, 92), - (93, 95), - (96, 102), - (103, 109), - ] - result = list(tokenizer.span_tokenize(test2)) - assert result == expected - - # Test case with double qoutation as well as converted quotations - test3 = "The DUP is similar to the \"religious right\" in the United States and takes a ``hardline'' stance on social issues" - expected = [ - (0, 3), - (4, 7), - (8, 10), - (11, 18), - (19, 21), - (22, 25), - (26, 27), - (27, 36), - (37, 42), - (42, 43), - (44, 46), - (47, 50), - (51, 57), - (58, 64), - (65, 68), - (69, 74), - (75, 76), - (77, 79), - (79, 87), - (87, 89), - (90, 96), - (97, 99), - (100, 106), - (107, 113), - ] - result = list(tokenizer.span_tokenize(test3)) - assert result == expected - - def test_word_tokenize(self): - """ - Test word_tokenize function - """ - - sentence = "The 'v', I've been fooled but I'll seek revenge." - expected = [ - "The", - "'", - "v", - "'", - ",", - "I", - "'ve", - "been", - "fooled", - "but", - "I", - "'ll", - "seek", - "revenge", - ".", - ] - assert word_tokenize(sentence) == expected - - sentence = "'v' 're'" - expected = ["'", "v", "'", "'re", "'"] - assert word_tokenize(sentence) == expected - - def test_punkt_pair_iter(self): - - test_cases = [ - ("12", [("1", "2"), ("2", None)]), - ("123", [("1", "2"), ("2", "3"), ("3", None)]), - ("1234", [("1", "2"), ("2", "3"), ("3", "4"), ("4", None)]), - ] - - for (test_input, expected_output) in test_cases: - actual_output = [x for x in punkt._pair_iter(test_input)] - - assert actual_output == expected_output - - def test_punkt_pair_iter_handles_stop_iteration_exception(self): - # test input to trigger StopIteration from next() - it = iter([]) - # call method under test and produce a generator - gen = punkt._pair_iter(it) - # unpack generator, ensure that no error is raised - list(gen) - - def test_punkt_tokenize_words_handles_stop_iteration_exception(self): - obj = punkt.PunktBaseClass() - - class TestPunktTokenizeWordsMock: - def word_tokenize(self, s): - return iter([]) - - obj._lang_vars = TestPunktTokenizeWordsMock() - # unpack generator, ensure that no error is raised - list(obj._tokenize_words("test")) - - def test_punkt_tokenize_custom_lang_vars(self): - - # Create LangVars including a full stop end character as used in Bengali - class BengaliLanguageVars(punkt.PunktLanguageVars): - sent_end_chars = (".", "?", "!", "\u0964") - - obj = punkt.PunktSentenceTokenizer(lang_vars=BengaliLanguageVars()) - - # We now expect these sentences to be split up into the individual sentences - sentences = "উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন। অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’ উপস্থিত ছিলেন। এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর 
জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।" - expected = [ - "উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন।", - "অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’ উপস্থিত ছিলেন।", - "এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।", - ] - - assert obj.tokenize(sentences) == expected - - def test_punkt_tokenize_no_custom_lang_vars(self): - - obj = punkt.PunktSentenceTokenizer() - - # We expect these sentences to not be split properly, as the Bengali full stop '।' is not included in the default language vars - sentences = "উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন। অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’ উপস্থিত ছিলেন। এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।" - expected = [ - "উপরাষ্ট্রপতি শ্রী এম ভেঙ্কাইয়া নাইডু সোমবার আই আই টি দিল্লির হীরক জয়ন্তী উদযাপনের উদ্বোধন করেছেন। অনলাইনের মাধ্যমে এই অনুষ্ঠানে কেন্দ্রীয় মানব সম্পদ উন্নয়নমন্ত্রী শ্রী রমেশ পোখরিয়াল ‘নিশাঙ্ক’ উপস্থিত ছিলেন। এই উপলক্ষ্যে উপরাষ্ট্রপতি হীরকজয়ন্তীর লোগো এবং ২০৩০-এর জন্য প্রতিষ্ঠানের লক্ষ্য ও পরিকল্পনার নথি প্রকাশ করেছেন।" - ] - - assert obj.tokenize(sentences) == expected - - @pytest.mark.parametrize( - "input_text,n_sents,n_splits,lang_vars", - [ - # Test debug_decisions on a text with two sentences, split by a dot. - ("Subject: Some subject. Attachments: Some attachments", 2, 1), - # The sentence should be split into two sections, - # with one split and hence one decision. - # Test debug_decisions on a text with two sentences, split by an exclamation mark. - ("Subject: Some subject! Attachments: Some attachments", 2, 1), - # The sentence should be split into two sections, - # with one split and hence one decision. - # Test debug_decisions on a text with one sentences, - # which is not split. - ("This is just a normal sentence, just like any other.", 1, 0) - # Hence just 1 - ], - ) - def punkt_debug_decisions(self, input_text, n_sents, n_splits, lang_vars=None): - tokenizer = punkt.PunktSentenceTokenizer() - if lang_vars != None: - tokenizer._lang_vars = lang_vars - - assert len(tokenizer.tokenize(input_text)) == n_sents - assert len(list(tokenizer.debug_decisions(input_text))) == n_splits - - def test_punkt_debug_decisions_custom_end(self): - # Test debug_decisions on a text with two sentences, - # split by a custom end character, based on Issue #2519 - class ExtLangVars(punkt.PunktLanguageVars): - sent_end_chars = (".", "?", "!", "^") - - self.punkt_debug_decisions( - "Subject: Some subject^ Attachments: Some attachments", - n_sents=2, - n_splits=1, - lang_vars=ExtLangVars(), - ) - # The sentence should be split into two sections, - # with one split and hence one decision. - - @pytest.mark.parametrize( - "sentences, expected", - [ - ( - "this is a test. . new sentence.", - ["this is a test.", ".", "new sentence."], - ), - ("This. . . That", ["This.", ".", ".", "That"]), - ("This..... That", ["This..... That"]), - ("This... That", ["This... That"]), - ("This.. . That", ["This.. .", "That"]), - ("This. .. That", ["This.", ".. That"]), - ("This. ,. That", ["This.", ",.", "That"]), - ("This!!! That", ["This!!!", "That"]), - ("This! That", ["This!", "That"]), - ( - "1. This is R .\n2. This is A .\n3. 
That's all", - ["1.", "This is R .", "2.", "This is A .", "3.", "That's all"], - ), - ( - "1. This is R .\t2. This is A .\t3. That's all", - ["1.", "This is R .", "2.", "This is A .", "3.", "That's all"], - ), - ("Hello.\tThere", ["Hello.", "There"]), - ], - ) - def test_sent_tokenize(self, sentences: str, expected: List[str]): - assert sent_tokenize(sentences) == expected diff --git a/pipeline/nltk/test/unit/test_twitter_auth.py b/pipeline/nltk/test/unit/test_twitter_auth.py deleted file mode 100644 index 5f9a830a0ad0158c6bba26364189fd8ad19907f8..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_twitter_auth.py +++ /dev/null @@ -1,77 +0,0 @@ -""" -Tests for static parts of Twitter package -""" - -import os - -import pytest - -pytest.importorskip("twython") - -from nltk.twitter import Authenticate - - -@pytest.fixture -def auth(): - return Authenticate() - - -class TestCredentials: - """ - Tests that Twitter credentials from a file are handled correctly. - """ - - @classmethod - def setup_class(self): - self.subdir = os.path.join(os.path.dirname(__file__), "files") - os.environ["TWITTER"] = "twitter-files" - - def test_environment(self, auth): - """ - Test that environment variable has been read correctly. - """ - fn = os.path.basename(auth.creds_subdir) - assert fn == os.environ["TWITTER"] - - @pytest.mark.parametrize( - "kwargs", - [ - # Each of the following scenarios should raise an error: - # An empty subdir path - {"subdir": ""}, - # A subdir path of None - {"subdir": None}, - # A nonexistent directory - {"subdir": "/nosuchdir"}, - # 'credentials.txt' is not in default subdir, as read from `os.environ['TWITTER']` - {}, - # Nonexistent credentials file ('foobar') - {"creds_file": "foobar"}, - # 'bad_oauth1-1.txt' is incomplete - {"creds_file": "bad_oauth1-1.txt"}, - # The first key in credentials file 'bad_oauth1-2.txt' is ill-formed - {"creds_file": "bad_oauth1-2.txt"}, - # The first two lines in 'bad_oauth1-3.txt' are collapsed - {"creds_file": "bad_oauth1-3.txt"}, - ], - ) - def test_scenarios_that_should_raise_errors(self, kwargs, auth): - """Various scenarios that should raise errors""" - try: - auth.load_creds(**kwargs) - # raises ValueError (zero length field name in format) for python 2.6 - # OSError for the rest - except (OSError, ValueError): - pass - except Exception as e: - pytest.fail("Unexpected exception thrown: %s" % e) - else: - pytest.fail("OSError exception not thrown.") - - def test_correct_file(self, auth): - """Test that a proper file succeeds and is read correctly""" - oauth = auth.load_creds(subdir=self.subdir) - - assert auth.creds_fullpath == os.path.join(self.subdir, auth.creds_file) - assert auth.creds_file == "credentials.txt" - assert oauth["app_key"] == "a" diff --git a/pipeline/nltk/test/unit/test_util.py b/pipeline/nltk/test/unit/test_util.py deleted file mode 100644 index 31bb8611d34e52c62a80c459acad93e4d9fe3782..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_util.py +++ /dev/null @@ -1,82 +0,0 @@ -import pytest - -from nltk.util import everygrams - - -@pytest.fixture -def everygram_input(): - """Form test data for tests.""" - return iter(["a", "b", "c"]) - - -def test_everygrams_without_padding(everygram_input): - expected_output = [ - ("a",), - ("a", "b"), - ("a", "b", "c"), - ("b",), - ("b", "c"), - ("c",), - ] - output = list(everygrams(everygram_input)) - assert output == expected_output - - -def test_everygrams_max_len(everygram_input): - expected_output = [ - ("a",), - ("a", "b"), - ("b",), 
- ("b", "c"), - ("c",), - ] - output = list(everygrams(everygram_input, max_len=2)) - assert output == expected_output - - -def test_everygrams_min_len(everygram_input): - expected_output = [ - ("a", "b"), - ("a", "b", "c"), - ("b", "c"), - ] - output = list(everygrams(everygram_input, min_len=2)) - assert output == expected_output - - -def test_everygrams_pad_right(everygram_input): - expected_output = [ - ("a",), - ("a", "b"), - ("a", "b", "c"), - ("b",), - ("b", "c"), - ("b", "c", None), - ("c",), - ("c", None), - ("c", None, None), - (None,), - (None, None), - (None,), - ] - output = list(everygrams(everygram_input, max_len=3, pad_right=True)) - assert output == expected_output - - -def test_everygrams_pad_left(everygram_input): - expected_output = [ - (None,), - (None, None), - (None, None, "a"), - (None,), - (None, "a"), - (None, "a", "b"), - ("a",), - ("a", "b"), - ("a", "b", "c"), - ("b",), - ("b", "c"), - ("c",), - ] - output = list(everygrams(everygram_input, max_len=3, pad_left=True)) - assert output == expected_output diff --git a/pipeline/nltk/test/unit/test_wordnet.py b/pipeline/nltk/test/unit/test_wordnet.py deleted file mode 100644 index d4039e749c76dceacbf239beca01cd403a79e03f..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/test_wordnet.py +++ /dev/null @@ -1,240 +0,0 @@ -""" -Unit tests for nltk.corpus.wordnet -See also nltk/test/wordnet.doctest -""" -import unittest - -from nltk.corpus import wordnet as wn -from nltk.corpus import wordnet_ic as wnic - -wn.ensure_loaded() -S = wn.synset -L = wn.lemma - - -class WordnNetDemo(unittest.TestCase): - def test_retrieve_synset(self): - move_synset = S("go.v.21") - self.assertEqual(move_synset.name(), "move.v.15") - self.assertEqual(move_synset.lemma_names(), ["move", "go"]) - self.assertEqual( - move_synset.definition(), "have a turn; make one's move in a game" - ) - self.assertEqual(move_synset.examples(), ["Can I go now?"]) - - def test_retrieve_synsets(self): - self.assertEqual(sorted(wn.synsets("zap", pos="n")), [S("zap.n.01")]) - self.assertEqual( - sorted(wn.synsets("zap", pos="v")), - [S("microwave.v.01"), S("nuke.v.01"), S("zap.v.01"), S("zap.v.02")], - ) - - def test_hyperhyponyms(self): - # Not every synset as hypernyms() - self.assertEqual(S("travel.v.01").hypernyms(), []) - self.assertEqual(S("travel.v.02").hypernyms(), [S("travel.v.03")]) - self.assertEqual(S("travel.v.03").hypernyms(), []) - - # Test hyper-/hyponyms. 
- self.assertEqual(S("breakfast.n.1").hypernyms(), [S("meal.n.01")]) - first_five_meal_hypo = [ - S("banquet.n.02"), - S("bite.n.04"), - S("breakfast.n.01"), - S("brunch.n.01"), - S("buffet.n.02"), - ] - self.assertEqual(sorted(S("meal.n.1").hyponyms()[:5]), first_five_meal_hypo) - self.assertEqual(S("Austen.n.1").instance_hypernyms(), [S("writer.n.01")]) - first_five_composer_hypo = [ - S("ambrose.n.01"), - S("bach.n.01"), - S("barber.n.01"), - S("bartok.n.01"), - S("beethoven.n.01"), - ] - self.assertEqual( - S("composer.n.1").instance_hyponyms()[:5], first_five_composer_hypo - ) - - # Test root hyper-/hyponyms - self.assertEqual(S("person.n.01").root_hypernyms(), [S("entity.n.01")]) - self.assertEqual(S("sail.v.01").root_hypernyms(), [S("travel.v.01")]) - self.assertEqual( - S("fall.v.12").root_hypernyms(), [S("act.v.01"), S("fall.v.17")] - ) - - def test_derivationally_related_forms(self): - # Test `derivationally_related_forms()` - self.assertEqual( - L("zap.v.03.nuke").derivationally_related_forms(), - [L("atomic_warhead.n.01.nuke")], - ) - self.assertEqual( - L("zap.v.03.atomize").derivationally_related_forms(), - [L("atomization.n.02.atomization")], - ) - self.assertEqual( - L("zap.v.03.atomise").derivationally_related_forms(), - [L("atomization.n.02.atomisation")], - ) - self.assertEqual(L("zap.v.03.zap").derivationally_related_forms(), []) - - def test_meronyms_holonyms(self): - # Test meronyms, holonyms. - self.assertEqual( - S("dog.n.01").member_holonyms(), [S("canis.n.01"), S("pack.n.06")] - ) - self.assertEqual(S("dog.n.01").part_meronyms(), [S("flag.n.07")]) - - self.assertEqual(S("faculty.n.2").member_meronyms(), [S("professor.n.01")]) - self.assertEqual(S("copilot.n.1").member_holonyms(), [S("crew.n.01")]) - - self.assertEqual( - S("table.n.2").part_meronyms(), - [S("leg.n.03"), S("tabletop.n.01"), S("tableware.n.01")], - ) - self.assertEqual(S("course.n.7").part_holonyms(), [S("meal.n.01")]) - - self.assertEqual( - S("water.n.1").substance_meronyms(), [S("hydrogen.n.01"), S("oxygen.n.01")] - ) - self.assertEqual( - S("gin.n.1").substance_holonyms(), - [ - S("gin_and_it.n.01"), - S("gin_and_tonic.n.01"), - S("martini.n.01"), - S("pink_lady.n.01"), - ], - ) - - def test_antonyms(self): - # Test antonyms. - self.assertEqual( - L("leader.n.1.leader").antonyms(), [L("follower.n.01.follower")] - ) - self.assertEqual( - L("increase.v.1.increase").antonyms(), [L("decrease.v.01.decrease")] - ) - - def test_misc_relations(self): - # Test misc relations. - self.assertEqual(S("snore.v.1").entailments(), [S("sleep.v.01")]) - self.assertEqual( - S("heavy.a.1").similar_tos(), - [ - S("dense.s.03"), - S("doughy.s.01"), - S("heavier-than-air.s.01"), - S("hefty.s.02"), - S("massive.s.04"), - S("non-buoyant.s.01"), - S("ponderous.s.02"), - ], - ) - self.assertEqual(S("light.a.1").attributes(), [S("weight.n.01")]) - self.assertEqual(S("heavy.a.1").attributes(), [S("weight.n.01")]) - - # Test pertainyms. - self.assertEqual( - L("English.a.1.English").pertainyms(), [L("england.n.01.England")] - ) - - def test_lch(self): - # Test LCH. - self.assertEqual( - S("person.n.01").lowest_common_hypernyms(S("dog.n.01")), - [S("organism.n.01")], - ) - self.assertEqual( - S("woman.n.01").lowest_common_hypernyms(S("girlfriend.n.02")), - [S("woman.n.01")], - ) - - def test_domains(self): - # Test domains. 
- self.assertEqual(S("code.n.03").topic_domains(), [S("computer_science.n.01")]) - self.assertEqual(S("pukka.a.01").region_domains(), [S("india.n.01")]) - self.assertEqual(S("freaky.a.01").usage_domains(), [S("slang.n.02")]) - - def test_in_topic_domains(self): - # Test in domains. - self.assertEqual( - S("computer_science.n.01").in_topic_domains()[0], S("access.n.05") - ) - self.assertEqual(S("germany.n.01").in_region_domains()[23], S("trillion.n.02")) - self.assertEqual(S("slang.n.02").in_usage_domains()[1], S("airhead.n.01")) - - def test_wordnet_similarities(self): - # Path based similarities. - self.assertAlmostEqual(S("cat.n.01").path_similarity(S("cat.n.01")), 1.0) - self.assertAlmostEqual(S("dog.n.01").path_similarity(S("cat.n.01")), 0.2) - self.assertAlmostEqual( - S("car.n.01").path_similarity(S("automobile.v.01")), - S("automobile.v.01").path_similarity(S("car.n.01")), - ) - self.assertAlmostEqual( - S("big.a.01").path_similarity(S("dog.n.01")), - S("dog.n.01").path_similarity(S("big.a.01")), - ) - self.assertAlmostEqual( - S("big.a.01").path_similarity(S("long.a.01")), - S("long.a.01").path_similarity(S("big.a.01")), - ) - self.assertAlmostEqual( - S("dog.n.01").lch_similarity(S("cat.n.01")), 2.028, places=3 - ) - self.assertAlmostEqual( - S("dog.n.01").wup_similarity(S("cat.n.01")), 0.8571, places=3 - ) - self.assertAlmostEqual( - S("car.n.01").wup_similarity(S("automobile.v.01")), - S("automobile.v.01").wup_similarity(S("car.n.01")), - ) - self.assertAlmostEqual( - S("big.a.01").wup_similarity(S("dog.n.01")), - S("dog.n.01").wup_similarity(S("big.a.01")), - ) - self.assertAlmostEqual( - S("big.a.01").wup_similarity(S("long.a.01")), - S("long.a.01").wup_similarity(S("big.a.01")), - ) - self.assertAlmostEqual( - S("big.a.01").lch_similarity(S("long.a.01")), - S("long.a.01").lch_similarity(S("big.a.01")), - ) - # Information Content similarities. - brown_ic = wnic.ic("ic-brown.dat") - self.assertAlmostEqual( - S("dog.n.01").jcn_similarity(S("cat.n.01"), brown_ic), 0.4497, places=3 - ) - semcor_ic = wnic.ic("ic-semcor.dat") - self.assertAlmostEqual( - S("dog.n.01").lin_similarity(S("cat.n.01"), semcor_ic), 0.8863, places=3 - ) - - def test_omw_lemma_no_trailing_underscore(self): - expected = sorted( - [ - "popolna_sprememba_v_mišljenju", - "popoln_obrat", - "preobrat", - "preobrat_v_mišljenju", - ] - ) - self.assertEqual(sorted(S("about-face.n.02").lemma_names(lang="slv")), expected) - - def test_iterable_type_for_all_lemma_names(self): - # Duck-test for iterables. 
- # See https://stackoverflow.com/a/36230057/610569 - cat_lemmas = wn.all_lemma_names(lang="cat") - eng_lemmas = wn.all_lemma_names(lang="eng") - - self.assertTrue(hasattr(eng_lemmas, "__iter__")) - self.assertTrue(hasattr(eng_lemmas, "__next__") or hasattr(eng_lemmas, "next")) - self.assertTrue(eng_lemmas.__iter__() is eng_lemmas) - - self.assertTrue(hasattr(cat_lemmas, "__iter__")) - self.assertTrue(hasattr(cat_lemmas, "__next__") or hasattr(eng_lemmas, "next")) - self.assertTrue(cat_lemmas.__iter__() is cat_lemmas) diff --git a/pipeline/nltk/test/unit/translate/__init__.py b/pipeline/nltk/test/unit/translate/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/pipeline/nltk/test/unit/translate/__pycache__/__init__.cpython-39.pyc b/pipeline/nltk/test/unit/translate/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index e9fdce1e6844c7fec1aa4ff5e164be1fa3423ba7..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/translate/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/translate/__pycache__/test_bleu.cpython-39.pyc b/pipeline/nltk/test/unit/translate/__pycache__/test_bleu.cpython-39.pyc deleted file mode 100644 index 9086bdec9a39cb37e025a0e7fbd9805248f27e5f..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/translate/__pycache__/test_bleu.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/translate/__pycache__/test_gdfa.cpython-39.pyc b/pipeline/nltk/test/unit/translate/__pycache__/test_gdfa.cpython-39.pyc deleted file mode 100644 index 5a6659fa7ef9baae63b597270d4fc09e6e0a69bd..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/translate/__pycache__/test_gdfa.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/translate/__pycache__/test_ibm1.cpython-39.pyc b/pipeline/nltk/test/unit/translate/__pycache__/test_ibm1.cpython-39.pyc deleted file mode 100644 index baa0e3befba3e449e0abda46c37eeb4bc477e57b..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/translate/__pycache__/test_ibm1.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/translate/__pycache__/test_ibm2.cpython-39.pyc b/pipeline/nltk/test/unit/translate/__pycache__/test_ibm2.cpython-39.pyc deleted file mode 100644 index 2853c1ff2dee102ef860005f45f54c857112e4b2..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/translate/__pycache__/test_ibm2.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/translate/__pycache__/test_ibm3.cpython-39.pyc b/pipeline/nltk/test/unit/translate/__pycache__/test_ibm3.cpython-39.pyc deleted file mode 100644 index 47088f89ced413cb62f2c21c55af01d2e4ba6243..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/translate/__pycache__/test_ibm3.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/translate/__pycache__/test_ibm4.cpython-39.pyc b/pipeline/nltk/test/unit/translate/__pycache__/test_ibm4.cpython-39.pyc deleted file mode 100644 index 30c31b7ffcbbae203435dee956b0f06ab42cebcd..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/translate/__pycache__/test_ibm4.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/translate/__pycache__/test_ibm5.cpython-39.pyc b/pipeline/nltk/test/unit/translate/__pycache__/test_ibm5.cpython-39.pyc deleted 
file mode 100644 index e71c8cd6c7aa5e0793a3cbc86c804c5540c7c2f8..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/translate/__pycache__/test_ibm5.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/translate/__pycache__/test_ibm_model.cpython-39.pyc b/pipeline/nltk/test/unit/translate/__pycache__/test_ibm_model.cpython-39.pyc deleted file mode 100644 index cb3cf3750c653b510d73a26dbac28a5f50432027..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/translate/__pycache__/test_ibm_model.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/translate/__pycache__/test_meteor.cpython-39.pyc b/pipeline/nltk/test/unit/translate/__pycache__/test_meteor.cpython-39.pyc deleted file mode 100644 index 6fcd070072e557b38b42db5dfeb487e7872bbe9d..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/translate/__pycache__/test_meteor.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/translate/__pycache__/test_nist.cpython-39.pyc b/pipeline/nltk/test/unit/translate/__pycache__/test_nist.cpython-39.pyc deleted file mode 100644 index fa5bb8b68b0eed50173cd5651101fbc70ea4e016..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/translate/__pycache__/test_nist.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/translate/__pycache__/test_stack_decoder.cpython-39.pyc b/pipeline/nltk/test/unit/translate/__pycache__/test_stack_decoder.cpython-39.pyc deleted file mode 100644 index b9db0e9aaf61405fc1a02c22d761e0707644597b..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/test/unit/translate/__pycache__/test_stack_decoder.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/test/unit/translate/test_bleu.py b/pipeline/nltk/test/unit/translate/test_bleu.py deleted file mode 100644 index 8fa1e07903036885be24a23392ea68c16065dfde..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/translate/test_bleu.py +++ /dev/null @@ -1,405 +0,0 @@ -""" -Tests for BLEU translation evaluation metric -""" - -import io -import unittest - -from nltk.data import find -from nltk.translate.bleu_score import ( - SmoothingFunction, - brevity_penalty, - closest_ref_length, - corpus_bleu, - modified_precision, - sentence_bleu, -) - - -class TestBLEU(unittest.TestCase): - def test_modified_precision(self): - """ - Examples from the original BLEU paper - https://www.aclweb.org/anthology/P02-1040.pdf - """ - # Example 1: the "the*" example. - # Reference sentences. - ref1 = "the cat is on the mat".split() - ref2 = "there is a cat on the mat".split() - # Hypothesis sentence(s). - hyp1 = "the the the the the the the".split() - - references = [ref1, ref2] - - # Testing modified unigram precision. - hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1)) - assert round(hyp1_unigram_precision, 4) == 0.2857 - # With assertAlmostEqual at 4 place precision. - self.assertAlmostEqual(hyp1_unigram_precision, 0.28571428, places=4) - - # Testing modified bigram precision. - assert float(modified_precision(references, hyp1, n=2)) == 0.0 - - # Example 2: the "of the" example. 
- # Reference sentences - ref1 = str( - "It is a guide to action that ensures that the military " - "will forever heed Party commands" - ).split() - ref2 = str( - "It is the guiding principle which guarantees the military " - "forces always being under the command of the Party" - ).split() - ref3 = str( - "It is the practical guide for the army always to heed " - "the directions of the party" - ).split() - # Hypothesis sentence(s). - hyp1 = "of the".split() - - references = [ref1, ref2, ref3] - # Testing modified unigram precision. - assert float(modified_precision(references, hyp1, n=1)) == 1.0 - - # Testing modified bigram precision. - assert float(modified_precision(references, hyp1, n=2)) == 1.0 - - # Example 3: Proper MT outputs. - hyp1 = str( - "It is a guide to action which ensures that the military " - "always obeys the commands of the party" - ).split() - hyp2 = str( - "It is to insure the troops forever hearing the activity " - "guidebook that party direct" - ).split() - - references = [ref1, ref2, ref3] - - # Unigram precision. - hyp1_unigram_precision = float(modified_precision(references, hyp1, n=1)) - hyp2_unigram_precision = float(modified_precision(references, hyp2, n=1)) - # Test unigram precision with assertAlmostEqual at 4 place precision. - self.assertAlmostEqual(hyp1_unigram_precision, 0.94444444, places=4) - self.assertAlmostEqual(hyp2_unigram_precision, 0.57142857, places=4) - # Test unigram precision with rounding. - assert round(hyp1_unigram_precision, 4) == 0.9444 - assert round(hyp2_unigram_precision, 4) == 0.5714 - - # Bigram precision - hyp1_bigram_precision = float(modified_precision(references, hyp1, n=2)) - hyp2_bigram_precision = float(modified_precision(references, hyp2, n=2)) - # Test bigram precision with assertAlmostEqual at 4 place precision. - self.assertAlmostEqual(hyp1_bigram_precision, 0.58823529, places=4) - self.assertAlmostEqual(hyp2_bigram_precision, 0.07692307, places=4) - # Test bigram precision with rounding. - assert round(hyp1_bigram_precision, 4) == 0.5882 - assert round(hyp2_bigram_precision, 4) == 0.0769 - - def test_brevity_penalty(self): - # Test case from brevity_penalty_closest function in mteval-v13a.pl. - # Same test cases as in the doctest in nltk.translate.bleu_score.py - references = [["a"] * 11, ["a"] * 8] - hypothesis = ["a"] * 7 - hyp_len = len(hypothesis) - closest_ref_len = closest_ref_length(references, hyp_len) - self.assertAlmostEqual( - brevity_penalty(closest_ref_len, hyp_len), 0.8669, places=4 - ) - - references = [["a"] * 11, ["a"] * 8, ["a"] * 6, ["a"] * 7] - hypothesis = ["a"] * 7 - hyp_len = len(hypothesis) - closest_ref_len = closest_ref_length(references, hyp_len) - assert brevity_penalty(closest_ref_len, hyp_len) == 1.0 - - def test_zero_matches(self): - # Test case where there's 0 matches - references = ["The candidate has no alignment to any of the references".split()] - hypothesis = "John loves Mary".split() - - # Test BLEU to nth order of n-grams, where n is len(hypothesis). - for n in range(1, len(hypothesis)): - weights = (1.0 / n,) * n # Uniform weights. - assert sentence_bleu(references, hypothesis, weights) == 0 - - def test_full_matches(self): - # Test case where there's 100% matches - references = ["John loves Mary".split()] - hypothesis = "John loves Mary".split() - - # Test BLEU to nth order of n-grams, where n is len(hypothesis). - for n in range(1, len(hypothesis)): - weights = (1.0 / n,) * n # Uniform weights. 
- assert sentence_bleu(references, hypothesis, weights) == 1.0 - - def test_partial_matches_hypothesis_longer_than_reference(self): - references = ["John loves Mary".split()] - hypothesis = "John loves Mary who loves Mike".split() - # Since no 4-grams matches were found the result should be zero - # exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0 - self.assertAlmostEqual(sentence_bleu(references, hypothesis), 0.0, places=4) - # Checks that the warning has been raised because len(reference) < 4. - try: - self.assertWarns(UserWarning, sentence_bleu, references, hypothesis) - except AttributeError: - pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2. - - -# @unittest.skip("Skipping fringe cases for BLEU.") -class TestBLEUFringeCases(unittest.TestCase): - def test_case_where_n_is_bigger_than_hypothesis_length(self): - # Test BLEU to nth order of n-grams, where n > len(hypothesis). - references = ["John loves Mary ?".split()] - hypothesis = "John loves Mary".split() - n = len(hypothesis) + 1 # - weights = (1.0 / n,) * n # Uniform weights. - # Since no n-grams matches were found the result should be zero - # exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0 - self.assertAlmostEqual( - sentence_bleu(references, hypothesis, weights), 0.0, places=4 - ) - # Checks that the warning has been raised because len(hypothesis) < 4. - try: - self.assertWarns(UserWarning, sentence_bleu, references, hypothesis) - except AttributeError: - pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2. - - # Test case where n > len(hypothesis) but so is n > len(reference), and - # it's a special case where reference == hypothesis. - references = ["John loves Mary".split()] - hypothesis = "John loves Mary".split() - # Since no 4-grams matches were found the result should be zero - # exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0 - self.assertAlmostEqual( - sentence_bleu(references, hypothesis, weights), 0.0, places=4 - ) - - def test_empty_hypothesis(self): - # Test case where there's hypothesis is empty. - references = ["The candidate has no alignment to any of the references".split()] - hypothesis = [] - assert sentence_bleu(references, hypothesis) == 0 - - def test_length_one_hypothesis(self): - # Test case where there's hypothesis is of length 1 in Smoothing method 4. - references = ["The candidate has no alignment to any of the references".split()] - hypothesis = ["Foo"] - method4 = SmoothingFunction().method4 - try: - sentence_bleu(references, hypothesis, smoothing_function=method4) - except ValueError: - pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2. - - def test_empty_references(self): - # Test case where there's reference is empty. - references = [[]] - hypothesis = "John loves Mary".split() - assert sentence_bleu(references, hypothesis) == 0 - - def test_empty_references_and_hypothesis(self): - # Test case where both references and hypothesis is empty. - references = [[]] - hypothesis = [] - assert sentence_bleu(references, hypothesis) == 0 - - def test_reference_or_hypothesis_shorter_than_fourgrams(self): - # Test case where the length of reference or hypothesis - # is shorter than 4. - references = ["let it go".split()] - hypothesis = "let go it".split() - # Checks that the value the hypothesis and reference returns is 0.0 - # exp(w_1 * 1 * w_2 * 1 * w_3 * 1 * w_4 * -inf) = 0 - self.assertAlmostEqual(sentence_bleu(references, hypothesis), 0.0, places=4) - # Checks that the warning has been raised. 
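(For context, the "the*" clipping example from the BLEU paper that these assertions re-derive, plus the full-match case, can be reproduced directly; a sketch assuming a standard `nltk` install, with values taken from the deleted tests.)

```python
from nltk.translate.bleu_score import modified_precision, sentence_bleu

references = ["the cat is on the mat".split(), "there is a cat on the mat".split()]
hypothesis = "the the the the the the the".split()
# Counts of "the" are clipped to the maximum reference count (2), giving 2/7.
print(float(modified_precision(references, hypothesis, n=1)))  # ~0.2857

# A hypothesis identical to its reference scores 1.0 (uniform unigram/bigram weights).
print(sentence_bleu(["John loves Mary".split()], "John loves Mary".split(), weights=(0.5, 0.5)))
```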
- try: - self.assertWarns(UserWarning, sentence_bleu, references, hypothesis) - except AttributeError: - pass # unittest.TestCase.assertWarns is only supported in Python >= 3.2. - - -class TestBLEUvsMteval13a(unittest.TestCase): - def test_corpus_bleu(self): - ref_file = find("models/wmt15_eval/ref.ru") - hyp_file = find("models/wmt15_eval/google.ru") - mteval_output_file = find("models/wmt15_eval/mteval-13a.output") - - # Reads the BLEU scores from the `mteval-13a.output` file. - # The order of the list corresponds to the order of the ngrams. - with open(mteval_output_file) as mteval_fin: - # The numbers are located in the last 2nd line of the file. - # The first and 2nd item in the list are the score and system names. - mteval_bleu_scores = map(float, mteval_fin.readlines()[-2].split()[1:-1]) - - with open(ref_file, encoding="utf8") as ref_fin: - with open(hyp_file, encoding="utf8") as hyp_fin: - # Whitespace tokenize the file. - # Note: split() automatically strip(). - hypothesis = list(map(lambda x: x.split(), hyp_fin)) - # Note that the corpus_bleu input is list of list of references. - references = list(map(lambda x: [x.split()], ref_fin)) - # Without smoothing. - for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores): - nltk_bleu = corpus_bleu( - references, hypothesis, weights=(1.0 / i,) * i - ) - # Check that the BLEU scores difference is less than 0.005 . - # Note: This is an approximate comparison; as much as - # +/- 0.01 BLEU might be "statistically significant", - # the actual translation quality might not be. - assert abs(mteval_bleu - nltk_bleu) < 0.005 - - # With the same smoothing method used in mteval-v13a.pl - chencherry = SmoothingFunction() - for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores): - nltk_bleu = corpus_bleu( - references, - hypothesis, - weights=(1.0 / i,) * i, - smoothing_function=chencherry.method3, - ) - assert abs(mteval_bleu - nltk_bleu) < 0.005 - - -class TestBLEUWithBadSentence(unittest.TestCase): - def test_corpus_bleu_with_bad_sentence(self): - hyp = "Teo S yb , oe uNb , R , T t , , t Tue Ar saln S , , 5istsi l , 5oe R ulO sae oR R" - ref = str( - "Their tasks include changing a pump on the faulty stokehold ." - "Likewise , two species that are very similar in morphology " - "were distinguished using genetics ." - ) - references = [[ref.split()]] - hypotheses = [hyp.split()] - try: # Check that the warning is raised since no. of 2-grams < 0. - with self.assertWarns(UserWarning): - # Verify that the BLEU output is undesired since no. of 2-grams < 0. - self.assertAlmostEqual( - corpus_bleu(references, hypotheses), 0.0, places=4 - ) - except AttributeError: # unittest.TestCase.assertWarns is only supported in Python >= 3.2. 
- self.assertAlmostEqual(corpus_bleu(references, hypotheses), 0.0, places=4) - - -class TestBLEUWithMultipleWeights(unittest.TestCase): - def test_corpus_bleu_with_multiple_weights(self): - hyp1 = [ - "It", - "is", - "a", - "guide", - "to", - "action", - "which", - "ensures", - "that", - "the", - "military", - "always", - "obeys", - "the", - "commands", - "of", - "the", - "party", - ] - ref1a = [ - "It", - "is", - "a", - "guide", - "to", - "action", - "that", - "ensures", - "that", - "the", - "military", - "will", - "forever", - "heed", - "Party", - "commands", - ] - ref1b = [ - "It", - "is", - "the", - "guiding", - "principle", - "which", - "guarantees", - "the", - "military", - "forces", - "always", - "being", - "under", - "the", - "command", - "of", - "the", - "Party", - ] - ref1c = [ - "It", - "is", - "the", - "practical", - "guide", - "for", - "the", - "army", - "always", - "to", - "heed", - "the", - "directions", - "of", - "the", - "party", - ] - hyp2 = [ - "he", - "read", - "the", - "book", - "because", - "he", - "was", - "interested", - "in", - "world", - "history", - ] - ref2a = [ - "he", - "was", - "interested", - "in", - "world", - "history", - "because", - "he", - "read", - "the", - "book", - ] - weight_1 = (1, 0, 0, 0) - weight_2 = (0.25, 0.25, 0.25, 0.25) - weight_3 = (0, 0, 0, 0, 1) - - bleu_scores = corpus_bleu( - list_of_references=[[ref1a, ref1b, ref1c], [ref2a]], - hypotheses=[hyp1, hyp2], - weights=[weight_1, weight_2, weight_3], - ) - assert bleu_scores[0] == corpus_bleu( - [[ref1a, ref1b, ref1c], [ref2a]], [hyp1, hyp2], weight_1 - ) - assert bleu_scores[1] == corpus_bleu( - [[ref1a, ref1b, ref1c], [ref2a]], [hyp1, hyp2], weight_2 - ) - assert bleu_scores[2] == corpus_bleu( - [[ref1a, ref1b, ref1c], [ref2a]], [hyp1, hyp2], weight_3 - ) diff --git a/pipeline/nltk/test/unit/translate/test_gdfa.py b/pipeline/nltk/test/unit/translate/test_gdfa.py deleted file mode 100644 index 1824be45265762050ad4f61fb181a822d5aaa7a7..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/translate/test_gdfa.py +++ /dev/null @@ -1,154 +0,0 @@ -""" -Tests GDFA alignments -""" - -import unittest - -from nltk.translate.gdfa import grow_diag_final_and - - -class TestGDFA(unittest.TestCase): - def test_from_eflomal_outputs(self): - """ - Testing GDFA with first 10 eflomal outputs from issue #1829 - https://github.com/nltk/nltk/issues/1829 - """ - # Input. 
- forwards = [ - "0-0 1-2", - "0-0 1-1", - "0-0 2-1 3-2 4-3 5-4 6-5 7-6 8-7 7-8 9-9 10-10 9-11 11-12 12-13 13-14", - "0-0 1-1 1-2 2-3 3-4 4-5 4-6 5-7 6-8 8-9 9-10", - "0-0 14-1 15-2 16-3 20-5 21-6 22-7 5-8 6-9 7-10 8-11 9-12 10-13 11-14 12-15 13-16 14-17 17-18 18-19 19-20 20-21 23-22 24-23 25-24 26-25 27-27 28-28 29-29 30-30 31-31", - "0-0 1-1 0-2 2-3", - "0-0 2-2 4-4", - "0-0 1-1 2-3 3-4 5-5 7-6 8-7 9-8 10-9 11-10 12-11 13-12 14-13 15-14 16-16 17-17 18-18 19-19 20-20", - "3-0 4-1 6-2 5-3 6-4 7-5 8-6 9-7 10-8 11-9 16-10 9-12 10-13 12-14", - "1-0", - ] - backwards = [ - "0-0 1-2", - "0-0 1-1", - "0-0 2-1 3-2 4-3 5-4 6-5 7-6 8-7 9-8 10-10 11-12 12-11 13-13", - "0-0 1-2 2-3 3-4 4-6 6-8 7-5 8-7 9-8", - "0-0 1-8 2-9 3-10 4-11 5-12 6-11 8-13 9-14 10-15 11-16 12-17 13-18 14-19 15-20 16-21 17-22 18-23 19-24 20-29 21-30 22-31 23-2 24-3 25-4 26-5 27-5 28-6 29-7 30-28 31-31", - "0-0 1-1 2-3", - "0-0 1-1 2-3 4-4", - "0-0 1-1 2-3 3-4 5-5 7-6 8-7 9-8 10-9 11-10 12-11 13-12 14-13 15-14 16-16 17-17 18-18 19-19 20-16 21-18", - "0-0 1-1 3-2 4-1 5-3 6-4 7-5 8-6 9-7 10-8 11-9 12-8 13-9 14-8 15-9 16-10", - "1-0", - ] - source_lens = [2, 3, 3, 15, 11, 33, 4, 6, 23, 18] - target_lens = [2, 4, 3, 16, 12, 33, 5, 6, 22, 16] - # Expected Output. - expected = [ - [(0, 0), (1, 2)], - [(0, 0), (1, 1)], - [ - (0, 0), - (2, 1), - (3, 2), - (4, 3), - (5, 4), - (6, 5), - (7, 6), - (8, 7), - (10, 10), - (11, 12), - ], - [ - (0, 0), - (1, 1), - (1, 2), - (2, 3), - (3, 4), - (4, 5), - (4, 6), - (5, 7), - (6, 8), - (7, 5), - (8, 7), - (8, 9), - (9, 8), - (9, 10), - ], - [ - (0, 0), - (1, 8), - (2, 9), - (3, 10), - (4, 11), - (5, 8), - (6, 9), - (6, 11), - (7, 10), - (8, 11), - (31, 31), - ], - [(0, 0), (0, 2), (1, 1), (2, 3)], - [(0, 0), (1, 1), (2, 2), (2, 3), (4, 4)], - [ - (0, 0), - (1, 1), - (2, 3), - (3, 4), - (5, 5), - (7, 6), - (8, 7), - (9, 8), - (10, 9), - (11, 10), - (12, 11), - (13, 12), - (14, 13), - (15, 14), - (16, 16), - (17, 17), - (18, 18), - (19, 19), - ], - [ - (0, 0), - (1, 1), - (3, 0), - (3, 2), - (4, 1), - (5, 3), - (6, 2), - (6, 4), - (7, 5), - (8, 6), - (9, 7), - (9, 12), - (10, 8), - (10, 13), - (11, 9), - (12, 8), - (12, 14), - (13, 9), - (14, 8), - (15, 9), - (16, 10), - ], - [(1, 0)], - [ - (0, 0), - (1, 1), - (3, 2), - (4, 3), - (5, 4), - (6, 5), - (7, 6), - (9, 10), - (10, 12), - (11, 13), - (12, 14), - (13, 15), - ], - ] - - # Iterate through all 10 examples and check for expected outputs. 
- for fw, bw, src_len, trg_len, expect in zip( - forwards, backwards, source_lens, target_lens, expected - ): - self.assertListEqual(expect, grow_diag_final_and(src_len, trg_len, fw, bw)) diff --git a/pipeline/nltk/test/unit/translate/test_ibm1.py b/pipeline/nltk/test/unit/translate/test_ibm1.py deleted file mode 100644 index a4f32ef73cae1f789ee587b6d3d214cfeb0e70d2..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/translate/test_ibm1.py +++ /dev/null @@ -1,73 +0,0 @@ -""" -Tests for IBM Model 1 training methods -""" - -import unittest -from collections import defaultdict - -from nltk.translate import AlignedSent, IBMModel, IBMModel1 -from nltk.translate.ibm_model import AlignmentInfo - - -class TestIBMModel1(unittest.TestCase): - def test_set_uniform_translation_probabilities(self): - # arrange - corpus = [ - AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]), - AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]), - ] - model1 = IBMModel1(corpus, 0) - - # act - model1.set_uniform_probabilities(corpus) - - # assert - # expected_prob = 1.0 / (target vocab size + 1) - self.assertEqual(model1.translation_table["ham"]["eier"], 1.0 / 3) - self.assertEqual(model1.translation_table["eggs"][None], 1.0 / 3) - - def test_set_uniform_translation_probabilities_of_non_domain_values(self): - # arrange - corpus = [ - AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]), - AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]), - ] - model1 = IBMModel1(corpus, 0) - - # act - model1.set_uniform_probabilities(corpus) - - # assert - # examine target words that are not in the training data domain - self.assertEqual(model1.translation_table["parrot"]["eier"], IBMModel.MIN_PROB) - - def test_prob_t_a_given_s(self): - # arrange - src_sentence = ["ich", "esse", "ja", "gern", "räucherschinken"] - trg_sentence = ["i", "love", "to", "eat", "smoked", "ham"] - corpus = [AlignedSent(trg_sentence, src_sentence)] - alignment_info = AlignmentInfo( - (0, 1, 4, 0, 2, 5, 5), - [None] + src_sentence, - ["UNUSED"] + trg_sentence, - None, - ) - - translation_table = defaultdict(lambda: defaultdict(float)) - translation_table["i"]["ich"] = 0.98 - translation_table["love"]["gern"] = 0.98 - translation_table["to"][None] = 0.98 - translation_table["eat"]["esse"] = 0.98 - translation_table["smoked"]["räucherschinken"] = 0.98 - translation_table["ham"]["räucherschinken"] = 0.98 - - model1 = IBMModel1(corpus, 0) - model1.translation_table = translation_table - - # act - probability = model1.prob_t_a_given_s(alignment_info) - - # assert - lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98 - expected_probability = lexical_translation - self.assertEqual(round(probability, 4), round(expected_probability, 4)) diff --git a/pipeline/nltk/test/unit/translate/test_ibm2.py b/pipeline/nltk/test/unit/translate/test_ibm2.py deleted file mode 100644 index e2194dde9aabd503489e4f961b85da550b56d7c2..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/translate/test_ibm2.py +++ /dev/null @@ -1,86 +0,0 @@ -""" -Tests for IBM Model 2 training methods -""" - -import unittest -from collections import defaultdict - -from nltk.translate import AlignedSent, IBMModel, IBMModel2 -from nltk.translate.ibm_model import AlignmentInfo - - -class TestIBMModel2(unittest.TestCase): - def test_set_uniform_alignment_probabilities(self): - # arrange - corpus = [ - AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]), - AlignedSent(["spam", "spam", "spam", "spam"], 
["spam", "spam"]), - ] - model2 = IBMModel2(corpus, 0) - - # act - model2.set_uniform_probabilities(corpus) - - # assert - # expected_prob = 1.0 / (length of source sentence + 1) - self.assertEqual(model2.alignment_table[0][1][3][2], 1.0 / 4) - self.assertEqual(model2.alignment_table[2][4][2][4], 1.0 / 3) - - def test_set_uniform_alignment_probabilities_of_non_domain_values(self): - # arrange - corpus = [ - AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]), - AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]), - ] - model2 = IBMModel2(corpus, 0) - - # act - model2.set_uniform_probabilities(corpus) - - # assert - # examine i and j values that are not in the training data domain - self.assertEqual(model2.alignment_table[99][1][3][2], IBMModel.MIN_PROB) - self.assertEqual(model2.alignment_table[2][99][2][4], IBMModel.MIN_PROB) - - def test_prob_t_a_given_s(self): - # arrange - src_sentence = ["ich", "esse", "ja", "gern", "räucherschinken"] - trg_sentence = ["i", "love", "to", "eat", "smoked", "ham"] - corpus = [AlignedSent(trg_sentence, src_sentence)] - alignment_info = AlignmentInfo( - (0, 1, 4, 0, 2, 5, 5), - [None] + src_sentence, - ["UNUSED"] + trg_sentence, - None, - ) - - translation_table = defaultdict(lambda: defaultdict(float)) - translation_table["i"]["ich"] = 0.98 - translation_table["love"]["gern"] = 0.98 - translation_table["to"][None] = 0.98 - translation_table["eat"]["esse"] = 0.98 - translation_table["smoked"]["räucherschinken"] = 0.98 - translation_table["ham"]["räucherschinken"] = 0.98 - - alignment_table = defaultdict( - lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float))) - ) - alignment_table[0][3][5][6] = 0.97 # None -> to - alignment_table[1][1][5][6] = 0.97 # ich -> i - alignment_table[2][4][5][6] = 0.97 # esse -> eat - alignment_table[4][2][5][6] = 0.97 # gern -> love - alignment_table[5][5][5][6] = 0.96 # räucherschinken -> smoked - alignment_table[5][6][5][6] = 0.96 # räucherschinken -> ham - - model2 = IBMModel2(corpus, 0) - model2.translation_table = translation_table - model2.alignment_table = alignment_table - - # act - probability = model2.prob_t_a_given_s(alignment_info) - - # assert - lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98 - alignment = 0.97 * 0.97 * 0.97 * 0.97 * 0.96 * 0.96 - expected_probability = lexical_translation * alignment - self.assertEqual(round(probability, 4), round(expected_probability, 4)) diff --git a/pipeline/nltk/test/unit/translate/test_ibm3.py b/pipeline/nltk/test/unit/translate/test_ibm3.py deleted file mode 100644 index 14d89d6d9857f0ff62bb4388cf1e8f04c2f90d46..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/translate/test_ibm3.py +++ /dev/null @@ -1,105 +0,0 @@ -""" -Tests for IBM Model 3 training methods -""" - -import unittest -from collections import defaultdict - -from nltk.translate import AlignedSent, IBMModel, IBMModel3 -from nltk.translate.ibm_model import AlignmentInfo - - -class TestIBMModel3(unittest.TestCase): - def test_set_uniform_distortion_probabilities(self): - # arrange - corpus = [ - AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]), - AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]), - ] - model3 = IBMModel3(corpus, 0) - - # act - model3.set_uniform_probabilities(corpus) - - # assert - # expected_prob = 1.0 / length of target sentence - self.assertEqual(model3.distortion_table[1][0][3][2], 1.0 / 2) - self.assertEqual(model3.distortion_table[4][2][2][4], 1.0 / 4) - - def 
test_set_uniform_distortion_probabilities_of_non_domain_values(self): - # arrange - corpus = [ - AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]), - AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]), - ] - model3 = IBMModel3(corpus, 0) - - # act - model3.set_uniform_probabilities(corpus) - - # assert - # examine i and j values that are not in the training data domain - self.assertEqual(model3.distortion_table[0][0][3][2], IBMModel.MIN_PROB) - self.assertEqual(model3.distortion_table[9][2][2][4], IBMModel.MIN_PROB) - self.assertEqual(model3.distortion_table[2][9][2][4], IBMModel.MIN_PROB) - - def test_prob_t_a_given_s(self): - # arrange - src_sentence = ["ich", "esse", "ja", "gern", "räucherschinken"] - trg_sentence = ["i", "love", "to", "eat", "smoked", "ham"] - corpus = [AlignedSent(trg_sentence, src_sentence)] - alignment_info = AlignmentInfo( - (0, 1, 4, 0, 2, 5, 5), - [None] + src_sentence, - ["UNUSED"] + trg_sentence, - [[3], [1], [4], [], [2], [5, 6]], - ) - - distortion_table = defaultdict( - lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(float))) - ) - distortion_table[1][1][5][6] = 0.97 # i -> ich - distortion_table[2][4][5][6] = 0.97 # love -> gern - distortion_table[3][0][5][6] = 0.97 # to -> NULL - distortion_table[4][2][5][6] = 0.97 # eat -> esse - distortion_table[5][5][5][6] = 0.97 # smoked -> räucherschinken - distortion_table[6][5][5][6] = 0.97 # ham -> räucherschinken - - translation_table = defaultdict(lambda: defaultdict(float)) - translation_table["i"]["ich"] = 0.98 - translation_table["love"]["gern"] = 0.98 - translation_table["to"][None] = 0.98 - translation_table["eat"]["esse"] = 0.98 - translation_table["smoked"]["räucherschinken"] = 0.98 - translation_table["ham"]["räucherschinken"] = 0.98 - - fertility_table = defaultdict(lambda: defaultdict(float)) - fertility_table[1]["ich"] = 0.99 - fertility_table[1]["esse"] = 0.99 - fertility_table[0]["ja"] = 0.99 - fertility_table[1]["gern"] = 0.99 - fertility_table[2]["räucherschinken"] = 0.999 - fertility_table[1][None] = 0.99 - - probabilities = { - "p1": 0.167, - "translation_table": translation_table, - "distortion_table": distortion_table, - "fertility_table": fertility_table, - "alignment_table": None, - } - - model3 = IBMModel3(corpus, 0, probabilities) - - # act - probability = model3.prob_t_a_given_s(alignment_info) - - # assert - null_generation = 5 * pow(0.167, 1) * pow(0.833, 4) - fertility = 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 2 * 0.999 - lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98 - distortion = 0.97 * 0.97 * 0.97 * 0.97 * 0.97 * 0.97 - expected_probability = ( - null_generation * fertility * lexical_translation * distortion - ) - self.assertEqual(round(probability, 4), round(expected_probability, 4)) diff --git a/pipeline/nltk/test/unit/translate/test_ibm4.py b/pipeline/nltk/test/unit/translate/test_ibm4.py deleted file mode 100644 index 674b2bc37aaae3a42711d13f05a9bd9d0b35a717..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/translate/test_ibm4.py +++ /dev/null @@ -1,120 +0,0 @@ -""" -Tests for IBM Model 4 training methods -""" - -import unittest -from collections import defaultdict - -from nltk.translate import AlignedSent, IBMModel, IBMModel4 -from nltk.translate.ibm_model import AlignmentInfo - - -class TestIBMModel4(unittest.TestCase): - def test_set_uniform_distortion_probabilities_of_max_displacements(self): - # arrange - src_classes = {"schinken": 0, "eier": 0, "spam": 1} - trg_classes = {"ham": 0, "eggs": 1, 
"spam": 2} - corpus = [ - AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]), - AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]), - ] - model4 = IBMModel4(corpus, 0, src_classes, trg_classes) - - # act - model4.set_uniform_probabilities(corpus) - - # assert - # number of displacement values = - # 2 *(number of words in longest target sentence - 1) - expected_prob = 1.0 / (2 * (4 - 1)) - - # examine the boundary values for (displacement, src_class, trg_class) - self.assertEqual(model4.head_distortion_table[3][0][0], expected_prob) - self.assertEqual(model4.head_distortion_table[-3][1][2], expected_prob) - self.assertEqual(model4.non_head_distortion_table[3][0], expected_prob) - self.assertEqual(model4.non_head_distortion_table[-3][2], expected_prob) - - def test_set_uniform_distortion_probabilities_of_non_domain_values(self): - # arrange - src_classes = {"schinken": 0, "eier": 0, "spam": 1} - trg_classes = {"ham": 0, "eggs": 1, "spam": 2} - corpus = [ - AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]), - AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]), - ] - model4 = IBMModel4(corpus, 0, src_classes, trg_classes) - - # act - model4.set_uniform_probabilities(corpus) - - # assert - # examine displacement values that are not in the training data domain - self.assertEqual(model4.head_distortion_table[4][0][0], IBMModel.MIN_PROB) - self.assertEqual(model4.head_distortion_table[100][1][2], IBMModel.MIN_PROB) - self.assertEqual(model4.non_head_distortion_table[4][0], IBMModel.MIN_PROB) - self.assertEqual(model4.non_head_distortion_table[100][2], IBMModel.MIN_PROB) - - def test_prob_t_a_given_s(self): - # arrange - src_sentence = ["ich", "esse", "ja", "gern", "räucherschinken"] - trg_sentence = ["i", "love", "to", "eat", "smoked", "ham"] - src_classes = {"räucherschinken": 0, "ja": 1, "ich": 2, "esse": 3, "gern": 4} - trg_classes = {"ham": 0, "smoked": 1, "i": 3, "love": 4, "to": 2, "eat": 4} - corpus = [AlignedSent(trg_sentence, src_sentence)] - alignment_info = AlignmentInfo( - (0, 1, 4, 0, 2, 5, 5), - [None] + src_sentence, - ["UNUSED"] + trg_sentence, - [[3], [1], [4], [], [2], [5, 6]], - ) - - head_distortion_table = defaultdict( - lambda: defaultdict(lambda: defaultdict(float)) - ) - head_distortion_table[1][None][3] = 0.97 # None, i - head_distortion_table[3][2][4] = 0.97 # ich, eat - head_distortion_table[-2][3][4] = 0.97 # esse, love - head_distortion_table[3][4][1] = 0.97 # gern, smoked - - non_head_distortion_table = defaultdict(lambda: defaultdict(float)) - non_head_distortion_table[1][0] = 0.96 # ham - - translation_table = defaultdict(lambda: defaultdict(float)) - translation_table["i"]["ich"] = 0.98 - translation_table["love"]["gern"] = 0.98 - translation_table["to"][None] = 0.98 - translation_table["eat"]["esse"] = 0.98 - translation_table["smoked"]["räucherschinken"] = 0.98 - translation_table["ham"]["räucherschinken"] = 0.98 - - fertility_table = defaultdict(lambda: defaultdict(float)) - fertility_table[1]["ich"] = 0.99 - fertility_table[1]["esse"] = 0.99 - fertility_table[0]["ja"] = 0.99 - fertility_table[1]["gern"] = 0.99 - fertility_table[2]["räucherschinken"] = 0.999 - fertility_table[1][None] = 0.99 - - probabilities = { - "p1": 0.167, - "translation_table": translation_table, - "head_distortion_table": head_distortion_table, - "non_head_distortion_table": non_head_distortion_table, - "fertility_table": fertility_table, - "alignment_table": None, - } - - model4 = IBMModel4(corpus, 0, src_classes, trg_classes, 
probabilities) - - # act - probability = model4.prob_t_a_given_s(alignment_info) - - # assert - null_generation = 5 * pow(0.167, 1) * pow(0.833, 4) - fertility = 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 2 * 0.999 - lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98 - distortion = 0.97 * 0.97 * 1 * 0.97 * 0.97 * 0.96 - expected_probability = ( - null_generation * fertility * lexical_translation * distortion - ) - self.assertEqual(round(probability, 4), round(expected_probability, 4)) diff --git a/pipeline/nltk/test/unit/translate/test_ibm5.py b/pipeline/nltk/test/unit/translate/test_ibm5.py deleted file mode 100644 index 7c29c47de230c0e128cb969514787b2ded0451ef..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/translate/test_ibm5.py +++ /dev/null @@ -1,160 +0,0 @@ -""" -Tests for IBM Model 5 training methods -""" - -import unittest -from collections import defaultdict - -from nltk.translate import AlignedSent, IBMModel, IBMModel4, IBMModel5 -from nltk.translate.ibm_model import AlignmentInfo - - -class TestIBMModel5(unittest.TestCase): - def test_set_uniform_vacancy_probabilities_of_max_displacements(self): - # arrange - src_classes = {"schinken": 0, "eier": 0, "spam": 1} - trg_classes = {"ham": 0, "eggs": 1, "spam": 2} - corpus = [ - AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]), - AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]), - ] - model5 = IBMModel5(corpus, 0, src_classes, trg_classes) - - # act - model5.set_uniform_probabilities(corpus) - - # assert - # number of vacancy difference values = - # 2 * number of words in longest target sentence - expected_prob = 1.0 / (2 * 4) - - # examine the boundary values for (dv, max_v, trg_class) - self.assertEqual(model5.head_vacancy_table[4][4][0], expected_prob) - self.assertEqual(model5.head_vacancy_table[-3][1][2], expected_prob) - self.assertEqual(model5.non_head_vacancy_table[4][4][0], expected_prob) - self.assertEqual(model5.non_head_vacancy_table[-3][1][2], expected_prob) - - def test_set_uniform_vacancy_probabilities_of_non_domain_values(self): - # arrange - src_classes = {"schinken": 0, "eier": 0, "spam": 1} - trg_classes = {"ham": 0, "eggs": 1, "spam": 2} - corpus = [ - AlignedSent(["ham", "eggs"], ["schinken", "schinken", "eier"]), - AlignedSent(["spam", "spam", "spam", "spam"], ["spam", "spam"]), - ] - model5 = IBMModel5(corpus, 0, src_classes, trg_classes) - - # act - model5.set_uniform_probabilities(corpus) - - # assert - # examine dv and max_v values that are not in the training data domain - self.assertEqual(model5.head_vacancy_table[5][4][0], IBMModel.MIN_PROB) - self.assertEqual(model5.head_vacancy_table[-4][1][2], IBMModel.MIN_PROB) - self.assertEqual(model5.head_vacancy_table[4][0][0], IBMModel.MIN_PROB) - self.assertEqual(model5.non_head_vacancy_table[5][4][0], IBMModel.MIN_PROB) - self.assertEqual(model5.non_head_vacancy_table[-4][1][2], IBMModel.MIN_PROB) - - def test_prob_t_a_given_s(self): - # arrange - src_sentence = ["ich", "esse", "ja", "gern", "räucherschinken"] - trg_sentence = ["i", "love", "to", "eat", "smoked", "ham"] - src_classes = {"räucherschinken": 0, "ja": 1, "ich": 2, "esse": 3, "gern": 4} - trg_classes = {"ham": 0, "smoked": 1, "i": 3, "love": 4, "to": 2, "eat": 4} - corpus = [AlignedSent(trg_sentence, src_sentence)] - alignment_info = AlignmentInfo( - (0, 1, 4, 0, 2, 5, 5), - [None] + src_sentence, - ["UNUSED"] + trg_sentence, - [[3], [1], [4], [], [2], [5, 6]], - ) - - head_vacancy_table = defaultdict( - lambda: defaultdict(lambda: 
defaultdict(float)) - ) - head_vacancy_table[1 - 0][6][3] = 0.97 # ich -> i - head_vacancy_table[3 - 0][5][4] = 0.97 # esse -> eat - head_vacancy_table[1 - 2][4][4] = 0.97 # gern -> love - head_vacancy_table[2 - 0][2][1] = 0.97 # räucherschinken -> smoked - - non_head_vacancy_table = defaultdict( - lambda: defaultdict(lambda: defaultdict(float)) - ) - non_head_vacancy_table[1 - 0][1][0] = 0.96 # räucherschinken -> ham - - translation_table = defaultdict(lambda: defaultdict(float)) - translation_table["i"]["ich"] = 0.98 - translation_table["love"]["gern"] = 0.98 - translation_table["to"][None] = 0.98 - translation_table["eat"]["esse"] = 0.98 - translation_table["smoked"]["räucherschinken"] = 0.98 - translation_table["ham"]["räucherschinken"] = 0.98 - - fertility_table = defaultdict(lambda: defaultdict(float)) - fertility_table[1]["ich"] = 0.99 - fertility_table[1]["esse"] = 0.99 - fertility_table[0]["ja"] = 0.99 - fertility_table[1]["gern"] = 0.99 - fertility_table[2]["räucherschinken"] = 0.999 - fertility_table[1][None] = 0.99 - - probabilities = { - "p1": 0.167, - "translation_table": translation_table, - "fertility_table": fertility_table, - "head_vacancy_table": head_vacancy_table, - "non_head_vacancy_table": non_head_vacancy_table, - "head_distortion_table": None, - "non_head_distortion_table": None, - "alignment_table": None, - } - - model5 = IBMModel5(corpus, 0, src_classes, trg_classes, probabilities) - - # act - probability = model5.prob_t_a_given_s(alignment_info) - - # assert - null_generation = 5 * pow(0.167, 1) * pow(0.833, 4) - fertility = 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 1 * 0.99 * 2 * 0.999 - lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98 - vacancy = 0.97 * 0.97 * 1 * 0.97 * 0.97 * 0.96 - expected_probability = ( - null_generation * fertility * lexical_translation * vacancy - ) - self.assertEqual(round(probability, 4), round(expected_probability, 4)) - - def test_prune(self): - # arrange - alignment_infos = [ - AlignmentInfo((1, 1), None, None, None), - AlignmentInfo((1, 2), None, None, None), - AlignmentInfo((2, 1), None, None, None), - AlignmentInfo((2, 2), None, None, None), - AlignmentInfo((0, 0), None, None, None), - ] - min_factor = IBMModel5.MIN_SCORE_FACTOR - best_score = 0.9 - scores = { - (1, 1): min(min_factor * 1.5, 1) * best_score, # above threshold - (1, 2): best_score, - (2, 1): min_factor * best_score, # at threshold - (2, 2): min_factor * best_score * 0.5, # low score - (0, 0): min(min_factor * 1.1, 1) * 1.2, # above threshold - } - corpus = [AlignedSent(["a"], ["b"])] - original_prob_function = IBMModel4.model4_prob_t_a_given_s - # mock static method - IBMModel4.model4_prob_t_a_given_s = staticmethod( - lambda a, model: scores[a.alignment] - ) - model5 = IBMModel5(corpus, 0, None, None) - - # act - pruned_alignments = model5.prune(alignment_infos) - - # assert - self.assertEqual(len(pruned_alignments), 3) - - # restore static method - IBMModel4.model4_prob_t_a_given_s = original_prob_function diff --git a/pipeline/nltk/test/unit/translate/test_ibm_model.py b/pipeline/nltk/test/unit/translate/test_ibm_model.py deleted file mode 100644 index 2707fc6e8c8825c9e1c042cfcb28b3edacff3e87..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/translate/test_ibm_model.py +++ /dev/null @@ -1,269 +0,0 @@ -""" -Tests for common methods of IBM translation models -""" - -import unittest -from collections import defaultdict - -from nltk.translate import AlignedSent, IBMModel -from nltk.translate.ibm_model import AlignmentInfo - - -class 
TestIBMModel(unittest.TestCase): - __TEST_SRC_SENTENCE = ["j'", "aime", "bien", "jambon"] - __TEST_TRG_SENTENCE = ["i", "love", "ham"] - - def test_vocabularies_are_initialized(self): - parallel_corpora = [ - AlignedSent(["one", "two", "three", "four"], ["un", "deux", "trois"]), - AlignedSent(["five", "one", "six"], ["quatre", "cinq", "six"]), - AlignedSent([], ["sept"]), - ] - - ibm_model = IBMModel(parallel_corpora) - self.assertEqual(len(ibm_model.src_vocab), 8) - self.assertEqual(len(ibm_model.trg_vocab), 6) - - def test_vocabularies_are_initialized_even_with_empty_corpora(self): - parallel_corpora = [] - - ibm_model = IBMModel(parallel_corpora) - self.assertEqual(len(ibm_model.src_vocab), 1) # addition of NULL token - self.assertEqual(len(ibm_model.trg_vocab), 0) - - def test_best_model2_alignment(self): - # arrange - sentence_pair = AlignedSent( - TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE - ) - # None and 'bien' have zero fertility - translation_table = { - "i": {"j'": 0.9, "aime": 0.05, "bien": 0.02, "jambon": 0.03, None: 0}, - "love": {"j'": 0.05, "aime": 0.9, "bien": 0.01, "jambon": 0.01, None: 0.03}, - "ham": {"j'": 0, "aime": 0.01, "bien": 0, "jambon": 0.99, None: 0}, - } - alignment_table = defaultdict( - lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.2))) - ) - - ibm_model = IBMModel([]) - ibm_model.translation_table = translation_table - ibm_model.alignment_table = alignment_table - - # act - a_info = ibm_model.best_model2_alignment(sentence_pair) - - # assert - self.assertEqual(a_info.alignment[1:], (1, 2, 4)) # 0th element unused - self.assertEqual(a_info.cepts, [[], [1], [2], [], [3]]) - - def test_best_model2_alignment_does_not_change_pegged_alignment(self): - # arrange - sentence_pair = AlignedSent( - TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE - ) - translation_table = { - "i": {"j'": 0.9, "aime": 0.05, "bien": 0.02, "jambon": 0.03, None: 0}, - "love": {"j'": 0.05, "aime": 0.9, "bien": 0.01, "jambon": 0.01, None: 0.03}, - "ham": {"j'": 0, "aime": 0.01, "bien": 0, "jambon": 0.99, None: 0}, - } - alignment_table = defaultdict( - lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.2))) - ) - - ibm_model = IBMModel([]) - ibm_model.translation_table = translation_table - ibm_model.alignment_table = alignment_table - - # act: force 'love' to be pegged to 'jambon' - a_info = ibm_model.best_model2_alignment(sentence_pair, 2, 4) - # assert - self.assertEqual(a_info.alignment[1:], (1, 4, 4)) - self.assertEqual(a_info.cepts, [[], [1], [], [], [2, 3]]) - - def test_best_model2_alignment_handles_fertile_words(self): - # arrange - sentence_pair = AlignedSent( - ["i", "really", ",", "really", "love", "ham"], - TestIBMModel.__TEST_SRC_SENTENCE, - ) - # 'bien' produces 2 target words: 'really' and another 'really' - translation_table = { - "i": {"j'": 0.9, "aime": 0.05, "bien": 0.02, "jambon": 0.03, None: 0}, - "really": {"j'": 0, "aime": 0, "bien": 0.9, "jambon": 0.01, None: 0.09}, - ",": {"j'": 0, "aime": 0, "bien": 0.3, "jambon": 0, None: 0.7}, - "love": {"j'": 0.05, "aime": 0.9, "bien": 0.01, "jambon": 0.01, None: 0.03}, - "ham": {"j'": 0, "aime": 0.01, "bien": 0, "jambon": 0.99, None: 0}, - } - alignment_table = defaultdict( - lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.2))) - ) - - ibm_model = IBMModel([]) - ibm_model.translation_table = translation_table - ibm_model.alignment_table = alignment_table - - # act - a_info = ibm_model.best_model2_alignment(sentence_pair) 
- - # assert - self.assertEqual(a_info.alignment[1:], (1, 3, 0, 3, 2, 4)) - self.assertEqual(a_info.cepts, [[3], [1], [5], [2, 4], [6]]) - - def test_best_model2_alignment_handles_empty_src_sentence(self): - # arrange - sentence_pair = AlignedSent(TestIBMModel.__TEST_TRG_SENTENCE, []) - ibm_model = IBMModel([]) - - # act - a_info = ibm_model.best_model2_alignment(sentence_pair) - - # assert - self.assertEqual(a_info.alignment[1:], (0, 0, 0)) - self.assertEqual(a_info.cepts, [[1, 2, 3]]) - - def test_best_model2_alignment_handles_empty_trg_sentence(self): - # arrange - sentence_pair = AlignedSent([], TestIBMModel.__TEST_SRC_SENTENCE) - ibm_model = IBMModel([]) - - # act - a_info = ibm_model.best_model2_alignment(sentence_pair) - - # assert - self.assertEqual(a_info.alignment[1:], ()) - self.assertEqual(a_info.cepts, [[], [], [], [], []]) - - def test_neighboring_finds_neighbor_alignments(self): - # arrange - a_info = AlignmentInfo( - (0, 3, 2), - (None, "des", "œufs", "verts"), - ("UNUSED", "green", "eggs"), - [[], [], [2], [1]], - ) - ibm_model = IBMModel([]) - - # act - neighbors = ibm_model.neighboring(a_info) - - # assert - neighbor_alignments = set() - for neighbor in neighbors: - neighbor_alignments.add(neighbor.alignment) - expected_alignments = { - # moves - (0, 0, 2), - (0, 1, 2), - (0, 2, 2), - (0, 3, 0), - (0, 3, 1), - (0, 3, 3), - # swaps - (0, 2, 3), - # original alignment - (0, 3, 2), - } - self.assertEqual(neighbor_alignments, expected_alignments) - - def test_neighboring_sets_neighbor_alignment_info(self): - # arrange - a_info = AlignmentInfo( - (0, 3, 2), - (None, "des", "œufs", "verts"), - ("UNUSED", "green", "eggs"), - [[], [], [2], [1]], - ) - ibm_model = IBMModel([]) - - # act - neighbors = ibm_model.neighboring(a_info) - - # assert: select a few particular alignments - for neighbor in neighbors: - if neighbor.alignment == (0, 2, 2): - moved_alignment = neighbor - elif neighbor.alignment == (0, 3, 2): - swapped_alignment = neighbor - - self.assertEqual(moved_alignment.cepts, [[], [], [1, 2], []]) - self.assertEqual(swapped_alignment.cepts, [[], [], [2], [1]]) - - def test_neighboring_returns_neighbors_with_pegged_alignment(self): - # arrange - a_info = AlignmentInfo( - (0, 3, 2), - (None, "des", "œufs", "verts"), - ("UNUSED", "green", "eggs"), - [[], [], [2], [1]], - ) - ibm_model = IBMModel([]) - - # act: peg 'eggs' to align with 'œufs' - neighbors = ibm_model.neighboring(a_info, 2) - - # assert - neighbor_alignments = set() - for neighbor in neighbors: - neighbor_alignments.add(neighbor.alignment) - expected_alignments = { - # moves - (0, 0, 2), - (0, 1, 2), - (0, 2, 2), - # no swaps - # original alignment - (0, 3, 2), - } - self.assertEqual(neighbor_alignments, expected_alignments) - - def test_hillclimb(self): - # arrange - initial_alignment = AlignmentInfo((0, 3, 2), None, None, None) - - def neighboring_mock(a, j): - if a.alignment == (0, 3, 2): - return { - AlignmentInfo((0, 2, 2), None, None, None), - AlignmentInfo((0, 1, 1), None, None, None), - } - elif a.alignment == (0, 2, 2): - return { - AlignmentInfo((0, 3, 3), None, None, None), - AlignmentInfo((0, 4, 4), None, None, None), - } - return set() - - def prob_t_a_given_s_mock(a): - prob_values = { - (0, 3, 2): 0.5, - (0, 2, 2): 0.6, - (0, 1, 1): 0.4, - (0, 3, 3): 0.6, - (0, 4, 4): 0.7, - } - return prob_values.get(a.alignment, 0.01) - - ibm_model = IBMModel([]) - ibm_model.neighboring = neighboring_mock - ibm_model.prob_t_a_given_s = prob_t_a_given_s_mock - - # act - best_alignment = 
ibm_model.hillclimb(initial_alignment) - - # assert: hill climbing goes from (0, 3, 2) -> (0, 2, 2) -> (0, 4, 4) - self.assertEqual(best_alignment.alignment, (0, 4, 4)) - - def test_sample(self): - # arrange - sentence_pair = AlignedSent( - TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE - ) - ibm_model = IBMModel([]) - ibm_model.prob_t_a_given_s = lambda x: 0.001 - - # act - samples, best_alignment = ibm_model.sample(sentence_pair) - - # assert - self.assertEqual(len(samples), 61) diff --git a/pipeline/nltk/test/unit/translate/test_meteor.py b/pipeline/nltk/test/unit/translate/test_meteor.py deleted file mode 100644 index 13d8e311c9337266a9cdc1b2ecfd67ef58cfb5b2..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/translate/test_meteor.py +++ /dev/null @@ -1,20 +0,0 @@ -import unittest - -from nltk.translate.meteor_score import meteor_score - - -class TestMETEOR(unittest.TestCase): - reference = [["this", "is", "a", "test"], ["this", "is" "test"]] - candidate = ["THIS", "Is", "a", "tEST"] - - def test_meteor(self): - score = meteor_score(self.reference, self.candidate, preprocess=str.lower) - assert score == 0.9921875 - - def test_reference_type_check(self): - str_reference = [" ".join(ref) for ref in self.reference] - self.assertRaises(TypeError, meteor_score, str_reference, self.candidate) - - def test_candidate_type_check(self): - str_candidate = " ".join(self.candidate) - self.assertRaises(TypeError, meteor_score, self.reference, str_candidate) diff --git a/pipeline/nltk/test/unit/translate/test_nist.py b/pipeline/nltk/test/unit/translate/test_nist.py deleted file mode 100644 index 1bb8829abaf40e892680f83b283bcb23884f386b..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/translate/test_nist.py +++ /dev/null @@ -1,36 +0,0 @@ -""" -Tests for NIST translation evaluation metric -""" - -import io -import unittest - -from nltk.data import find -from nltk.translate.nist_score import corpus_nist - - -class TestNIST(unittest.TestCase): - def test_sentence_nist(self): - ref_file = find("models/wmt15_eval/ref.ru") - hyp_file = find("models/wmt15_eval/google.ru") - mteval_output_file = find("models/wmt15_eval/mteval-13a.output") - - # Reads the NIST scores from the `mteval-13a.output` file. - # The order of the list corresponds to the order of the ngrams. - with open(mteval_output_file) as mteval_fin: - # The numbers are located in the last 4th line of the file. - # The first and 2nd item in the list are the score and system names. - mteval_nist_scores = map(float, mteval_fin.readlines()[-4].split()[1:-1]) - - with open(ref_file, encoding="utf8") as ref_fin: - with open(hyp_file, encoding="utf8") as hyp_fin: - # Whitespace tokenize the file. - # Note: split() automatically strip(). - hypotheses = list(map(lambda x: x.split(), hyp_fin)) - # Note that the corpus_bleu input is list of list of references. - references = list(map(lambda x: [x.split()], ref_fin)) - # Without smoothing. 
- for i, mteval_nist in zip(range(1, 10), mteval_nist_scores): - nltk_nist = corpus_nist(references, hypotheses, i) - # Check that the NIST scores difference is less than 0.5 - assert abs(mteval_nist - nltk_nist) < 0.05 diff --git a/pipeline/nltk/test/unit/translate/test_stack_decoder.py b/pipeline/nltk/test/unit/translate/test_stack_decoder.py deleted file mode 100644 index c3c3eb5de16fb522293c18fc07a16436f54a2940..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/unit/translate/test_stack_decoder.py +++ /dev/null @@ -1,294 +0,0 @@ -# Natural Language Toolkit: Stack decoder -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Tah Wei Hoon -# URL: -# For license information, see LICENSE.TXT - -""" -Tests for stack decoder -""" - -import unittest -from collections import defaultdict -from math import log - -from nltk.translate import PhraseTable, StackDecoder -from nltk.translate.stack_decoder import _Hypothesis, _Stack - - -class TestStackDecoder(unittest.TestCase): - def test_find_all_src_phrases(self): - # arrange - phrase_table = TestStackDecoder.create_fake_phrase_table() - stack_decoder = StackDecoder(phrase_table, None) - sentence = ("my", "hovercraft", "is", "full", "of", "eels") - - # act - src_phrase_spans = stack_decoder.find_all_src_phrases(sentence) - - # assert - self.assertEqual(src_phrase_spans[0], [2]) # 'my hovercraft' - self.assertEqual(src_phrase_spans[1], [2]) # 'hovercraft' - self.assertEqual(src_phrase_spans[2], [3]) # 'is' - self.assertEqual(src_phrase_spans[3], [5, 6]) # 'full of', 'full of eels' - self.assertFalse(src_phrase_spans[4]) # no entry starting with 'of' - self.assertEqual(src_phrase_spans[5], [6]) # 'eels' - - def test_distortion_score(self): - # arrange - stack_decoder = StackDecoder(None, None) - stack_decoder.distortion_factor = 0.5 - hypothesis = _Hypothesis() - hypothesis.src_phrase_span = (3, 5) - - # act - score = stack_decoder.distortion_score(hypothesis, (8, 10)) - - # assert - expected_score = log(stack_decoder.distortion_factor) * (8 - 5) - self.assertEqual(score, expected_score) - - def test_distortion_score_of_first_expansion(self): - # arrange - stack_decoder = StackDecoder(None, None) - stack_decoder.distortion_factor = 0.5 - hypothesis = _Hypothesis() - - # act - score = stack_decoder.distortion_score(hypothesis, (8, 10)) - - # assert - # expansion from empty hypothesis always has zero distortion cost - self.assertEqual(score, 0.0) - - def test_compute_future_costs(self): - # arrange - phrase_table = TestStackDecoder.create_fake_phrase_table() - language_model = TestStackDecoder.create_fake_language_model() - stack_decoder = StackDecoder(phrase_table, language_model) - sentence = ("my", "hovercraft", "is", "full", "of", "eels") - - # act - future_scores = stack_decoder.compute_future_scores(sentence) - - # assert - self.assertEqual( - future_scores[1][2], - ( - phrase_table.translations_for(("hovercraft",))[0].log_prob - + language_model.probability(("hovercraft",)) - ), - ) - self.assertEqual( - future_scores[0][2], - ( - phrase_table.translations_for(("my", "hovercraft"))[0].log_prob - + language_model.probability(("my", "hovercraft")) - ), - ) - - def test_compute_future_costs_for_phrases_not_in_phrase_table(self): - # arrange - phrase_table = TestStackDecoder.create_fake_phrase_table() - language_model = TestStackDecoder.create_fake_language_model() - stack_decoder = StackDecoder(phrase_table, language_model) - sentence = ("my", "hovercraft", "is", "full", "of", "eels") - - # act - future_scores = 
stack_decoder.compute_future_scores(sentence) - - # assert - self.assertEqual( - future_scores[1][3], # 'hovercraft is' is not in phrase table - future_scores[1][2] + future_scores[2][3], - ) # backoff - - def test_future_score(self): - # arrange: sentence with 8 words; words 2, 3, 4 already translated - hypothesis = _Hypothesis() - hypothesis.untranslated_spans = lambda _: [(0, 2), (5, 8)] # mock - future_score_table = defaultdict(lambda: defaultdict(float)) - future_score_table[0][2] = 0.4 - future_score_table[5][8] = 0.5 - stack_decoder = StackDecoder(None, None) - - # act - future_score = stack_decoder.future_score(hypothesis, future_score_table, 8) - - # assert - self.assertEqual(future_score, 0.4 + 0.5) - - def test_valid_phrases(self): - # arrange - hypothesis = _Hypothesis() - # mock untranslated_spans method - hypothesis.untranslated_spans = lambda _: [(0, 2), (3, 6)] - all_phrases_from = [[1, 4], [2], [], [5], [5, 6, 7], [], [7]] - - # act - phrase_spans = StackDecoder.valid_phrases(all_phrases_from, hypothesis) - - # assert - self.assertEqual(phrase_spans, [(0, 1), (1, 2), (3, 5), (4, 5), (4, 6)]) - - @staticmethod - def create_fake_phrase_table(): - phrase_table = PhraseTable() - phrase_table.add(("hovercraft",), ("",), 0.8) - phrase_table.add(("my", "hovercraft"), ("", ""), 0.7) - phrase_table.add(("my", "cheese"), ("", ""), 0.7) - phrase_table.add(("is",), ("",), 0.8) - phrase_table.add(("is",), ("",), 0.5) - phrase_table.add(("full", "of"), ("", ""), 0.01) - phrase_table.add(("full", "of", "eels"), ("", "", ""), 0.5) - phrase_table.add(("full", "of", "spam"), ("", ""), 0.5) - phrase_table.add(("eels",), ("",), 0.5) - phrase_table.add(("spam",), ("",), 0.5) - return phrase_table - - @staticmethod - def create_fake_language_model(): - # nltk.model should be used here once it is implemented - language_prob = defaultdict(lambda: -999.0) - language_prob[("my",)] = log(0.1) - language_prob[("hovercraft",)] = log(0.1) - language_prob[("is",)] = log(0.1) - language_prob[("full",)] = log(0.1) - language_prob[("of",)] = log(0.1) - language_prob[("eels",)] = log(0.1) - language_prob[("my", "hovercraft")] = log(0.3) - language_model = type( - "", (object,), {"probability": lambda _, phrase: language_prob[phrase]} - )() - return language_model - - -class TestHypothesis(unittest.TestCase): - def setUp(self): - root = _Hypothesis() - child = _Hypothesis( - raw_score=0.5, - src_phrase_span=(3, 7), - trg_phrase=("hello", "world"), - previous=root, - ) - grandchild = _Hypothesis( - raw_score=0.4, - src_phrase_span=(1, 2), - trg_phrase=("and", "goodbye"), - previous=child, - ) - self.hypothesis_chain = grandchild - - def test_translation_so_far(self): - # act - translation = self.hypothesis_chain.translation_so_far() - - # assert - self.assertEqual(translation, ["hello", "world", "and", "goodbye"]) - - def test_translation_so_far_for_empty_hypothesis(self): - # arrange - hypothesis = _Hypothesis() - - # act - translation = hypothesis.translation_so_far() - - # assert - self.assertEqual(translation, []) - - def test_total_translated_words(self): - # act - total_translated_words = self.hypothesis_chain.total_translated_words() - - # assert - self.assertEqual(total_translated_words, 5) - - def test_translated_positions(self): - # act - translated_positions = self.hypothesis_chain.translated_positions() - - # assert - translated_positions.sort() - self.assertEqual(translated_positions, [1, 3, 4, 5, 6]) - - def test_untranslated_spans(self): - # act - untranslated_spans = 
self.hypothesis_chain.untranslated_spans(10) - - # assert - self.assertEqual(untranslated_spans, [(0, 1), (2, 3), (7, 10)]) - - def test_untranslated_spans_for_empty_hypothesis(self): - # arrange - hypothesis = _Hypothesis() - - # act - untranslated_spans = hypothesis.untranslated_spans(10) - - # assert - self.assertEqual(untranslated_spans, [(0, 10)]) - - -class TestStack(unittest.TestCase): - def test_push_bumps_off_worst_hypothesis_when_stack_is_full(self): - # arrange - stack = _Stack(3) - poor_hypothesis = _Hypothesis(0.01) - - # act - stack.push(_Hypothesis(0.2)) - stack.push(poor_hypothesis) - stack.push(_Hypothesis(0.1)) - stack.push(_Hypothesis(0.3)) - - # assert - self.assertFalse(poor_hypothesis in stack) - - def test_push_removes_hypotheses_that_fall_below_beam_threshold(self): - # arrange - stack = _Stack(3, 0.5) - poor_hypothesis = _Hypothesis(0.01) - worse_hypothesis = _Hypothesis(0.009) - - # act - stack.push(poor_hypothesis) - stack.push(worse_hypothesis) - stack.push(_Hypothesis(0.9)) # greatly superior hypothesis - - # assert - self.assertFalse(poor_hypothesis in stack) - self.assertFalse(worse_hypothesis in stack) - - def test_push_does_not_add_hypothesis_that_falls_below_beam_threshold(self): - # arrange - stack = _Stack(3, 0.5) - poor_hypothesis = _Hypothesis(0.01) - - # act - stack.push(_Hypothesis(0.9)) # greatly superior hypothesis - stack.push(poor_hypothesis) - - # assert - self.assertFalse(poor_hypothesis in stack) - - def test_best_returns_the_best_hypothesis(self): - # arrange - stack = _Stack(3) - best_hypothesis = _Hypothesis(0.99) - - # act - stack.push(_Hypothesis(0.0)) - stack.push(best_hypothesis) - stack.push(_Hypothesis(0.5)) - - # assert - self.assertEqual(stack.best(), best_hypothesis) - - def test_best_returns_none_when_stack_is_empty(self): - # arrange - stack = _Stack(3) - - # assert - self.assertEqual(stack.best(), None) diff --git a/pipeline/nltk/test/util.doctest b/pipeline/nltk/test/util.doctest deleted file mode 100644 index b5dce4e3ce1f2ab30ee1083d969c666889e6c4ec..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/util.doctest +++ /dev/null @@ -1,47 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -================= -Utility functions -================= - - >>> from nltk.util import * - >>> from nltk.tree import Tree - - >>> print_string("This is a long string, therefore it should break", 25) - This is a long string, - therefore it should break - - >>> re_show("[a-z]+", "sdf123") - {sdf}123 - - >>> tree = Tree(5, - ... [Tree(4, [Tree(2, [1, 3])]), - ... Tree(8, [Tree(6, [7]), 9])]) - >>> for x in breadth_first(tree): - ... if isinstance(x, int): print(x) - ... else: print(x.label()) - 5 - 4 - 8 - 2 - 6 - 9 - 1 - 3 - 7 - >>> for x in breadth_first(tree, maxdepth=2): - ... if isinstance(x, int): print(x) - ... else: print(x.label()) - 5 - 4 - 8 - 2 - 6 - 9 - - >>> invert_dict({1: 2}) - defaultdict(<... 'list'>, {2: 1}) - - >>> invert_dict({1: [3, 4, 5]}) - defaultdict(<... 'list'>, {3: [1], 4: [1], 5: [1]}) diff --git a/pipeline/nltk/test/wordnet.doctest b/pipeline/nltk/test/wordnet.doctest deleted file mode 100644 index 0249e6564a73155051050700d76c0014b4086b0e..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/wordnet.doctest +++ /dev/null @@ -1,828 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. 
For license information, see LICENSE.TXT - -================= -WordNet Interface -================= - -WordNet is just another NLTK corpus reader, and can be imported like this: - - >>> from nltk.corpus import wordnet - -For more compact code, we recommend: - - >>> from nltk.corpus import wordnet as wn - ------ -Words ------ - -Look up a word using ``synsets()``; this function has an optional ``pos`` argument -which lets you constrain the part of speech of the word: - - >>> wn.synsets('dog') - [Synset('dog.n.01'), Synset('frump.n.01'), Synset('dog.n.03'), Synset('cad.n.01'), - Synset('frank.n.02'), Synset('pawl.n.01'), Synset('andiron.n.01'), Synset('chase.v.01')] - >>> wn.synsets('dog', pos=wn.VERB) - [Synset('chase.v.01')] - -The other parts of speech are ``NOUN``, ``ADJ`` and ``ADV``. -A synset is identified with a 3-part name of the form: word.pos.nn: - - >>> wn.synset('dog.n.01') - Synset('dog.n.01') - >>> print(wn.synset('dog.n.01').definition()) - a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds - >>> len(wn.synset('dog.n.01').examples()) - 1 - >>> print(wn.synset('dog.n.01').examples()[0]) - the dog barked all night - >>> wn.synset('dog.n.01').lemmas() - [Lemma('dog.n.01.dog'), Lemma('dog.n.01.domestic_dog'), Lemma('dog.n.01.Canis_familiaris')] - >>> [str(lemma.name()) for lemma in wn.synset('dog.n.01').lemmas()] - ['dog', 'domestic_dog', 'Canis_familiaris'] - >>> wn.lemma('dog.n.01.dog').synset() - Synset('dog.n.01') - -The WordNet corpus reader gives access to the Open Multilingual -WordNet, using ISO-639 language codes. These languages are not -loaded by default, but only lazily, when needed. - - >>> wn.langs() - ['eng'] - - >>> wn.synsets(b'\xe7\x8a\xac'.decode('utf-8'), lang='jpn') - [Synset('dog.n.01'), Synset('spy.n.01')] - - >>> wn.synset('spy.n.01').lemma_names('jpn') - ['いぬ', 'まわし者', 'スパイ', '回し者', '回者', '密偵', - '工作員', '廻し者', '廻者', '探', '探り', '犬', '秘密捜査員', - '諜報員', '諜者', '間者', '間諜', '隠密'] - - >>> sorted(wn.langs()) - ['als', 'arb', 'bul', 'cat', 'cmn', 'dan', 'ell', 'eng', 'eus', - 'fin', 'fra', 'glg', 'heb', 'hrv', 'ind', 'isl', 'ita', 'ita_iwn', - 'jpn', 'lit', 'nld', 'nno', 'nob', 'pol', 'por', 'ron', 'slk', - 'slv', 'spa', 'swe', 'tha', 'zsm'] - - >>> wn.synset('dog.n.01').lemma_names('ita') - ['Canis_familiaris', 'cane'] - >>> wn.lemmas('cane', lang='ita') - [Lemma('dog.n.01.cane'), Lemma('cramp.n.02.cane'), Lemma('hammer.n.01.cane'), Lemma('bad_person.n.01.cane'), - Lemma('incompetent.n.01.cane')] - >>> sorted(wn.synset('dog.n.01').lemmas('dan')) - [Lemma('dog.n.01.hund'), Lemma('dog.n.01.k\xf8ter'), - Lemma('dog.n.01.vovhund'), Lemma('dog.n.01.vovse')] - - >>> sorted(wn.synset('dog.n.01').lemmas('por')) - [Lemma('dog.n.01.cachorra'), Lemma('dog.n.01.cachorro'), Lemma('dog.n.01.cadela'), Lemma('dog.n.01.c\xe3o')] - - >>> dog_lemma = wn.lemma(b'dog.n.01.c\xc3\xa3o'.decode('utf-8'), lang='por') - >>> dog_lemma - Lemma('dog.n.01.c\xe3o') - >>> dog_lemma.lang() - 'por' - >>> len(list(wordnet.all_lemma_names(pos='n', lang='jpn'))) - 66031 - -The synonyms of a word are returned as a nested list of synonyms of the different senses of -the input word in the given language, since these different senses are not mutual synonyms: - - >>> wn.synonyms('car') - [['auto', 'automobile', 'machine', 'motorcar'], ['railcar', 'railroad_car', 'railway_car'], ['gondola'], ['elevator_car'], ['cable_car']] - >>> wn.synonyms('coche', lang='spa') - [['auto', 'automóvil', 'carro', 'máquina', 
'turismo', 'vehículo'], ['automotor', 'vagón'], ['vagón', 'vagón_de_pasajeros']] - - -------- -Synsets -------- - -`Synset`: a set of synonyms that share a common meaning. - - >>> dog = wn.synset('dog.n.01') - >>> dog.hypernyms() - [Synset('canine.n.02'), Synset('domestic_animal.n.01')] - >>> dog.hyponyms() - [Synset('basenji.n.01'), Synset('corgi.n.01'), Synset('cur.n.01'), Synset('dalmatian.n.02'), ...] - >>> dog.member_holonyms() - [Synset('canis.n.01'), Synset('pack.n.06')] - >>> dog.root_hypernyms() - [Synset('entity.n.01')] - >>> wn.synset('dog.n.01').lowest_common_hypernyms(wn.synset('cat.n.01')) - [Synset('carnivore.n.01')] - -Each synset contains one or more lemmas, which represent a specific -sense of a specific word. - -Note that some relations are defined by WordNet only over Lemmas: - - >>> good = wn.synset('good.a.01') - >>> good.antonyms() - Traceback (most recent call last): - File "", line 1, in - AttributeError: 'Synset' object has no attribute 'antonyms' - >>> good.lemmas()[0].antonyms() - [Lemma('bad.a.01.bad')] - -The relations that are currently defined in this way are `antonyms`, -`derivationally_related_forms` and `pertainyms`. - -If you know the byte offset used to identify a synset in the original -Princeton WordNet data file, you can use that to instantiate the synset -in NLTK: - - >>> wn.synset_from_pos_and_offset('n', 4543158) - Synset('wagon.n.01') - -Likewise, instantiate a synset from a known sense key: - >>> wn.synset_from_sense_key("driving%1:04:03::") - Synset('drive.n.06') - - ------- -Lemmas ------- - - >>> eat = wn.lemma('eat.v.03.eat') - >>> eat - Lemma('feed.v.06.eat') - >>> print(eat.key()) - eat%2:34:02:: - >>> eat.count() - 4 - >>> wn.lemma_from_key(eat.key()) - Lemma('feed.v.06.eat') - >>> wn.lemma_from_key(eat.key()).synset() - Synset('feed.v.06') - >>> wn.lemma_from_key('feebleminded%5:00:00:retarded:00') - Lemma('backward.s.03.feebleminded') - >>> for lemma in wn.synset('eat.v.03').lemmas(): - ... print(lemma, lemma.count()) - ... - Lemma('feed.v.06.feed') 3 - Lemma('feed.v.06.eat') 4 - >>> for lemma in wn.lemmas('eat', 'v'): - ... print(lemma, lemma.count()) - ... - Lemma('eat.v.01.eat') 61 - Lemma('eat.v.02.eat') 13 - Lemma('feed.v.06.eat') 4 - Lemma('eat.v.04.eat') 0 - Lemma('consume.v.05.eat') 0 - Lemma('corrode.v.01.eat') 0 - >>> wn.lemma('jump.v.11.jump') - Lemma('jump.v.11.jump') - -Lemmas can also have relations between them: - - >>> vocal = wn.lemma('vocal.a.01.vocal') - >>> vocal.derivationally_related_forms() - [Lemma('vocalize.v.02.vocalize')] - >>> vocal.pertainyms() - [Lemma('voice.n.02.voice')] - >>> vocal.antonyms() - [Lemma('instrumental.a.01.instrumental')] - -The three relations above exist only on lemmas, not on synsets. - ------------ -Verb Frames ------------ - - >>> wn.synset('think.v.01').frame_ids() - [5, 9] - >>> for lemma in wn.synset('think.v.01').lemmas(): - ... print(lemma, lemma.frame_ids()) - ... print(" | ".join(lemma.frame_strings())) - ... - Lemma('think.v.01.think') [5, 9] - Something think something Adjective/Noun | Somebody think somebody - Lemma('think.v.01.believe') [5, 9] - Something believe something Adjective/Noun | Somebody believe somebody - Lemma('think.v.01.consider') [5, 9] - Something consider something Adjective/Noun | Somebody consider somebody - Lemma('think.v.01.conceive') [5, 9] - Something conceive something Adjective/Noun | Somebody conceive somebody - >>> wn.synset('stretch.v.02').frame_ids() - [8] - >>> for lemma in wn.synset('stretch.v.02').lemmas(): - ... 
print(lemma, lemma.frame_ids()) - ... print(" | ".join(lemma.frame_strings())) - ... - Lemma('stretch.v.02.stretch') [8, 2] - Somebody stretch something | Somebody stretch - Lemma('stretch.v.02.extend') [8] - Somebody extend something - - ----------- -Similarity ----------- - - >>> dog = wn.synset('dog.n.01') - >>> cat = wn.synset('cat.n.01') - - >>> hit = wn.synset('hit.v.01') - >>> slap = wn.synset('slap.v.01') - - -``synset1.path_similarity(synset2):`` -Return a score denoting how similar two word senses are, based on the -shortest path that connects the senses in the is-a (hypernym/hypnoym) -taxonomy. The score is in the range 0 to 1. By default, there is now -a fake root node added to verbs so for cases where previously a path -could not be found---and None was returned---it should return a value. -The old behavior can be achieved by setting simulate_root to be False. -A score of 1 represents identity i.e. comparing a sense with itself -will return 1. - - >>> dog.path_similarity(cat) - 0.2... - - >>> hit.path_similarity(slap) - 0.142... - - >>> wn.path_similarity(hit, slap) - 0.142... - - >>> print(hit.path_similarity(slap, simulate_root=False)) - None - - >>> print(wn.path_similarity(hit, slap, simulate_root=False)) - None - -``synset1.lch_similarity(synset2):`` -Leacock-Chodorow Similarity: -Return a score denoting how similar two word senses are, based on the -shortest path that connects the senses (as above) and the maximum depth -of the taxonomy in which the senses occur. The relationship is given -as -log(p/2d) where p is the shortest path length and d the taxonomy -depth. - - >>> dog.lch_similarity(cat) - 2.028... - - >>> hit.lch_similarity(slap) - 1.312... - - >>> wn.lch_similarity(hit, slap) - 1.312... - - >>> print(hit.lch_similarity(slap, simulate_root=False)) - None - - >>> print(wn.lch_similarity(hit, slap, simulate_root=False)) - None - -``synset1.wup_similarity(synset2):`` -Wu-Palmer Similarity: -Return a score denoting how similar two word senses are, based on the -depth of the two senses in the taxonomy and that of their Least Common -Subsumer (most specific ancestor node). Note that at this time the -scores given do **not** always agree with those given by Pedersen's Perl -implementation of Wordnet Similarity. - -The LCS does not necessarily feature in the shortest path connecting the -two senses, as it is by definition the common ancestor deepest in the -taxonomy, not closest to the two senses. Typically, however, it will so -feature. Where multiple candidates for the LCS exist, that whose -shortest path to the root node is the longest will be selected. Where -the LCS has multiple paths to the root, the longer path is used for -the purposes of the calculation. - - >>> dog.wup_similarity(cat) - 0.857... - - >>> hit.wup_similarity(slap) - 0.25 - - >>> wn.wup_similarity(hit, slap) - 0.25 - - >>> print(hit.wup_similarity(slap, simulate_root=False)) - None - - >>> print(wn.wup_similarity(hit, slap, simulate_root=False)) - None - -``wordnet_ic`` -Information Content: -Load an information content file from the wordnet_ic corpus. - - >>> from nltk.corpus import wordnet_ic - >>> brown_ic = wordnet_ic.ic('ic-brown.dat') - >>> semcor_ic = wordnet_ic.ic('ic-semcor.dat') - -Or you can create an information content dictionary from a corpus (or -anything that has a words() method). 
- - >>> from nltk.corpus import genesis - >>> genesis_ic = wn.ic(genesis, False, 0.0) - -``synset1.res_similarity(synset2, ic):`` -Resnik Similarity: -Return a score denoting how similar two word senses are, based on the -Information Content (IC) of the Least Common Subsumer (most specific -ancestor node). Note that for any similarity measure that uses -information content, the result is dependent on the corpus used to -generate the information content and the specifics of how the -information content was created. - - >>> dog.res_similarity(cat, brown_ic) - 7.911... - >>> dog.res_similarity(cat, genesis_ic) - 7.204... - -``synset1.jcn_similarity(synset2, ic):`` -Jiang-Conrath Similarity -Return a score denoting how similar two word senses are, based on the -Information Content (IC) of the Least Common Subsumer (most specific -ancestor node) and that of the two input Synsets. The relationship is -given by the equation 1 / (IC(s1) + IC(s2) - 2 * IC(lcs)). - - >>> dog.jcn_similarity(cat, brown_ic) - 0.449... - >>> dog.jcn_similarity(cat, genesis_ic) - 0.285... - -``synset1.lin_similarity(synset2, ic):`` -Lin Similarity: -Return a score denoting how similar two word senses are, based on the -Information Content (IC) of the Least Common Subsumer (most specific -ancestor node) and that of the two input Synsets. The relationship is -given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)). - - >>> dog.lin_similarity(cat, semcor_ic) - 0.886... - - ---------------------- -Access to all Synsets ---------------------- - -Iterate over all the noun synsets: - - >>> for synset in list(wn.all_synsets('n'))[:10]: - ... print(synset) - ... - Synset('entity.n.01') - Synset('physical_entity.n.01') - Synset('abstraction.n.06') - Synset('thing.n.12') - Synset('object.n.01') - Synset('whole.n.02') - Synset('congener.n.03') - Synset('living_thing.n.01') - Synset('organism.n.01') - Synset('benthos.n.02') - -Get all synsets for this word, possibly restricted by POS: - - >>> wn.synsets('dog') - [Synset('dog.n.01'), Synset('frump.n.01'), Synset('dog.n.03'), Synset('cad.n.01'), ...] - >>> wn.synsets('dog', pos='v') - [Synset('chase.v.01')] - -Walk through the noun synsets looking at their hypernyms: - - >>> from itertools import islice - >>> for synset in islice(wn.all_synsets('n'), 5): - ... print(synset, synset.hypernyms()) - ... 
- Synset('entity.n.01') [] - Synset('physical_entity.n.01') [Synset('entity.n.01')] - Synset('abstraction.n.06') [Synset('entity.n.01')] - Synset('thing.n.12') [Synset('physical_entity.n.01')] - Synset('object.n.01') [Synset('physical_entity.n.01')] - - ------- -Morphy ------- - -Look up forms not in WordNet, with the help of Morphy: - - >>> wn.morphy('denied', wn.NOUN) - >>> print(wn.morphy('denied', wn.VERB)) - deny - >>> wn.synsets('denied', wn.NOUN) - [] - >>> wn.synsets('denied', wn.VERB) - [Synset('deny.v.01'), Synset('deny.v.02'), Synset('deny.v.03'), Synset('deny.v.04'), - Synset('deny.v.05'), Synset('traverse.v.03'), Synset('deny.v.07')] - -Morphy uses a combination of inflectional ending rules and exception -lists to handle a variety of different possibilities: - - >>> print(wn.morphy('dogs')) - dog - >>> print(wn.morphy('churches')) - church - >>> print(wn.morphy('aardwolves')) - aardwolf - >>> print(wn.morphy('abaci')) - abacus - >>> print(wn.morphy('book', wn.NOUN)) - book - >>> wn.morphy('hardrock', wn.ADV) - >>> wn.morphy('book', wn.ADJ) - >>> wn.morphy('his', wn.NOUN) - >>> - ---------------- -Synset Closures ---------------- - -Compute transitive closures of synsets - - >>> dog = wn.synset('dog.n.01') - >>> hypo = lambda s: s.hyponyms() - >>> hyper = lambda s: s.hypernyms() - >>> list(dog.closure(hypo, depth=1)) == dog.hyponyms() - True - >>> list(dog.closure(hyper, depth=1)) == dog.hypernyms() - True - >>> list(dog.closure(hypo)) - [Synset('basenji.n.01'), Synset('corgi.n.01'), Synset('cur.n.01'), - Synset('dalmatian.n.02'), Synset('great_pyrenees.n.01'), - Synset('griffon.n.02'), Synset('hunting_dog.n.01'), Synset('lapdog.n.01'), - Synset('leonberg.n.01'), Synset('mexican_hairless.n.01'), - Synset('newfoundland.n.01'), Synset('pooch.n.01'), Synset('poodle.n.01'), ...] - >>> list(dog.closure(hyper)) - [Synset('canine.n.02'), Synset('domestic_animal.n.01'), Synset('carnivore.n.01'), Synset('animal.n.01'), - Synset('placental.n.01'), Synset('organism.n.01'), Synset('mammal.n.01'), Synset('living_thing.n.01'), - Synset('vertebrate.n.01'), Synset('whole.n.02'), Synset('chordate.n.01'), Synset('object.n.01'), - Synset('physical_entity.n.01'), Synset('entity.n.01')] - - ----------------- -Regression Tests ----------------- - -Bug 85: morphy returns the base form of a word, if it's input is given -as a base form for a POS for which that word is not defined: - - >>> wn.synsets('book', wn.NOUN) - [Synset('book.n.01'), Synset('book.n.02'), Synset('record.n.05'), Synset('script.n.01'), Synset('ledger.n.01'), Synset('book.n.06'), Synset('book.n.07'), Synset('koran.n.01'), Synset('bible.n.01'), Synset('book.n.10'), Synset('book.n.11')] - >>> wn.synsets('book', wn.ADJ) - [] - >>> wn.morphy('book', wn.NOUN) - 'book' - >>> wn.morphy('book', wn.ADJ) - >>> - -Bug 160: wup_similarity breaks when the two synsets have no common hypernym - - >>> t = wn.synsets('picasso')[0] - >>> m = wn.synsets('male')[1] - >>> t.wup_similarity(m) - 0.631... - -Issue #2278: wup_similarity not commutative when comparing a noun and a verb. -Patch #2650 resolved this error. As a result, the output of the following use of wup_similarity no longer returns None. - - >>> t = wn.synsets('titan')[1] - >>> s = wn.synsets('say', wn.VERB)[0] - >>> t.wup_similarity(s) - 0.142... - -Bug 21: "instance of" not included in LCS (very similar to bug 160) - - >>> a = wn.synsets("writings")[0] - >>> b = wn.synsets("scripture")[0] - >>> brown_ic = wordnet_ic.ic('ic-brown.dat') - >>> a.jcn_similarity(b, brown_ic) - 0.175... 
- -Bug 221: Verb root IC is zero - - >>> from nltk.corpus.reader.wordnet import information_content - >>> s = wn.synsets('say', wn.VERB)[0] - >>> information_content(s, brown_ic) - 4.623... - -Bug 161: Comparison between WN keys/lemmas should not be case sensitive - - >>> k = wn.synsets("jefferson")[0].lemmas()[0].key() - >>> wn.lemma_from_key(k) - Lemma('jefferson.n.01.Jefferson') - >>> wn.lemma_from_key(k.upper()) - Lemma('jefferson.n.01.Jefferson') - -Bug 99: WordNet root_hypernyms gives incorrect results - - >>> from nltk.corpus import wordnet as wn - >>> for s in wn.all_synsets(wn.NOUN): - ... if s.root_hypernyms()[0] != wn.synset('entity.n.01'): - ... print(s, s.root_hypernyms()) - ... - >>> - -Bug 382: JCN Division by zero error - - >>> tow = wn.synset('tow.v.01') - >>> shlep = wn.synset('shlep.v.02') - >>> from nltk.corpus import wordnet_ic - >>> brown_ic = wordnet_ic.ic('ic-brown.dat') - >>> tow.jcn_similarity(shlep, brown_ic) - 1...e+300 - -Bug 428: Depth is zero for instance nouns - - >>> s = wn.synset("lincoln.n.01") - >>> s.max_depth() > 0 - True - -Bug 429: Information content smoothing used old reference to all_synsets - - >>> genesis_ic = wn.ic(genesis, True, 1.0) - -Bug 430: all_synsets used wrong pos lookup when synsets were cached - - >>> for ii in wn.all_synsets(): pass - >>> for ii in wn.all_synsets(): pass - -Bug 470: shortest_path_distance ignored instance hypernyms - - >>> google = wordnet.synsets("google")[0] - >>> earth = wordnet.synsets("earth")[0] - >>> google.wup_similarity(earth) - 0.1... - -Bug 484: similarity metrics returned -1 instead of None for no LCS - - >>> t = wn.synsets('fly', wn.VERB)[0] - >>> s = wn.synsets('say', wn.VERB)[0] - >>> print(s.shortest_path_distance(t)) - None - >>> print(s.path_similarity(t, simulate_root=False)) - None - >>> print(s.lch_similarity(t, simulate_root=False)) - None - >>> print(s.wup_similarity(t, simulate_root=False)) - None - -Bug 427: "pants" does not return all the senses it should - - >>> from nltk.corpus import wordnet - >>> wordnet.synsets("pants",'n') - [Synset('bloomers.n.01'), Synset('pant.n.01'), Synset('trouser.n.01'), Synset('gasp.n.01')] - -Bug 482: Some nouns not being lemmatised by WordNetLemmatizer().lemmatize - - >>> from nltk.stem.wordnet import WordNetLemmatizer - >>> WordNetLemmatizer().lemmatize("eggs", pos="n") - 'egg' - >>> WordNetLemmatizer().lemmatize("legs", pos="n") - 'leg' - -Bug 284: instance hypernyms not used in similarity calculations - - >>> wn.synset('john.n.02').lch_similarity(wn.synset('dog.n.01')) - 1.335... - >>> wn.synset('john.n.02').wup_similarity(wn.synset('dog.n.01')) - 0.571... - >>> wn.synset('john.n.02').res_similarity(wn.synset('dog.n.01'), brown_ic) - 2.224... - >>> wn.synset('john.n.02').jcn_similarity(wn.synset('dog.n.01'), brown_ic) - 0.075... - >>> wn.synset('john.n.02').lin_similarity(wn.synset('dog.n.01'), brown_ic) - 0.252... 
- >>> wn.synset('john.n.02').hypernym_paths() - [[Synset('entity.n.01'), ..., Synset('john.n.02')]] - -Issue 541: add domains to wordnet - - >>> wn.synset('code.n.03').topic_domains() - [Synset('computer_science.n.01')] - >>> wn.synset('pukka.a.01').region_domains() - [Synset('india.n.01')] - >>> wn.synset('freaky.a.01').usage_domains() - [Synset('slang.n.02')] - -Issue 629: wordnet failures when python run with -O optimizations - - >>> # Run the test suite with python -O to check this - >>> wn.synsets("brunch") - [Synset('brunch.n.01'), Synset('brunch.v.01')] - -Issue 395: wordnet returns incorrect result for lowest_common_hypernyms of chef and policeman - - >>> wn.synset('policeman.n.01').lowest_common_hypernyms(wn.synset('chef.n.01')) - [Synset('person.n.01')] - -Bug https://github.com/nltk/nltk/issues/1641: Non-English lemmas containing capital letters cannot be looked up using wordnet.lemmas() or wordnet.synsets() - - >>> wn.lemmas('Londres', lang='fra') - [Lemma('united_kingdom.n.01.Londres'), Lemma('london.n.01.Londres'), Lemma('london.n.02.Londres')] - >>> wn.lemmas('londres', lang='fra') - [Lemma('united_kingdom.n.01.Londres'), Lemma('london.n.01.Londres'), Lemma('london.n.02.Londres')] - -Patch-1 https://github.com/nltk/nltk/pull/2065 Adding 3 functions (relations) to WordNet class - - >>> wn.synsets("computer_science")[0].in_topic_domains()[2] - Synset('access_time.n.01') - >>> wn.synsets("France")[0].in_region_domains()[18] - Synset('french.n.01') - >>> wn.synsets("slang")[1].in_usage_domains()[18] - Synset('can-do.s.01') - -Issue 2721: WordNetCorpusReader.ic() does not add smoothing to N - - >>> class FakeCorpus: - ... def words(self): return ['word'] - ... - >>> fake_ic = wn.ic(FakeCorpus(), False, 1.0) - >>> word = wn.synset('word.n.01') - >>> information_content(word, fake_ic) > 0 - True - -Issue 3077: Incorrect part-of-speech filtering in all_synsets - - >>> next(wn.all_synsets(pos="a")) - Synset('able.a.01') - >>> next(wn.all_synsets(pos="s")) - Synset('emergent.s.02') - >>> wn.add_omw() - >>> next(wn.all_synsets(lang="hrv")) - Synset('able.a.01') - >>> next(wn.all_synsets(lang="hrv", pos="n")) - Synset('entity.n.01') - >>> next(wn.all_synsets(lang="hrv", pos="v")) - Synset('breathe.v.01') - >>> next(wn.all_synsets(lang="hrv", pos="s")) - Synset('ideological.s.02') - >>> next(wn.all_synsets(lang="hrv", pos="a")) - Synset('able.a.01') - - ------------------------------------------------- -Endlessness vs. intractability in relation trees ------------------------------------------------- - -1. Endlessness --------------- - -Until NLTK v. 3.5, the ``tree()`` function looped forever on symmetric -relations (verb_groups, attributes, and most also_sees). 
But in -the current version, ``tree()`` now detects and discards these cycles: - - >>> from pprint import pprint - >>> pprint(wn.synset('bound.a.01').tree(lambda s:s.also_sees())) - [Synset('bound.a.01'), - [Synset('unfree.a.02'), - [Synset('confined.a.02'), - [Synset('restricted.a.01'), [Synset('classified.a.02')]]], - [Synset('dependent.a.01')], - [Synset('restricted.a.01'), - [Synset('classified.a.02')], - [Synset('confined.a.02')]]]] - -Specifying the "cut_mark" parameter increases verbosity, so that the cycles -are mentioned in the output, together with the level where they occur: - - >>> pprint(wn.synset('bound.a.01').tree(lambda s:s.also_sees(),cut_mark='...')) - [Synset('bound.a.01'), - [Synset('unfree.a.02'), - "Cycle(Synset('bound.a.01'),-3,...)", - [Synset('confined.a.02'), - [Synset('restricted.a.01'), - [Synset('classified.a.02')], - "Cycle(Synset('confined.a.02'),-5,...)", - "Cycle(Synset('unfree.a.02'),-5,...)"], - "Cycle(Synset('unfree.a.02'),-4,...)"], - [Synset('dependent.a.01'), "Cycle(Synset('unfree.a.02'),-4,...)"], - [Synset('restricted.a.01'), - [Synset('classified.a.02')], - [Synset('confined.a.02'), - "Cycle(Synset('restricted.a.01'),-5,...)", - "Cycle(Synset('unfree.a.02'),-5,...)"], - "Cycle(Synset('unfree.a.02'),-4,...)"]]] - - -2. Intractability ------------------ - -However, even after discarding the infinite cycles, some trees can remain -intractable, due to combinatorial explosion in a relation. This happens in -WordNet, because the ``also_sees()`` relation has a big Strongly Connected -Component (_SCC_) consisting in 758 synsets, where any member node is -transitively connected by the same relation, to all other members of the -same SCC. This produces intractable relation trees for each of these 758 -synsets, i. e. trees that are too big to compute or display on any computer. - -For example, the synset 'concrete.a.01' is a member of the largest SCC, -so its ``also_sees()`` tree is intractable, and can normally only be handled -by limiting the ``depth`` parameter to display a small number of levels: - - >>> from pprint import pprint - >>> pprint(wn.synset('concrete.a.01').tree(lambda s:s.also_sees(),cut_mark='...',depth=2)) - [Synset('concrete.a.01'), - [Synset('practical.a.01'), - "Cycle(Synset('concrete.a.01'),0,...)", - [Synset('possible.a.01'), '...'], - [Synset('realistic.a.01'), '...'], - [Synset('serviceable.a.01'), '...']], - [Synset('real.a.01'), - "Cycle(Synset('concrete.a.01'),0,...)", - [Synset('genuine.a.01'), '...'], - [Synset('realistic.a.01'), '...'], - [Synset('sincere.a.01'), '...']], - [Synset('tangible.a.01'), "Cycle(Synset('concrete.a.01'),0,...)"]] - - -2.1 First solution: ``acyclic_tree()`` -...................................... - -On the other hand, the new ``acyclic_tree()`` function is able to also handle -the intractable cases. The ``also_sees()`` acyclic tree of 'concrete.a.01' is -several hundred lines long, so here is a simpler example, concerning a much -smaller SCC: counting only five members, the SCC that includes 'bound.a.01' -is tractable with the normal ``tree()`` function, as seen above. - -But while ``tree()`` only prunes redundancy within local branches, ``acyclic_tree()`` -prunes the tree globally, thus discarding any additional redundancy, and -produces a tree that includes all reachable nodes (i.e., a **spanning tree**). 
-This tree is **minimal** because it includes the reachable nodes only once, -but it is not necessarily a **Minimum Spanning Tree** (MST), because the -Depth-first search strategy does not guarantee that nodes are reached -through the lowest number of links (as Breadth-first search would). - - >>> pprint(wn.synset('bound.a.01').acyclic_tree(lambda s:s.also_sees())) - [Synset('bound.a.01'), - [Synset('unfree.a.02'), - [Synset('confined.a.02'), - [Synset('restricted.a.01'), [Synset('classified.a.02')]]], - [Synset('dependent.a.01')]]] - -Again, specifying the ``cut_mark`` parameter increases verbosity, so that the -cycles are mentioned in the output, together with the level where they occur: - - >>> pprint(wn.synset('bound.a.01').acyclic_tree(lambda s:s.also_sees(),cut_mark='...')) - [Synset('bound.a.01'), - [Synset('unfree.a.02'), - "Cycle(Synset('bound.a.01'),-3,...)", - [Synset('confined.a.02'), - [Synset('restricted.a.01'), - [Synset('classified.a.02')], - "Cycle(Synset('confined.a.02'),-5,...)", - "Cycle(Synset('unfree.a.02'),-5,...)"], - "Cycle(Synset('unfree.a.02'),-4,...)"], - [Synset('dependent.a.01'), "Cycle(Synset('unfree.a.02'),-4,...)"], - "Cycle(Synset('restricted.a.01'),-3,...)"]] - - -2.2 Better solution: mst() -.......................... - -A Minimum Spanning Tree (MST) spans all the nodes of a relation subgraph once, -while guaranteeing that each node is reached through the shortest path possible. -In unweighted relation graphs like WordNet, a MST can be computed very efficiently -in linear time, using Breadth-First Search (BFS). Like acyclic_tree(), the new -``unweighted_minimum_spanning_tree()`` function (imported in the Wordnet -module as ``mst``) handles intractable trees, such as the example discussed above: -``wn.synset('concrete.a.01').mst(lambda s:s.also_sees())``. 
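The breadth-first strategy behind ``unweighted_minimum_spanning_tree()`` can be
illustrated with a minimal sketch (this is not the NLTK implementation;
``bfs_spanning_tree`` and ``rel`` are hypothetical names, and ``rel`` stands for
any relation callable such as ``lambda s: s.also_sees()``)::

    from collections import deque

    def bfs_spanning_tree(root, rel):
        # Hypothetical helper, not part of NLTK: build the nested-list
        # spanning tree of ``rel`` rooted at ``root`` with a plain BFS,
        # so every reachable node appears exactly once, at its minimum depth.
        seen = {root}
        tree = [root]                      # nested-list shape used by tree()/mst()
        queue = deque([(root, tree)])
        while queue:
            node, branch = queue.popleft()
            for child in rel(node):
                if child in seen:          # prune cycles and already-reached nodes
                    continue
                seen.add(child)
                subtree = [child]
                branch.append(subtree)
                queue.append((child, subtree))
        return tree

Under these assumptions, ``bfs_spanning_tree(wn.synset('bound.a.01'), lambda s: s.also_sees())``
should reach each member of the small SCC through a shortest path, as ``mst()`` does in the
example below; only the ordering of sibling branches might differ.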
- -But, while the also_sees() acyclic_tree of 'bound.a.01' reaches -'classified.a.02' through four links, using depth-first search as seen above -(bound.a.01 > unfree.a.02 > confined.a.02 > restricted.a.01 > classified.a.02), -in the following MST, the path to 'classified.a.02' is the shortest possible, -consisting only in three links (bound.a.01 > unfree.a.02 > restricted.a.01 > -classified.a.02): - - >>> pprint(wn.synset('bound.a.01').mst(lambda s:s.also_sees())) - [Synset('bound.a.01'), - [Synset('unfree.a.02'), - [Synset('confined.a.02')], - [Synset('dependent.a.01')], - [Synset('restricted.a.01'), [Synset('classified.a.02')]]]] - - ----------------------------------------------------------------- -Loading alternative Wordnet versions ----------------------------------------------------------------- - - >>> print("Wordnet {}".format(wn.get_version())) - Wordnet 3.0 - - >>> from nltk.corpus import wordnet31 as wn31 - >>> print("Wordnet {}".format(wn31.get_version())) - Wordnet 3.1 - - >>> print(wn.synset('restrain.v.01').hyponyms()) - [Synset('confine.v.03'), Synset('control.v.02'), Synset('hold.v.36'), Synset('inhibit.v.04')] - - >>> print(wn31.synset('restrain.v.01').hyponyms()) - [Synset('enchain.v.01'), Synset('fetter.v.01'), Synset('ground.v.02'), Synset('impound.v.02'), Synset('pen_up.v.01'), Synset('pinion.v.01'), Synset('pound.v.06'), Synset('tie_down.v.01')] - - >>> print(wn31.synset('restrain.v.04').hyponyms()) - [Synset('baffle.v.03'), Synset('confine.v.02'), Synset('control.v.02'), Synset('hold.v.36'), Synset('rule.v.07'), Synset('swallow.v.06'), Synset('wink.v.04')] - - -------------- -Teardown test -------------- - - >>> from nltk.corpus import wordnet - >>> wordnet._unload() diff --git a/pipeline/nltk/test/wordnet_lch.doctest b/pipeline/nltk/test/wordnet_lch.doctest deleted file mode 100644 index 877626fe9236f494d8fa9fb4609925049103be2b..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/wordnet_lch.doctest +++ /dev/null @@ -1,53 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -=============================== -WordNet Lowest Common Hypernyms -=============================== - -Wordnet's lowest_common_hypernyms() method is based used to locate the -lowest single hypernym that is shared by two given words: - - >>> from nltk.corpus import wordnet as wn - >>> wn.synset('kin.n.01').lowest_common_hypernyms(wn.synset('mother.n.01')) - [Synset('relative.n.01')] - - >>> wn.synset('policeman.n.01').lowest_common_hypernyms(wn.synset('chef.n.01')) - [Synset('person.n.01')] - -This method generally returns a single result, but in some cases, more than one -valid LCH is possible: - - >>> wn.synset('body.n.09').lowest_common_hypernyms(wn.synset('sidereal_day.n.01')) - [Synset('attribute.n.02'), Synset('measure.n.02')] - -In some cases, lowest_common_hypernyms() can return one of the synsets which was -passed to it as an argument: - - >>> wn.synset('woman.n.01').lowest_common_hypernyms(wn.synset('girlfriend.n.02')) - [Synset('woman.n.01')] - -In NLTK 3.0a2 the behavior of lowest_common_hypernyms() was changed to give more -accurate results in a small set of cases, generally when dealing with nouns describing -social roles or jobs. 
To emulate the pre v3.0a2 behavior, you can set the use_min_depth=True -flag: - - >>> wn.synset('policeman.n.01').lowest_common_hypernyms(wn.synset('chef.n.01')) - [Synset('person.n.01')] - >>> wn.synset('policeman.n.01').lowest_common_hypernyms(wn.synset('chef.n.01'), use_min_depth=True) - [Synset('organism.n.01')] - -In some cases use_min_depth=True may return more or fewer results than the default -behavior: - - >>> wn.synset('woman.n.01').lowest_common_hypernyms(wn.synset('girlfriend.n.02')) - [Synset('woman.n.01')] - >>> wn.synset('woman.n.01').lowest_common_hypernyms(wn.synset('girlfriend.n.02'), use_min_depth=True) - [Synset('organism.n.01'), Synset('woman.n.01')] - -In the general case, however, they tend to return the same results: - - >>> wn.synset('body.n.09').lowest_common_hypernyms(wn.synset('sidereal_day.n.01')) - [Synset('attribute.n.02'), Synset('measure.n.02')] - >>> wn.synset('body.n.09').lowest_common_hypernyms(wn.synset('sidereal_day.n.01'), use_min_depth=True) - [Synset('attribute.n.02'), Synset('measure.n.02')] diff --git a/pipeline/nltk/test/wsd.doctest b/pipeline/nltk/test/wsd.doctest deleted file mode 100644 index e4445c51d038f01e73214f370c220471530f859d..0000000000000000000000000000000000000000 --- a/pipeline/nltk/test/wsd.doctest +++ /dev/null @@ -1,68 +0,0 @@ -.. Copyright (C) 2001-2023 NLTK Project -.. For license information, see LICENSE.TXT - -.. -*- coding: utf-8 -*- - -========================= -Word Sense Disambiguation -========================= - - -Lesk Algorithm --------------- - - -Performs the classic Lesk algorithm for Word Sense Disambiguation (WSD) using -a the definitions of the ambiguous word. - -Given an ambiguous word and the context in which the word occurs, Lesk returns -a Synset with the highest number of overlapping words between the context -sentence and different definitions from each Synset. - - >>> from nltk.wsd import lesk - >>> sent = ['I', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money', '.'] - - >>> print(lesk(sent, 'bank', 'n')) - Synset('savings_bank.n.02') - - >>> print(lesk(sent, 'bank')) - Synset('savings_bank.n.02') - -The definitions for "bank" are: - - >>> from nltk.corpus import wordnet as wn - >>> for ss in wn.synsets('bank'): - ... print(ss, ss.definition()) - ... 
- Synset('bank.n.01') sloping land (especially the slope beside a body of water) - Synset('depository_financial_institution.n.01') a financial institution that accepts deposits and channels the money into lending activities - Synset('bank.n.03') a long ridge or pile - Synset('bank.n.04') an arrangement of similar objects in a row or in tiers - Synset('bank.n.05') a supply or stock held in reserve for future use (especially in emergencies) - Synset('bank.n.06') the funds held by a gambling house or the dealer in some gambling games - Synset('bank.n.07') a slope in the turn of a road or track; the outside is higher than the inside in order to reduce the effects of centrifugal force - Synset('savings_bank.n.02') a container (usually with a slot in the top) for keeping money at home - Synset('bank.n.09') a building in which the business of banking transacted - Synset('bank.n.10') a flight maneuver; aircraft tips laterally about its longitudinal axis (especially in turning) - Synset('bank.v.01') tip laterally - Synset('bank.v.02') enclose with a bank - Synset('bank.v.03') do business with a bank or keep an account at a bank - Synset('bank.v.04') act as the banker in a game or in gambling - Synset('bank.v.05') be in the banking business - Synset('deposit.v.02') put into a bank account - Synset('bank.v.07') cover with ashes so to control the rate of burning - Synset('trust.v.01') have confidence or faith in - -Test disambiguation of POS tagged `able`. - - >>> [(s, s.pos()) for s in wn.synsets('able')] - [(Synset('able.a.01'), 'a'), (Synset('able.s.02'), 's'), (Synset('able.s.03'), 's'), (Synset('able.s.04'), 's')] - >>> sent = 'people should be able to marry a person of their choice'.split() - >>> lesk(sent, 'able') - Synset('able.s.04') - >>> lesk(sent, 'able', pos='a') - Synset('able.a.01') - -Test behavior if there is are no matching senses. - - >>> lesk('John loves Mary'.split(), 'loves', synsets=[]) diff --git a/pipeline/nltk/text.py b/pipeline/nltk/text.py deleted file mode 100644 index 85614dbd22f6a4afd388cf96cb5f8e3520883c16..0000000000000000000000000000000000000000 --- a/pipeline/nltk/text.py +++ /dev/null @@ -1,779 +0,0 @@ -# Natural Language Toolkit: Texts -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# Edward Loper -# URL: -# For license information, see LICENSE.TXT - -""" -This module brings together a variety of NLTK functionality for -text analysis, and provides simple, interactive interfaces. -Functionality includes: concordancing, collocation discovery, -regular expression search over tokenized strings, and -distributional similarity. -""" - -import re -import sys -from collections import Counter, defaultdict, namedtuple -from functools import reduce -from math import log - -from nltk.collocations import BigramCollocationFinder -from nltk.lm import MLE -from nltk.lm.preprocessing import padded_everygram_pipeline -from nltk.metrics import BigramAssocMeasures, f_measure -from nltk.probability import ConditionalFreqDist as CFD -from nltk.probability import FreqDist -from nltk.tokenize import sent_tokenize -from nltk.util import LazyConcatenation, tokenwrap - -ConcordanceLine = namedtuple( - "ConcordanceLine", - ["left", "query", "right", "offset", "left_print", "right_print", "line"], -) - - -class ContextIndex: - """ - A bidirectional index between words and their 'contexts' in a text. 
- The context of a word is usually defined to be the words that occur - in a fixed window around the word; but other definitions may also - be used by providing a custom context function. - """ - - @staticmethod - def _default_context(tokens, i): - """One left token and one right token, normalized to lowercase""" - left = tokens[i - 1].lower() if i != 0 else "*START*" - right = tokens[i + 1].lower() if i != len(tokens) - 1 else "*END*" - return (left, right) - - def __init__(self, tokens, context_func=None, filter=None, key=lambda x: x): - self._key = key - self._tokens = tokens - if context_func: - self._context_func = context_func - else: - self._context_func = self._default_context - if filter: - tokens = [t for t in tokens if filter(t)] - self._word_to_contexts = CFD( - (self._key(w), self._context_func(tokens, i)) for i, w in enumerate(tokens) - ) - self._context_to_words = CFD( - (self._context_func(tokens, i), self._key(w)) for i, w in enumerate(tokens) - ) - - def tokens(self): - """ - :rtype: list(str) - :return: The document that this context index was - created from. - """ - return self._tokens - - def word_similarity_dict(self, word): - """ - Return a dictionary mapping from words to 'similarity scores,' - indicating how often these two words occur in the same - context. - """ - word = self._key(word) - word_contexts = set(self._word_to_contexts[word]) - - scores = {} - for w, w_contexts in self._word_to_contexts.items(): - scores[w] = f_measure(word_contexts, set(w_contexts)) - - return scores - - def similar_words(self, word, n=20): - scores = defaultdict(int) - for c in self._word_to_contexts[self._key(word)]: - for w in self._context_to_words[c]: - if w != word: - scores[w] += ( - self._context_to_words[c][word] * self._context_to_words[c][w] - ) - return sorted(scores, key=scores.get, reverse=True)[:n] - - def common_contexts(self, words, fail_on_unknown=False): - """ - Find contexts where the specified words can all appear; and - return a frequency distribution mapping each context to the - number of times that context was used. - - :param words: The words used to seed the similarity search - :type words: str - :param fail_on_unknown: If true, then raise a value error if - any of the given words do not occur at all in the index. - """ - words = [self._key(w) for w in words] - contexts = [set(self._word_to_contexts[w]) for w in words] - empty = [words[i] for i in range(len(words)) if not contexts[i]] - common = reduce(set.intersection, contexts) - if empty and fail_on_unknown: - raise ValueError("The following word(s) were not found:", " ".join(words)) - elif not common: - # nothing in common -- just return an empty freqdist. - return FreqDist() - else: - fd = FreqDist( - c for w in words for c in self._word_to_contexts[w] if c in common - ) - return fd - - -class ConcordanceIndex: - """ - An index that can be used to look up the offset locations at which - a given word occurs in a document. - """ - - def __init__(self, tokens, key=lambda x: x): - """ - Construct a new concordance index. - - :param tokens: The document (list of tokens) that this - concordance index was created from. This list can be used - to access the context of a given word occurrence. - :param key: A function that maps each token to a normalized - version that will be used as a key in the index. E.g., if - you use ``key=lambda s:s.lower()``, then the index will be - case-insensitive. 
- """ - self._tokens = tokens - """The document (list of tokens) that this concordance index - was created from.""" - - self._key = key - """Function mapping each token to an index key (or None).""" - - self._offsets = defaultdict(list) - """Dictionary mapping words (or keys) to lists of offset indices.""" - # Initialize the index (self._offsets) - for index, word in enumerate(tokens): - word = self._key(word) - self._offsets[word].append(index) - - def tokens(self): - """ - :rtype: list(str) - :return: The document that this concordance index was - created from. - """ - return self._tokens - - def offsets(self, word): - """ - :rtype: list(int) - :return: A list of the offset positions at which the given - word occurs. If a key function was specified for the - index, then given word's key will be looked up. - """ - word = self._key(word) - return self._offsets[word] - - def __repr__(self): - return "" % ( - len(self._tokens), - len(self._offsets), - ) - - def find_concordance(self, word, width=80): - """ - Find all concordance lines given the query word. - - Provided with a list of words, these will be found as a phrase. - """ - if isinstance(word, list): - phrase = word - else: - phrase = [word] - - half_width = (width - len(" ".join(phrase)) - 2) // 2 - context = width // 4 # approx number of words of context - - # Find the instances of the word to create the ConcordanceLine - concordance_list = [] - offsets = self.offsets(phrase[0]) - for i, word in enumerate(phrase[1:]): - word_offsets = {offset - i - 1 for offset in self.offsets(word)} - offsets = sorted(word_offsets.intersection(offsets)) - if offsets: - for i in offsets: - query_word = " ".join(self._tokens[i : i + len(phrase)]) - # Find the context of query word. - left_context = self._tokens[max(0, i - context) : i] - right_context = self._tokens[i + len(phrase) : i + context] - # Create the pretty lines with the query_word in the middle. - left_print = " ".join(left_context)[-half_width:] - right_print = " ".join(right_context)[:half_width] - # The WYSIWYG line of the concordance. - line_print = " ".join([left_print, query_word, right_print]) - # Create the ConcordanceLine - concordance_line = ConcordanceLine( - left_context, - query_word, - right_context, - i, - left_print, - right_print, - line_print, - ) - concordance_list.append(concordance_line) - return concordance_list - - def print_concordance(self, word, width=80, lines=25): - """ - Print concordance lines given the query word. - :param word: The target word or phrase (a list of strings) - :type word: str or list - :param lines: The number of lines to display (default=25) - :type lines: int - :param width: The width of each line, in characters (default=80) - :type width: int - :param save: The option to save the concordance. - :type save: bool - """ - concordance_list = self.find_concordance(word, width=width) - - if not concordance_list: - print("no matches") - else: - lines = min(lines, len(concordance_list)) - print(f"Displaying {lines} of {len(concordance_list)} matches:") - for i, concordance_line in enumerate(concordance_list[:lines]): - print(concordance_line.line) - - -class TokenSearcher: - """ - A class that makes it easier to use regular expressions to search - over tokenized strings. The tokenized string is converted to a - string where tokens are marked with angle brackets -- e.g., - ``''``. 
The regular expression - passed to the ``findall()`` method is modified to treat angle - brackets as non-capturing parentheses, in addition to matching the - token boundaries; and to have ``'.'`` not match the angle brackets. - """ - - def __init__(self, tokens): - self._raw = "".join("<" + w + ">" for w in tokens) - - def findall(self, regexp): - """ - Find instances of the regular expression in the text. - The text is a list of tokens, and a regexp pattern to match - a single token must be surrounded by angle brackets. E.g. - - >>> from nltk.text import TokenSearcher - >>> from nltk.book import text1, text5, text9 - >>> text5.findall("<.*><.*>") - you rule bro; telling you bro; u twizted bro - >>> text1.findall("(<.*>)") - monied; nervous; dangerous; white; white; white; pious; queer; good; - mature; white; Cape; great; wise; wise; butterless; white; fiendish; - pale; furious; better; certain; complete; dismasted; younger; brave; - brave; brave; brave - >>> text9.findall("{3,}") - thread through those; the thought that; that the thing; the thing - that; that that thing; through these than through; them that the; - through the thick; them that they; thought that the - - :param regexp: A regular expression - :type regexp: str - """ - # preprocess the regular expression - regexp = re.sub(r"\s", "", regexp) - regexp = re.sub(r"<", "(?:<(?:", regexp) - regexp = re.sub(r">", ")>)", regexp) - regexp = re.sub(r"(?]", regexp) - - # perform the search - hits = re.findall(regexp, self._raw) - - # Sanity check - for h in hits: - if not h.startswith("<") and h.endswith(">"): - raise ValueError("Bad regexp for TokenSearcher.findall") - - # postprocess the output - hits = [h[1:-1].split("><") for h in hits] - return hits - - -class Text: - """ - A wrapper around a sequence of simple (string) tokens, which is - intended to support initial exploration of texts (via the - interactive console). Its methods perform a variety of analyses - on the text's contexts (e.g., counting, concordancing, collocation - discovery), and display the results. If you wish to write a - program which makes use of these analyses, then you should bypass - the ``Text`` class, and use the appropriate analysis function or - class directly instead. - - A ``Text`` is typically initialized from a given document or - corpus. E.g.: - - >>> import nltk.corpus - >>> from nltk.text import Text - >>> moby = Text(nltk.corpus.gutenberg.words('melville-moby_dick.txt')) - - """ - - # This defeats lazy loading, but makes things faster. This - # *shouldn't* be necessary because the corpus view *should* be - # doing intelligent caching, but without this it's running slow. - # Look into whether the caching is working correctly. - _COPY_TOKENS = True - - def __init__(self, tokens, name=None): - """ - Create a Text object. - - :param tokens: The source text. - :type tokens: sequence of str - """ - if self._COPY_TOKENS: - tokens = list(tokens) - self.tokens = tokens - - if name: - self.name = name - elif "]" in tokens[:20]: - end = tokens[:20].index("]") - self.name = " ".join(str(tok) for tok in tokens[1:end]) - else: - self.name = " ".join(str(tok) for tok in tokens[:8]) + "..." 
- - # //////////////////////////////////////////////////////////// - # Support item & slice access - # //////////////////////////////////////////////////////////// - - def __getitem__(self, i): - return self.tokens[i] - - def __len__(self): - return len(self.tokens) - - # //////////////////////////////////////////////////////////// - # Interactive console methods - # //////////////////////////////////////////////////////////// - - def concordance(self, word, width=79, lines=25): - """ - Prints a concordance for ``word`` with the specified context window. - Word matching is not case-sensitive. - - :param word: The target word or phrase (a list of strings) - :type word: str or list - :param width: The width of each line, in characters (default=80) - :type width: int - :param lines: The number of lines to display (default=25) - :type lines: int - - :seealso: ``ConcordanceIndex`` - """ - if "_concordance_index" not in self.__dict__: - self._concordance_index = ConcordanceIndex( - self.tokens, key=lambda s: s.lower() - ) - - return self._concordance_index.print_concordance(word, width, lines) - - def concordance_list(self, word, width=79, lines=25): - """ - Generate a concordance for ``word`` with the specified context window. - Word matching is not case-sensitive. - - :param word: The target word or phrase (a list of strings) - :type word: str or list - :param width: The width of each line, in characters (default=80) - :type width: int - :param lines: The number of lines to display (default=25) - :type lines: int - - :seealso: ``ConcordanceIndex`` - """ - if "_concordance_index" not in self.__dict__: - self._concordance_index = ConcordanceIndex( - self.tokens, key=lambda s: s.lower() - ) - return self._concordance_index.find_concordance(word, width)[:lines] - - def collocation_list(self, num=20, window_size=2): - """ - Return collocations derived from the text, ignoring stopwords. - - >>> from nltk.book import text4 - >>> text4.collocation_list()[:2] - [('United', 'States'), ('fellow', 'citizens')] - - :param num: The maximum number of collocations to return. - :type num: int - :param window_size: The number of tokens spanned by a collocation (default=2) - :type window_size: int - :rtype: list(tuple(str, str)) - """ - if not ( - "_collocations" in self.__dict__ - and self._num == num - and self._window_size == window_size - ): - self._num = num - self._window_size = window_size - - # print("Building collocations list") - from nltk.corpus import stopwords - - ignored_words = stopwords.words("english") - finder = BigramCollocationFinder.from_words(self.tokens, window_size) - finder.apply_freq_filter(2) - finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words) - bigram_measures = BigramAssocMeasures() - self._collocations = list( - finder.nbest(bigram_measures.likelihood_ratio, num) - ) - return self._collocations - - def collocations(self, num=20, window_size=2): - """ - Print collocations derived from the text, ignoring stopwords. - - >>> from nltk.book import text4 - >>> text4.collocations() # doctest: +NORMALIZE_WHITESPACE - United States; fellow citizens; years ago; four years; Federal - Government; General Government; American people; Vice President; God - bless; Chief Justice; one another; fellow Americans; Old World; - Almighty God; Fellow citizens; Chief Magistrate; every citizen; Indian - tribes; public debt; foreign nations - - - :param num: The maximum number of collocations to print. 
- :type num: int - :param window_size: The number of tokens spanned by a collocation (default=2) - :type window_size: int - """ - - collocation_strings = [ - w1 + " " + w2 for w1, w2 in self.collocation_list(num, window_size) - ] - print(tokenwrap(collocation_strings, separator="; ")) - - def count(self, word): - """ - Count the number of times this word appears in the text. - """ - return self.tokens.count(word) - - def index(self, word): - """ - Find the index of the first occurrence of the word in the text. - """ - return self.tokens.index(word) - - def readability(self, method): - # code from nltk_contrib.readability - raise NotImplementedError - - def similar(self, word, num=20): - """ - Distributional similarity: find other words which appear in the - same contexts as the specified word; list most similar words first. - - :param word: The word used to seed the similarity search - :type word: str - :param num: The number of words to generate (default=20) - :type num: int - :seealso: ContextIndex.similar_words() - """ - if "_word_context_index" not in self.__dict__: - # print('Building word-context index...') - self._word_context_index = ContextIndex( - self.tokens, filter=lambda x: x.isalpha(), key=lambda s: s.lower() - ) - - # words = self._word_context_index.similar_words(word, num) - - word = word.lower() - wci = self._word_context_index._word_to_contexts - if word in wci.conditions(): - contexts = set(wci[word]) - fd = Counter( - w - for w in wci.conditions() - for c in wci[w] - if c in contexts and not w == word - ) - words = [w for w, _ in fd.most_common(num)] - print(tokenwrap(words)) - else: - print("No matches") - - def common_contexts(self, words, num=20): - """ - Find contexts where the specified words appear; list - most frequent common contexts first. - - :param words: The words used to seed the similarity search - :type words: str - :param num: The number of words to generate (default=20) - :type num: int - :seealso: ContextIndex.common_contexts() - """ - if "_word_context_index" not in self.__dict__: - # print('Building word-context index...') - self._word_context_index = ContextIndex( - self.tokens, key=lambda s: s.lower() - ) - - try: - fd = self._word_context_index.common_contexts(words, True) - if not fd: - print("No common contexts were found") - else: - ranked_contexts = [w for w, _ in fd.most_common(num)] - print(tokenwrap(w1 + "_" + w2 for w1, w2 in ranked_contexts)) - - except ValueError as e: - print(e) - - def dispersion_plot(self, words): - """ - Produce a plot showing the distribution of the words through the text. - Requires pylab to be installed. - - :param words: The words to be plotted - :type words: list(str) - :seealso: nltk.draw.dispersion_plot() - """ - from nltk.draw import dispersion_plot - - dispersion_plot(self, words) - - def _train_default_ngram_lm(self, tokenized_sents, n=3): - train_data, padded_sents = padded_everygram_pipeline(n, tokenized_sents) - model = MLE(order=n) - model.fit(train_data, padded_sents) - return model - - def generate(self, length=100, text_seed=None, random_seed=42): - """ - Print random text, generated using a trigram language model. - See also `help(nltk.lm)`. - - :param length: The length of text to generate (default=100) - :type length: int - - :param text_seed: Generation can be conditioned on preceding context. - :type text_seed: list(str) - - :param random_seed: A random seed or an instance of `random.Random`. If provided, - makes the random sampling part of generation reproducible. 
(default=42) - :type random_seed: int - """ - # Create the model when using it the first time. - self._tokenized_sents = [ - sent.split(" ") for sent in sent_tokenize(" ".join(self.tokens)) - ] - if not hasattr(self, "_trigram_model"): - print("Building ngram index...", file=sys.stderr) - self._trigram_model = self._train_default_ngram_lm( - self._tokenized_sents, n=3 - ) - - generated_tokens = [] - - assert length > 0, "The `length` must be more than 0." - while len(generated_tokens) < length: - for idx, token in enumerate( - self._trigram_model.generate( - length, text_seed=text_seed, random_seed=random_seed - ) - ): - if token == "": - continue - if token == "": - break - generated_tokens.append(token) - random_seed += 1 - - prefix = " ".join(text_seed) + " " if text_seed else "" - output_str = prefix + tokenwrap(generated_tokens[:length]) - print(output_str) - return output_str - - def plot(self, *args): - """ - See documentation for FreqDist.plot() - :seealso: nltk.prob.FreqDist.plot() - """ - return self.vocab().plot(*args) - - def vocab(self): - """ - :seealso: nltk.prob.FreqDist - """ - if "_vocab" not in self.__dict__: - # print("Building vocabulary index...") - self._vocab = FreqDist(self) - return self._vocab - - def findall(self, regexp): - """ - Find instances of the regular expression in the text. - The text is a list of tokens, and a regexp pattern to match - a single token must be surrounded by angle brackets. E.g. - - >>> from nltk.book import text1, text5, text9 - >>> text5.findall("<.*><.*>") - you rule bro; telling you bro; u twizted bro - >>> text1.findall("(<.*>)") - monied; nervous; dangerous; white; white; white; pious; queer; good; - mature; white; Cape; great; wise; wise; butterless; white; fiendish; - pale; furious; better; certain; complete; dismasted; younger; brave; - brave; brave; brave - >>> text9.findall("{3,}") - thread through those; the thought that; that the thing; the thing - that; that that thing; through these than through; them that the; - through the thick; them that they; thought that the - - :param regexp: A regular expression - :type regexp: str - """ - - if "_token_searcher" not in self.__dict__: - self._token_searcher = TokenSearcher(self) - - hits = self._token_searcher.findall(regexp) - hits = [" ".join(h) for h in hits] - print(tokenwrap(hits, "; ")) - - # //////////////////////////////////////////////////////////// - # Helper Methods - # //////////////////////////////////////////////////////////// - - _CONTEXT_RE = re.compile(r"\w+|[\.\!\?]") - - def _context(self, tokens, i): - """ - One left & one right token, both case-normalized. Skip over - non-sentence-final punctuation. Used by the ``ContextIndex`` - that is created for ``similar()`` and ``common_contexts()``. 
- """ - # Left context - j = i - 1 - while j >= 0 and not self._CONTEXT_RE.match(tokens[j]): - j -= 1 - left = tokens[j] if j != 0 else "*START*" - - # Right context - j = i + 1 - while j < len(tokens) and not self._CONTEXT_RE.match(tokens[j]): - j += 1 - right = tokens[j] if j != len(tokens) else "*END*" - - return (left, right) - - # //////////////////////////////////////////////////////////// - # String Display - # //////////////////////////////////////////////////////////// - - def __str__(self): - return "" % self.name - - def __repr__(self): - return "" % self.name - - -# Prototype only; this approach will be slow to load -class TextCollection(Text): - """A collection of texts, which can be loaded with list of texts, or - with a corpus consisting of one or more texts, and which supports - counting, concordancing, collocation discovery, etc. Initialize a - TextCollection as follows: - - >>> import nltk.corpus - >>> from nltk.text import TextCollection - >>> from nltk.book import text1, text2, text3 - >>> gutenberg = TextCollection(nltk.corpus.gutenberg) - >>> mytexts = TextCollection([text1, text2, text3]) - - Iterating over a TextCollection produces all the tokens of all the - texts in order. - """ - - def __init__(self, source): - if hasattr(source, "words"): # bridge to the text corpus reader - source = [source.words(f) for f in source.fileids()] - - self._texts = source - Text.__init__(self, LazyConcatenation(source)) - self._idf_cache = {} - - def tf(self, term, text): - """The frequency of the term in text.""" - return text.count(term) / len(text) - - def idf(self, term): - """The number of texts in the corpus divided by the - number of texts that the term appears in. - If a term does not appear in the corpus, 0.0 is returned.""" - # idf values are cached for performance. 
- idf = self._idf_cache.get(term) - if idf is None: - matches = len([True for text in self._texts if term in text]) - if len(self._texts) == 0: - raise ValueError("IDF undefined for empty document collection") - idf = log(len(self._texts) / matches) if matches else 0.0 - self._idf_cache[term] = idf - return idf - - def tf_idf(self, term, text): - return self.tf(term, text) * self.idf(term) - - -def demo(): - from nltk.corpus import brown - - text = Text(brown.words(categories="news")) - print(text) - print() - print("Concordance:") - text.concordance("news") - print() - print("Distributionally similar words:") - text.similar("news") - print() - print("Collocations:") - text.collocations() - print() - # print("Automatically generated text:") - # text.generate() - # print() - print("Dispersion plot:") - text.dispersion_plot(["news", "report", "said", "announced"]) - print() - print("Vocabulary plot:") - text.plot(50) - print() - print("Indexing:") - print("text[3]:", text[3]) - print("text[3:5]:", text[3:5]) - print("text.vocab()['news']:", text.vocab()["news"]) - - -if __name__ == "__main__": - demo() - -__all__ = [ - "ContextIndex", - "ConcordanceIndex", - "TokenSearcher", - "Text", - "TextCollection", -] diff --git a/pipeline/nltk/tgrep.py b/pipeline/nltk/tgrep.py deleted file mode 100644 index 45ce2ab92629296fc52931ff12720d62aab939cf..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tgrep.py +++ /dev/null @@ -1,1039 +0,0 @@ -#!/usr/bin/env python -# -# Natural Language Toolkit: TGrep search -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Will Roberts -# URL: -# For license information, see LICENSE.TXT - -""" -============================================ - TGrep search implementation for NLTK trees -============================================ - -This module supports TGrep2 syntax for matching parts of NLTK Trees. -Note that many tgrep operators require the tree passed to be a -``ParentedTree``. - -External links: - -- `Tgrep tutorial `_ -- `Tgrep2 manual `_ -- `Tgrep2 source `_ - -Usage -===== - ->>> from nltk.tree import ParentedTree ->>> from nltk.tgrep import tgrep_nodes, tgrep_positions ->>> tree = ParentedTree.fromstring('(S (NP (DT the) (JJ big) (NN dog)) (VP bit) (NP (DT a) (NN cat)))') ->>> list(tgrep_nodes('NN', [tree])) -[[ParentedTree('NN', ['dog']), ParentedTree('NN', ['cat'])]] ->>> list(tgrep_positions('NN', [tree])) -[[(0, 2), (2, 1)]] ->>> list(tgrep_nodes('DT', [tree])) -[[ParentedTree('DT', ['the']), ParentedTree('DT', ['a'])]] ->>> list(tgrep_nodes('DT $ JJ', [tree])) -[[ParentedTree('DT', ['the'])]] - -This implementation adds syntax to select nodes based on their NLTK -tree position. This syntax is ``N`` plus a Python tuple representing -the tree position. For instance, ``N()``, ``N(0,)``, ``N(0,0)`` are -valid node selectors. Example: - ->>> tree = ParentedTree.fromstring('(S (NP (DT the) (JJ big) (NN dog)) (VP bit) (NP (DT a) (NN cat)))') ->>> tree[0,0] -ParentedTree('DT', ['the']) ->>> tree[0,0].treeposition() -(0, 0) ->>> list(tgrep_nodes('N(0,0)', [tree])) -[[ParentedTree('DT', ['the'])]] - -Caveats: -======== - -- Link modifiers: "?" and "=" are not implemented. -- Tgrep compatibility: Using "@" for "!", "{" for "<", "}" for ">" are - not implemented. -- The "=" and "~" links are not implemented. - -Known Issues: -============= - -- There are some issues with link relations involving leaf nodes - (which are represented as bare strings in NLTK trees). 
For - instance, consider the tree:: - - (S (A x)) - - The search string ``* !>> S`` should select all nodes which are not - dominated in some way by an ``S`` node (i.e., all nodes which are - not descendants of an ``S``). Clearly, in this tree, the only node - which fulfills this criterion is the top node (since it is not - dominated by anything). However, the code here will find both the - top node and the leaf node ``x``. This is because we cannot recover - the parent of the leaf, since it is stored as a bare string. - - A possible workaround, when performing this kind of search, would be - to filter out all leaf nodes. - -Implementation notes -==================== - -This implementation is (somewhat awkwardly) based on lambda functions -which are predicates on a node. A predicate is a function which is -either True or False; using a predicate function, we can identify sets -of nodes with particular properties. A predicate function, could, for -instance, return True only if a particular node has a label matching a -particular regular expression, and has a daughter node which has no -sisters. Because tgrep2 search strings can do things statefully (such -as substituting in macros, and binding nodes with node labels), the -actual predicate function is declared with three arguments:: - - pred = lambda n, m, l: return True # some logic here - -``n`` - is a node in a tree; this argument must always be given - -``m`` - contains a dictionary, mapping macro names onto predicate functions - -``l`` - is a dictionary to map node labels onto nodes in the tree - -``m`` and ``l`` are declared to default to ``None``, and so need not be -specified in a call to a predicate. Predicates which call other -predicates must always pass the value of these arguments on. The -top-level predicate (constructed by ``_tgrep_exprs_action``) binds the -macro definitions to ``m`` and initialises ``l`` to an empty dictionary. -""" - -import functools -import re - -try: - import pyparsing -except ImportError: - print("Warning: nltk.tgrep will not work without the `pyparsing` package") - print("installed.") - -import nltk.tree - - -class TgrepException(Exception): - """Tgrep exception type.""" - - pass - - -def ancestors(node): - """ - Returns the list of all nodes dominating the given tree node. - This method will not work with leaf nodes, since there is no way - to recover the parent. - """ - results = [] - try: - current = node.parent() - except AttributeError: - # if node is a leaf, we cannot retrieve its parent - return results - while current: - results.append(current) - current = current.parent() - return results - - -def unique_ancestors(node): - """ - Returns the list of all nodes dominating the given node, where - there is only a single path of descent. - """ - results = [] - try: - current = node.parent() - except AttributeError: - # if node is a leaf, we cannot retrieve its parent - return results - while current and len(current) == 1: - results.append(current) - current = current.parent() - return results - - -def _descendants(node): - """ - Returns the list of all nodes which are descended from the given - tree node in some way. - """ - try: - treepos = node.treepositions() - except AttributeError: - return [] - return [node[x] for x in treepos[1:]] - - -def _leftmost_descendants(node): - """ - Returns the set of all nodes descended in some way through - left branches from this node. 
- """ - try: - treepos = node.treepositions() - except AttributeError: - return [] - return [node[x] for x in treepos[1:] if all(y == 0 for y in x)] - - -def _rightmost_descendants(node): - """ - Returns the set of all nodes descended in some way through - right branches from this node. - """ - try: - rightmost_leaf = max(node.treepositions()) - except AttributeError: - return [] - return [node[rightmost_leaf[:i]] for i in range(1, len(rightmost_leaf) + 1)] - - -def _istree(obj): - """Predicate to check whether `obj` is a nltk.tree.Tree.""" - return isinstance(obj, nltk.tree.Tree) - - -def _unique_descendants(node): - """ - Returns the list of all nodes descended from the given node, where - there is only a single path of descent. - """ - results = [] - current = node - while current and _istree(current) and len(current) == 1: - current = current[0] - results.append(current) - return results - - -def _before(node): - """ - Returns the set of all nodes that are before the given node. - """ - try: - pos = node.treeposition() - tree = node.root() - except AttributeError: - return [] - return [tree[x] for x in tree.treepositions() if x[: len(pos)] < pos[: len(x)]] - - -def _immediately_before(node): - """ - Returns the set of all nodes that are immediately before the given - node. - - Tree node A immediately precedes node B if the last terminal - symbol (word) produced by A immediately precedes the first - terminal symbol produced by B. - """ - try: - pos = node.treeposition() - tree = node.root() - except AttributeError: - return [] - # go "upwards" from pos until there is a place we can go to the left - idx = len(pos) - 1 - while 0 <= idx and pos[idx] == 0: - idx -= 1 - if idx < 0: - return [] - pos = list(pos[: idx + 1]) - pos[-1] -= 1 - before = tree[pos] - return [before] + _rightmost_descendants(before) - - -def _after(node): - """ - Returns the set of all nodes that are after the given node. - """ - try: - pos = node.treeposition() - tree = node.root() - except AttributeError: - return [] - return [tree[x] for x in tree.treepositions() if x[: len(pos)] > pos[: len(x)]] - - -def _immediately_after(node): - """ - Returns the set of all nodes that are immediately after the given - node. - - Tree node A immediately follows node B if the first terminal - symbol (word) produced by A immediately follows the last - terminal symbol produced by B. - """ - try: - pos = node.treeposition() - tree = node.root() - current = node.parent() - except AttributeError: - return [] - # go "upwards" from pos until there is a place we can go to the - # right - idx = len(pos) - 1 - while 0 <= idx and pos[idx] == len(current) - 1: - idx -= 1 - current = current.parent() - if idx < 0: - return [] - pos = list(pos[: idx + 1]) - pos[-1] += 1 - after = tree[pos] - return [after] + _leftmost_descendants(after) - - -def _tgrep_node_literal_value(node): - """ - Gets the string value of a given parse tree node, for comparison - using the tgrep node literal predicates. - """ - return node.label() if _istree(node) else str(node) - - -def _tgrep_macro_use_action(_s, _l, tokens): - """ - Builds a lambda function which looks up the macro name used. 
- """ - assert len(tokens) == 1 - assert tokens[0][0] == "@" - macro_name = tokens[0][1:] - - def macro_use(n, m=None, l=None): - if m is None or macro_name not in m: - raise TgrepException(f"macro {macro_name} not defined") - return m[macro_name](n, m, l) - - return macro_use - - -def _tgrep_node_action(_s, _l, tokens): - """ - Builds a lambda function representing a predicate on a tree node - depending on the name of its node. - """ - if tokens[0] == "'": - # strip initial apostrophe (tgrep2 print command) - tokens = tokens[1:] - if len(tokens) > 1: - # disjunctive definition of a node name - assert list(set(tokens[1::2])) == ["|"] - # recursively call self to interpret each node name definition - tokens = [_tgrep_node_action(None, None, [node]) for node in tokens[::2]] - # capture tokens and return the disjunction - return (lambda t: lambda n, m=None, l=None: any(f(n, m, l) for f in t))(tokens) - else: - if hasattr(tokens[0], "__call__"): - # this is a previously interpreted parenthetical node - # definition (lambda function) - return tokens[0] - elif tokens[0] == "*" or tokens[0] == "__": - return lambda n, m=None, l=None: True - elif tokens[0].startswith('"'): - assert tokens[0].endswith('"') - node_lit = tokens[0][1:-1].replace('\\"', '"').replace("\\\\", "\\") - return ( - lambda s: lambda n, m=None, l=None: _tgrep_node_literal_value(n) == s - )(node_lit) - elif tokens[0].startswith("/"): - assert tokens[0].endswith("/") - node_lit = tokens[0][1:-1] - return ( - lambda r: lambda n, m=None, l=None: r.search( - _tgrep_node_literal_value(n) - ) - )(re.compile(node_lit)) - elif tokens[0].startswith("i@"): - node_func = _tgrep_node_action(_s, _l, [tokens[0][2:].lower()]) - return ( - lambda f: lambda n, m=None, l=None: f( - _tgrep_node_literal_value(n).lower() - ) - )(node_func) - else: - return ( - lambda s: lambda n, m=None, l=None: _tgrep_node_literal_value(n) == s - )(tokens[0]) - - -def _tgrep_parens_action(_s, _l, tokens): - """ - Builds a lambda function representing a predicate on a tree node - from a parenthetical notation. - """ - assert len(tokens) == 3 - assert tokens[0] == "(" - assert tokens[2] == ")" - return tokens[1] - - -def _tgrep_nltk_tree_pos_action(_s, _l, tokens): - """ - Builds a lambda function representing a predicate on a tree node - which returns true if the node is located at a specific tree - position. - """ - # recover the tuple from the parsed string - node_tree_position = tuple(int(x) for x in tokens if x.isdigit()) - # capture the node's tree position - return ( - lambda i: lambda n, m=None, l=None: ( - hasattr(n, "treeposition") and n.treeposition() == i - ) - )(node_tree_position) - - -def _tgrep_relation_action(_s, _l, tokens): - """ - Builds a lambda function representing a predicate on a tree node - depending on its relation to other nodes in the tree. - """ - # process negation first if needed - negated = False - if tokens[0] == "!": - negated = True - tokens = tokens[1:] - if tokens[0] == "[": - # process square-bracketed relation expressions - assert len(tokens) == 3 - assert tokens[2] == "]" - retval = tokens[1] - else: - # process operator-node relation expressions - assert len(tokens) == 2 - operator, predicate = tokens - # A < B A is the parent of (immediately dominates) B. - if operator == "<": - retval = lambda n, m=None, l=None: ( - _istree(n) and any(predicate(x, m, l) for x in n) - ) - # A > B A is the child of B. 
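The repeated ``(lambda s: lambda n, m=None, l=None: ...)(value)`` shape above is deliberate: an inner lambda alone would bind the enclosing variable late and see only its final value, so the immediately-applied outer lambda freezes the value each predicate was built from. A standalone illustration:

labels = ["NP", "VP"]

# late binding: both predicates end up comparing against the final value "VP"
naive = [lambda n, m=None, l=None: n == lab for lab in labels]

# the idiom used above: the outer call captures the current value as ``s``
frozen = [(lambda s: lambda n, m=None, l=None: n == s)(lab) for lab in labels]

print([p("NP") for p in naive])    # [False, False]
print([p("NP") for p in frozen])   # [True, False]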
- elif operator == ">": - retval = lambda n, m=None, l=None: ( - hasattr(n, "parent") - and bool(n.parent()) - and predicate(n.parent(), m, l) - ) - # A <, B Synonymous with A <1 B. - elif operator == "<," or operator == "<1": - retval = lambda n, m=None, l=None: ( - _istree(n) and bool(list(n)) and predicate(n[0], m, l) - ) - # A >, B Synonymous with A >1 B. - elif operator == ">," or operator == ">1": - retval = lambda n, m=None, l=None: ( - hasattr(n, "parent") - and bool(n.parent()) - and (n is n.parent()[0]) - and predicate(n.parent(), m, l) - ) - # A N B A is the Nth child of B (the first child is >1). - elif operator[0] == ">" and operator[1:].isdigit(): - idx = int(operator[1:]) - # capture the index parameter - retval = ( - lambda i: lambda n, m=None, l=None: ( - hasattr(n, "parent") - and bool(n.parent()) - and 0 <= i < len(n.parent()) - and (n is n.parent()[i]) - and predicate(n.parent(), m, l) - ) - )(idx - 1) - # A <' B B is the last child of A (also synonymous with A <-1 B). - # A <- B B is the last child of A (synonymous with A <-1 B). - elif operator == "<'" or operator == "<-" or operator == "<-1": - retval = lambda n, m=None, l=None: ( - _istree(n) and bool(list(n)) and predicate(n[-1], m, l) - ) - # A >' B A is the last child of B (also synonymous with A >-1 B). - # A >- B A is the last child of B (synonymous with A >-1 B). - elif operator == ">'" or operator == ">-" or operator == ">-1": - retval = lambda n, m=None, l=None: ( - hasattr(n, "parent") - and bool(n.parent()) - and (n is n.parent()[-1]) - and predicate(n.parent(), m, l) - ) - # A <-N B B is the N th-to-last child of A (the last child is <-1). - elif operator[:2] == "<-" and operator[2:].isdigit(): - idx = -int(operator[2:]) - # capture the index parameter - retval = ( - lambda i: lambda n, m=None, l=None: ( - _istree(n) - and bool(list(n)) - and 0 <= (i + len(n)) < len(n) - and predicate(n[i + len(n)], m, l) - ) - )(idx) - # A >-N B A is the N th-to-last child of B (the last child is >-1). - elif operator[:2] == ">-" and operator[2:].isdigit(): - idx = -int(operator[2:]) - # capture the index parameter - retval = ( - lambda i: lambda n, m=None, l=None: ( - hasattr(n, "parent") - and bool(n.parent()) - and 0 <= (i + len(n.parent())) < len(n.parent()) - and (n is n.parent()[i + len(n.parent())]) - and predicate(n.parent(), m, l) - ) - )(idx) - # A <: B B is the only child of A - elif operator == "<:": - retval = lambda n, m=None, l=None: ( - _istree(n) and len(n) == 1 and predicate(n[0], m, l) - ) - # A >: B A is the only child of B. - elif operator == ">:": - retval = lambda n, m=None, l=None: ( - hasattr(n, "parent") - and bool(n.parent()) - and len(n.parent()) == 1 - and predicate(n.parent(), m, l) - ) - # A << B A dominates B (A is an ancestor of B). - elif operator == "<<": - retval = lambda n, m=None, l=None: ( - _istree(n) and any(predicate(x, m, l) for x in _descendants(n)) - ) - # A >> B A is dominated by B (A is a descendant of B). - elif operator == ">>": - retval = lambda n, m=None, l=None: any( - predicate(x, m, l) for x in ancestors(n) - ) - # A <<, B B is a left-most descendant of A. - elif operator == "<<," or operator == "<<1": - retval = lambda n, m=None, l=None: ( - _istree(n) and any(predicate(x, m, l) for x in _leftmost_descendants(n)) - ) - # A >>, B A is a left-most descendant of B. - elif operator == ">>,": - retval = lambda n, m=None, l=None: any( - (predicate(x, m, l) and n in _leftmost_descendants(x)) - for x in ancestors(n) - ) - # A <<' B B is a right-most descendant of A. 
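A usage sketch for the dominance and child-position operators handled above (same ``nltk.tgrep`` importability assumption); the last line also shows the ``search_leaves=False`` workaround for the leaf caveat noted in the module docstring:

import nltk.tgrep
from nltk.tree import ParentedTree

t = ParentedTree.fromstring(
    "(S (NP (DT the) (NN dog)) (VP (VBD chased) (NP (DT a) (NN cat))))")
print(list(nltk.tgrep.tgrep_positions("NP < NN", [t])))   # NPs with an NN child: [[(0,), (1, 1)]]
print(list(nltk.tgrep.tgrep_positions("NN >2 NP", [t])))  # NNs that are the second child of an NP
print(list(nltk.tgrep.tgrep_positions("VBD >, VP", [t]))) # VBDs that are the first child of a VP
# leaves are bare strings, so '* !>> S' would also match them;
# search_leaves=False restricts matching to internal nodes
print(list(nltk.tgrep.tgrep_positions("* !>> S", [t], search_leaves=False)))  # [[()]]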
- elif operator == "<<'": - retval = lambda n, m=None, l=None: ( - _istree(n) - and any(predicate(x, m, l) for x in _rightmost_descendants(n)) - ) - # A >>' B A is a right-most descendant of B. - elif operator == ">>'": - retval = lambda n, m=None, l=None: any( - (predicate(x, m, l) and n in _rightmost_descendants(x)) - for x in ancestors(n) - ) - # A <<: B There is a single path of descent from A and B is on it. - elif operator == "<<:": - retval = lambda n, m=None, l=None: ( - _istree(n) and any(predicate(x, m, l) for x in _unique_descendants(n)) - ) - # A >>: B There is a single path of descent from B and A is on it. - elif operator == ">>:": - retval = lambda n, m=None, l=None: any( - predicate(x, m, l) for x in unique_ancestors(n) - ) - # A . B A immediately precedes B. - elif operator == ".": - retval = lambda n, m=None, l=None: any( - predicate(x, m, l) for x in _immediately_after(n) - ) - # A , B A immediately follows B. - elif operator == ",": - retval = lambda n, m=None, l=None: any( - predicate(x, m, l) for x in _immediately_before(n) - ) - # A .. B A precedes B. - elif operator == "..": - retval = lambda n, m=None, l=None: any( - predicate(x, m, l) for x in _after(n) - ) - # A ,, B A follows B. - elif operator == ",,": - retval = lambda n, m=None, l=None: any( - predicate(x, m, l) for x in _before(n) - ) - # A $ B A is a sister of B (and A != B). - elif operator == "$" or operator == "%": - retval = lambda n, m=None, l=None: ( - hasattr(n, "parent") - and bool(n.parent()) - and any(predicate(x, m, l) for x in n.parent() if x is not n) - ) - # A $. B A is a sister of and immediately precedes B. - elif operator == "$." or operator == "%.": - retval = lambda n, m=None, l=None: ( - hasattr(n, "right_sibling") - and bool(n.right_sibling()) - and predicate(n.right_sibling(), m, l) - ) - # A $, B A is a sister of and immediately follows B. - elif operator == "$," or operator == "%,": - retval = lambda n, m=None, l=None: ( - hasattr(n, "left_sibling") - and bool(n.left_sibling()) - and predicate(n.left_sibling(), m, l) - ) - # A $.. B A is a sister of and precedes B. - elif operator == "$.." or operator == "%..": - retval = lambda n, m=None, l=None: ( - hasattr(n, "parent") - and hasattr(n, "parent_index") - and bool(n.parent()) - and any(predicate(x, m, l) for x in n.parent()[n.parent_index() + 1 :]) - ) - # A $,, B A is a sister of and follows B. - elif operator == "$,," or operator == "%,,": - retval = lambda n, m=None, l=None: ( - hasattr(n, "parent") - and hasattr(n, "parent_index") - and bool(n.parent()) - and any(predicate(x, m, l) for x in n.parent()[: n.parent_index()]) - ) - else: - raise TgrepException(f'cannot interpret tgrep operator "{operator}"') - # now return the built function - if negated: - return (lambda r: (lambda n, m=None, l=None: not r(n, m, l)))(retval) - else: - return retval - - -def _tgrep_conjunction_action(_s, _l, tokens, join_char="&"): - """ - Builds a lambda function representing a predicate on a tree node - from the conjunction of several other such lambda functions. - - This is prototypically called for expressions like - (`tgrep_rel_conjunction`):: - - < NP & < AP < VP - - where tokens is a list of predicates representing the relations - (`< NP`, `< AP`, and `< VP`), possibly with the character `&` - included (as in the example here). - - This is also called for expressions like (`tgrep_node_expr2`):: - - NP < NN - S=s < /NP/=n : s < /VP/=v : n .. 
v - - tokens[0] is a tgrep_expr predicate; tokens[1:] are an (optional) - list of segmented patterns (`tgrep_expr_labeled`, processed by - `_tgrep_segmented_pattern_action`). - """ - # filter out the ampersand - tokens = [x for x in tokens if x != join_char] - if len(tokens) == 1: - return tokens[0] - else: - return ( - lambda ts: lambda n, m=None, l=None: all( - predicate(n, m, l) for predicate in ts - ) - )(tokens) - - -def _tgrep_segmented_pattern_action(_s, _l, tokens): - """ - Builds a lambda function representing a segmented pattern. - - Called for expressions like (`tgrep_expr_labeled`):: - - =s .. =v < =n - - This is a segmented pattern, a tgrep2 expression which begins with - a node label. - - The problem is that for segemented_pattern_action (': =v < =s'), - the first element (in this case, =v) is specifically selected by - virtue of matching a particular node in the tree; to retrieve - the node, we need the label, not a lambda function. For node - labels inside a tgrep_node_expr, we need a lambda function which - returns true if the node visited is the same as =v. - - We solve this by creating two copies of a node_label_use in the - grammar; the label use inside a tgrep_expr_labeled has a separate - parse action to the pred use inside a node_expr. See - `_tgrep_node_label_use_action` and - `_tgrep_node_label_pred_use_action`. - """ - # tokens[0] is a string containing the node label - node_label = tokens[0] - # tokens[1:] is an (optional) list of predicates which must all - # hold of the bound node - reln_preds = tokens[1:] - - def pattern_segment_pred(n, m=None, l=None): - """This predicate function ignores its node argument.""" - # look up the bound node using its label - if l is None or node_label not in l: - raise TgrepException(f"node_label ={node_label} not bound in pattern") - node = l[node_label] - # match the relation predicates against the node - return all(pred(node, m, l) for pred in reln_preds) - - return pattern_segment_pred - - -def _tgrep_node_label_use_action(_s, _l, tokens): - """ - Returns the node label used to begin a tgrep_expr_labeled. See - `_tgrep_segmented_pattern_action`. - - Called for expressions like (`tgrep_node_label_use`):: - - =s - - when they appear as the first element of a `tgrep_expr_labeled` - expression (see `_tgrep_segmented_pattern_action`). - - It returns the node label. - """ - assert len(tokens) == 1 - assert tokens[0].startswith("=") - return tokens[0][1:] - - -def _tgrep_node_label_pred_use_action(_s, _l, tokens): - """ - Builds a lambda function representing a predicate on a tree node - which describes the use of a previously bound node label. - - Called for expressions like (`tgrep_node_label_use_pred`):: - - =s - - when they appear inside a tgrep_node_expr (for example, inside a - relation). The predicate returns true if and only if its node - argument is identical the the node looked up in the node label - dictionary using the node's label. 
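A short sketch tying the two label actions together (same importability assumption): the first segment binds the matching node to a label, and the ``:``-separated segment then constrains that same node.

import nltk.tgrep
from nltk.tree import ParentedTree

t = ParentedTree.fromstring("(S (NP (NN dog)) (VP (VBD barked)))")
# bind the NP that dominates an NN to the label "np", then require,
# in a second segment, that the very same node is a child of S
pred = nltk.tgrep.tgrep_compile("NP=np < NN : =np > S")
print([pos for pos in t.treepositions() if pred(t[pos])])   # [(0,)]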
- """ - assert len(tokens) == 1 - assert tokens[0].startswith("=") - node_label = tokens[0][1:] - - def node_label_use_pred(n, m=None, l=None): - # look up the bound node using its label - if l is None or node_label not in l: - raise TgrepException(f"node_label ={node_label} not bound in pattern") - node = l[node_label] - # truth means the given node is this node - return n is node - - return node_label_use_pred - - -def _tgrep_bind_node_label_action(_s, _l, tokens): - """ - Builds a lambda function representing a predicate on a tree node - which can optionally bind a matching node into the tgrep2 string's - label_dict. - - Called for expressions like (`tgrep_node_expr2`):: - - /NP/ - @NP=n - """ - # tokens[0] is a tgrep_node_expr - if len(tokens) == 1: - return tokens[0] - else: - # if present, tokens[1] is the character '=', and tokens[2] is - # a tgrep_node_label, a string value containing the node label - assert len(tokens) == 3 - assert tokens[1] == "=" - node_pred = tokens[0] - node_label = tokens[2] - - def node_label_bind_pred(n, m=None, l=None): - if node_pred(n, m, l): - # bind `n` into the dictionary `l` - if l is None: - raise TgrepException( - "cannot bind node_label {}: label_dict is None".format( - node_label - ) - ) - l[node_label] = n - return True - else: - return False - - return node_label_bind_pred - - -def _tgrep_rel_disjunction_action(_s, _l, tokens): - """ - Builds a lambda function representing a predicate on a tree node - from the disjunction of several other such lambda functions. - """ - # filter out the pipe - tokens = [x for x in tokens if x != "|"] - if len(tokens) == 1: - return tokens[0] - elif len(tokens) == 2: - return (lambda a, b: lambda n, m=None, l=None: a(n, m, l) or b(n, m, l))( - tokens[0], tokens[1] - ) - - -def _macro_defn_action(_s, _l, tokens): - """ - Builds a dictionary structure which defines the given macro. - """ - assert len(tokens) == 3 - assert tokens[0] == "@" - return {tokens[1]: tokens[2]} - - -def _tgrep_exprs_action(_s, _l, tokens): - """ - This is the top-lebel node in a tgrep2 search string; the - predicate function it returns binds together all the state of a - tgrep2 search string. - - Builds a lambda function representing a predicate on a tree node - from the disjunction of several tgrep expressions. Also handles - macro definitions and macro name binding, and node label - definitions and node label binding. - """ - if len(tokens) == 1: - return lambda n, m=None, l=None: tokens[0](n, None, {}) - # filter out all the semicolons - tokens = [x for x in tokens if x != ";"] - # collect all macro definitions - macro_dict = {} - macro_defs = [tok for tok in tokens if isinstance(tok, dict)] - for macro_def in macro_defs: - macro_dict.update(macro_def) - # collect all tgrep expressions - tgrep_exprs = [tok for tok in tokens if not isinstance(tok, dict)] - # create a new scope for the node label dictionary - def top_level_pred(n, m=macro_dict, l=None): - label_dict = {} - # bind macro definitions and OR together all tgrep_exprs - return any(predicate(n, m, label_dict) for predicate in tgrep_exprs) - - return top_level_pred - - -def _build_tgrep_parser(set_parse_actions=True): - """ - Builds a pyparsing-based parser object for tokenizing and - interpreting tgrep search strings. 
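And a corresponding sketch for macro definitions, which the top-level action above resolves through the ``m`` argument (same importability assumption; the macro name ``PROPER`` is illustrative):

import nltk.tgrep
from nltk.tree import ParentedTree

t = ParentedTree.fromstring("(S (NP (NNP Rome)) (VP (VBD fell)))")
# define a macro named PROPER, then use it via @PROPER in the expression after ';'
pred = nltk.tgrep.tgrep_compile("@ PROPER /^NNP/; NP < @PROPER")
print([pos for pos in t.treepositions() if pred(t[pos])])   # [(0,)]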
- """ - tgrep_op = pyparsing.Optional("!") + pyparsing.Regex("[$%,.<>][%,.<>0-9-':]*") - tgrep_qstring = pyparsing.QuotedString( - quoteChar='"', escChar="\\", unquoteResults=False - ) - tgrep_node_regex = pyparsing.QuotedString( - quoteChar="/", escChar="\\", unquoteResults=False - ) - tgrep_qstring_icase = pyparsing.Regex('i@\\"(?:[^"\\n\\r\\\\]|(?:\\\\.))*\\"') - tgrep_node_regex_icase = pyparsing.Regex("i@\\/(?:[^/\\n\\r\\\\]|(?:\\\\.))*\\/") - tgrep_node_literal = pyparsing.Regex("[^][ \r\t\n;:.,&|<>()$!@%'^=]+") - tgrep_expr = pyparsing.Forward() - tgrep_relations = pyparsing.Forward() - tgrep_parens = pyparsing.Literal("(") + tgrep_expr + ")" - tgrep_nltk_tree_pos = ( - pyparsing.Literal("N(") - + pyparsing.Optional( - pyparsing.Word(pyparsing.nums) - + "," - + pyparsing.Optional( - pyparsing.delimitedList(pyparsing.Word(pyparsing.nums), delim=",") - + pyparsing.Optional(",") - ) - ) - + ")" - ) - tgrep_node_label = pyparsing.Regex("[A-Za-z0-9]+") - tgrep_node_label_use = pyparsing.Combine("=" + tgrep_node_label) - # see _tgrep_segmented_pattern_action - tgrep_node_label_use_pred = tgrep_node_label_use.copy() - macro_name = pyparsing.Regex("[^];:.,&|<>()[$!@%'^=\r\t\n ]+") - macro_name.setWhitespaceChars("") - macro_use = pyparsing.Combine("@" + macro_name) - tgrep_node_expr = ( - tgrep_node_label_use_pred - | macro_use - | tgrep_nltk_tree_pos - | tgrep_qstring_icase - | tgrep_node_regex_icase - | tgrep_qstring - | tgrep_node_regex - | "*" - | tgrep_node_literal - ) - tgrep_node_expr2 = ( - tgrep_node_expr - + pyparsing.Literal("=").setWhitespaceChars("") - + tgrep_node_label.copy().setWhitespaceChars("") - ) | tgrep_node_expr - tgrep_node = tgrep_parens | ( - pyparsing.Optional("'") - + tgrep_node_expr2 - + pyparsing.ZeroOrMore("|" + tgrep_node_expr) - ) - tgrep_brackets = pyparsing.Optional("!") + "[" + tgrep_relations + "]" - tgrep_relation = tgrep_brackets | (tgrep_op + tgrep_node) - tgrep_rel_conjunction = pyparsing.Forward() - tgrep_rel_conjunction << ( - tgrep_relation - + pyparsing.ZeroOrMore(pyparsing.Optional("&") + tgrep_rel_conjunction) - ) - tgrep_relations << tgrep_rel_conjunction + pyparsing.ZeroOrMore( - "|" + tgrep_relations - ) - tgrep_expr << tgrep_node + pyparsing.Optional(tgrep_relations) - tgrep_expr_labeled = tgrep_node_label_use + pyparsing.Optional(tgrep_relations) - tgrep_expr2 = tgrep_expr + pyparsing.ZeroOrMore(":" + tgrep_expr_labeled) - macro_defn = ( - pyparsing.Literal("@") + pyparsing.White().suppress() + macro_name + tgrep_expr2 - ) - tgrep_exprs = ( - pyparsing.Optional(macro_defn + pyparsing.ZeroOrMore(";" + macro_defn) + ";") - + tgrep_expr2 - + pyparsing.ZeroOrMore(";" + (macro_defn | tgrep_expr2)) - + pyparsing.ZeroOrMore(";").suppress() - ) - if set_parse_actions: - tgrep_node_label_use.setParseAction(_tgrep_node_label_use_action) - tgrep_node_label_use_pred.setParseAction(_tgrep_node_label_pred_use_action) - macro_use.setParseAction(_tgrep_macro_use_action) - tgrep_node.setParseAction(_tgrep_node_action) - tgrep_node_expr2.setParseAction(_tgrep_bind_node_label_action) - tgrep_parens.setParseAction(_tgrep_parens_action) - tgrep_nltk_tree_pos.setParseAction(_tgrep_nltk_tree_pos_action) - tgrep_relation.setParseAction(_tgrep_relation_action) - tgrep_rel_conjunction.setParseAction(_tgrep_conjunction_action) - tgrep_relations.setParseAction(_tgrep_rel_disjunction_action) - macro_defn.setParseAction(_macro_defn_action) - # the whole expression is also the conjunction of two - # predicates: the first node predicate, and the remaining - # relation 
predicates - tgrep_expr.setParseAction(_tgrep_conjunction_action) - tgrep_expr_labeled.setParseAction(_tgrep_segmented_pattern_action) - tgrep_expr2.setParseAction( - functools.partial(_tgrep_conjunction_action, join_char=":") - ) - tgrep_exprs.setParseAction(_tgrep_exprs_action) - return tgrep_exprs.ignore("#" + pyparsing.restOfLine) - - -def tgrep_tokenize(tgrep_string): - """ - Tokenizes a TGrep search string into separate tokens. - """ - parser = _build_tgrep_parser(False) - if isinstance(tgrep_string, bytes): - tgrep_string = tgrep_string.decode() - return list(parser.parseString(tgrep_string)) - - -def tgrep_compile(tgrep_string): - """ - Parses (and tokenizes, if necessary) a TGrep search string into a - lambda function. - """ - parser = _build_tgrep_parser(True) - if isinstance(tgrep_string, bytes): - tgrep_string = tgrep_string.decode() - return list(parser.parseString(tgrep_string, parseAll=True))[0] - - -def treepositions_no_leaves(tree): - """ - Returns all the tree positions in the given tree which are not - leaf nodes. - """ - treepositions = tree.treepositions() - # leaves are treeposition tuples that are not prefixes of any - # other treeposition - prefixes = set() - for pos in treepositions: - for length in range(len(pos)): - prefixes.add(pos[:length]) - return [pos for pos in treepositions if pos in prefixes] - - -def tgrep_positions(pattern, trees, search_leaves=True): - """ - Return the tree positions in the trees which match the given pattern. - - :param pattern: a tgrep search pattern - :type pattern: str or output of tgrep_compile() - :param trees: a sequence of NLTK trees (usually ParentedTrees) - :type trees: iter(ParentedTree) or iter(Tree) - :param search_leaves: whether to return matching leaf nodes - :type search_leaves: bool - :rtype: iter(tree positions) - """ - - if isinstance(pattern, (bytes, str)): - pattern = tgrep_compile(pattern) - - for tree in trees: - try: - if search_leaves: - positions = tree.treepositions() - else: - positions = treepositions_no_leaves(tree) - yield [position for position in positions if pattern(tree[position])] - except AttributeError: - yield [] - - -def tgrep_nodes(pattern, trees, search_leaves=True): - """ - Return the tree nodes in the trees which match the given pattern. - - :param pattern: a tgrep search pattern - :type pattern: str or output of tgrep_compile() - :param trees: a sequence of NLTK trees (usually ParentedTrees) - :type trees: iter(ParentedTree) or iter(Tree) - :param search_leaves: whether to return matching leaf nodes - :type search_leaves: bool - :rtype: iter(tree nodes) - """ - - if isinstance(pattern, (bytes, str)): - pattern = tgrep_compile(pattern) - - for tree in trees: - try: - if search_leaves: - positions = tree.treepositions() - else: - positions = treepositions_no_leaves(tree) - yield [tree[position] for position in positions if pattern(tree[position])] - except AttributeError: - yield [] diff --git a/pipeline/nltk/tokenize/__init__.py b/pipeline/nltk/tokenize/__init__.py deleted file mode 100644 index 5162796f751878d3521aaf66de56fac11b2a2dd8..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tokenize/__init__.py +++ /dev/null @@ -1,132 +0,0 @@ -# Natural Language Toolkit: Tokenizers -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird (minor additions) -# Contributors: matthewmc, clouds56 -# URL: -# For license information, see LICENSE.TXT - -r""" -NLTK Tokenizer Package - -Tokenizers divide strings into lists of substrings. 
For example, -tokenizers can be used to find the words and punctuation in a string: - - >>> from nltk.tokenize import word_tokenize - >>> s = '''Good muffins cost $3.88\nin New York. Please buy me - ... two of them.\n\nThanks.''' - >>> word_tokenize(s) # doctest: +NORMALIZE_WHITESPACE - ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', - 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] - -This particular tokenizer requires the Punkt sentence tokenization -models to be installed. NLTK also provides a simpler, -regular-expression based tokenizer, which splits text on whitespace -and punctuation: - - >>> from nltk.tokenize import wordpunct_tokenize - >>> wordpunct_tokenize(s) # doctest: +NORMALIZE_WHITESPACE - ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.', - 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] - -We can also operate at the level of sentences, using the sentence -tokenizer directly as follows: - - >>> from nltk.tokenize import sent_tokenize, word_tokenize - >>> sent_tokenize(s) - ['Good muffins cost $3.88\nin New York.', 'Please buy me\ntwo of them.', 'Thanks.'] - >>> [word_tokenize(t) for t in sent_tokenize(s)] # doctest: +NORMALIZE_WHITESPACE - [['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.'], - ['Please', 'buy', 'me', 'two', 'of', 'them', '.'], ['Thanks', '.']] - -Caution: when tokenizing a Unicode string, make sure you are not -using an encoded version of the string (it may be necessary to -decode it first, e.g. with ``s.decode("utf8")``. - -NLTK tokenizers can produce token-spans, represented as tuples of integers -having the same semantics as string slices, to support efficient comparison -of tokenizers. (These methods are implemented as generators.) - - >>> from nltk.tokenize import WhitespaceTokenizer - >>> list(WhitespaceTokenizer().span_tokenize(s)) # doctest: +NORMALIZE_WHITESPACE - [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44), - (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)] - -There are numerous ways to tokenize text. If you need more control over -tokenization, see the other methods provided in this package. - -For further information, please see Chapter 3 of the NLTK book. -""" - -import re - -from nltk.data import load -from nltk.tokenize.casual import TweetTokenizer, casual_tokenize -from nltk.tokenize.destructive import NLTKWordTokenizer -from nltk.tokenize.legality_principle import LegalitySyllableTokenizer -from nltk.tokenize.mwe import MWETokenizer -from nltk.tokenize.punkt import PunktSentenceTokenizer -from nltk.tokenize.regexp import ( - BlanklineTokenizer, - RegexpTokenizer, - WhitespaceTokenizer, - WordPunctTokenizer, - blankline_tokenize, - regexp_tokenize, - wordpunct_tokenize, -) -from nltk.tokenize.repp import ReppTokenizer -from nltk.tokenize.sexpr import SExprTokenizer, sexpr_tokenize -from nltk.tokenize.simple import ( - LineTokenizer, - SpaceTokenizer, - TabTokenizer, - line_tokenize, -) -from nltk.tokenize.sonority_sequencing import SyllableTokenizer -from nltk.tokenize.stanford_segmenter import StanfordSegmenter -from nltk.tokenize.texttiling import TextTilingTokenizer -from nltk.tokenize.toktok import ToktokTokenizer -from nltk.tokenize.treebank import TreebankWordDetokenizer, TreebankWordTokenizer -from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize - - -# Standard sentence tokenizer. 
-def sent_tokenize(text, language="english"): - """ - Return a sentence-tokenized copy of *text*, - using NLTK's recommended sentence tokenizer - (currently :class:`.PunktSentenceTokenizer` - for the specified language). - - :param text: text to split into sentences - :param language: the model name in the Punkt corpus - """ - tokenizer = load(f"tokenizers/punkt/{language}.pickle") - return tokenizer.tokenize(text) - - -# Standard word tokenizer. -_treebank_word_tokenizer = NLTKWordTokenizer() - - -def word_tokenize(text, language="english", preserve_line=False): - """ - Return a tokenized copy of *text*, - using NLTK's recommended word tokenizer - (currently an improved :class:`.TreebankWordTokenizer` - along with :class:`.PunktSentenceTokenizer` - for the specified language). - - :param text: text to split into words - :type text: str - :param language: the model name in the Punkt corpus - :type language: str - :param preserve_line: A flag to decide whether to sentence tokenize the text or not. - :type preserve_line: bool - """ - sentences = [text] if preserve_line else sent_tokenize(text, language) - return [ - token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent) - ] diff --git a/pipeline/nltk/tokenize/__pycache__/__init__.cpython-39.pyc b/pipeline/nltk/tokenize/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index 6b7a8a82194edf2ad64dd2e6d7c76ef873152cb9..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tokenize/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tokenize/__pycache__/api.cpython-39.pyc b/pipeline/nltk/tokenize/__pycache__/api.cpython-39.pyc deleted file mode 100644 index 30f806103b239806f20aa3c371f9b3c69ca4d22d..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tokenize/__pycache__/api.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tokenize/__pycache__/casual.cpython-39.pyc b/pipeline/nltk/tokenize/__pycache__/casual.cpython-39.pyc deleted file mode 100644 index b10ef9cc876b487905679d99f1779da2bd13d8fd..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tokenize/__pycache__/casual.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tokenize/__pycache__/destructive.cpython-39.pyc b/pipeline/nltk/tokenize/__pycache__/destructive.cpython-39.pyc deleted file mode 100644 index f4507865acb4b8f5c5c0f2b781da438d9eabf8cc..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tokenize/__pycache__/destructive.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tokenize/__pycache__/legality_principle.cpython-39.pyc b/pipeline/nltk/tokenize/__pycache__/legality_principle.cpython-39.pyc deleted file mode 100644 index 35618f32fe1c50f55bc7b365e0aba78f48324f89..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tokenize/__pycache__/legality_principle.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tokenize/__pycache__/mwe.cpython-39.pyc b/pipeline/nltk/tokenize/__pycache__/mwe.cpython-39.pyc deleted file mode 100644 index 40be50f1b288910befc3fd337bf76330ffeb38b3..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tokenize/__pycache__/mwe.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tokenize/__pycache__/nist.cpython-39.pyc b/pipeline/nltk/tokenize/__pycache__/nist.cpython-39.pyc deleted file mode 100644 index 7a8f18267cb8dcd1abf120fc5948feaad75f0784..0000000000000000000000000000000000000000 Binary files 
a/pipeline/nltk/tokenize/__pycache__/nist.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tokenize/__pycache__/punkt.cpython-39.pyc b/pipeline/nltk/tokenize/__pycache__/punkt.cpython-39.pyc deleted file mode 100644 index d370491ee224b959f4a5233fa2c0f36ba8c14cf5..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tokenize/__pycache__/punkt.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tokenize/__pycache__/regexp.cpython-39.pyc b/pipeline/nltk/tokenize/__pycache__/regexp.cpython-39.pyc deleted file mode 100644 index 29f839a678fa349ce6ca4284c586e7b8220aece5..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tokenize/__pycache__/regexp.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tokenize/__pycache__/repp.cpython-39.pyc b/pipeline/nltk/tokenize/__pycache__/repp.cpython-39.pyc deleted file mode 100644 index 597cd27f0d2d911be67a4b1144d8fb0bc6e6ce53..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tokenize/__pycache__/repp.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tokenize/__pycache__/sexpr.cpython-39.pyc b/pipeline/nltk/tokenize/__pycache__/sexpr.cpython-39.pyc deleted file mode 100644 index 1540cf28c8afa9099ac945b9c2f5085e8ce24450..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tokenize/__pycache__/sexpr.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tokenize/__pycache__/simple.cpython-39.pyc b/pipeline/nltk/tokenize/__pycache__/simple.cpython-39.pyc deleted file mode 100644 index 8222a4b063992c04a60d9dc89625dd36a17786e6..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tokenize/__pycache__/simple.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tokenize/__pycache__/sonority_sequencing.cpython-39.pyc b/pipeline/nltk/tokenize/__pycache__/sonority_sequencing.cpython-39.pyc deleted file mode 100644 index a525e61e8f3b8636bda4a079d3d91e8d86168381..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tokenize/__pycache__/sonority_sequencing.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tokenize/__pycache__/stanford.cpython-39.pyc b/pipeline/nltk/tokenize/__pycache__/stanford.cpython-39.pyc deleted file mode 100644 index eaa3d75e3e0ffe4c85cbbefb6507ce1666017a45..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tokenize/__pycache__/stanford.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tokenize/__pycache__/stanford_segmenter.cpython-39.pyc b/pipeline/nltk/tokenize/__pycache__/stanford_segmenter.cpython-39.pyc deleted file mode 100644 index 0edcfc9a81ab437d06eb0638f2a83456cb8a7b9f..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tokenize/__pycache__/stanford_segmenter.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tokenize/__pycache__/texttiling.cpython-39.pyc b/pipeline/nltk/tokenize/__pycache__/texttiling.cpython-39.pyc deleted file mode 100644 index 061c38e62b59bc12160065408aeaf372fb038a3a..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tokenize/__pycache__/texttiling.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tokenize/__pycache__/toktok.cpython-39.pyc b/pipeline/nltk/tokenize/__pycache__/toktok.cpython-39.pyc deleted file mode 100644 index d84bc09004cd8ba832bacc39433d470b43c3d1f7..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tokenize/__pycache__/toktok.cpython-39.pyc and /dev/null differ diff --git 
a/pipeline/nltk/tokenize/__pycache__/treebank.cpython-39.pyc b/pipeline/nltk/tokenize/__pycache__/treebank.cpython-39.pyc deleted file mode 100644 index c4f3f491ad9aae17fde2a5ad22eab6861ad18368..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tokenize/__pycache__/treebank.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tokenize/__pycache__/util.cpython-39.pyc b/pipeline/nltk/tokenize/__pycache__/util.cpython-39.pyc deleted file mode 100644 index 16f95ef897961ddb6a99e26ee2aeb78146180dd4..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tokenize/__pycache__/util.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tokenize/api.py b/pipeline/nltk/tokenize/api.py deleted file mode 100644 index 419ff646cfb89d5f3b63e645b53bedea09a1b479..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tokenize/api.py +++ /dev/null @@ -1,83 +0,0 @@ -# Natural Language Toolkit: Tokenizer Interface -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird -# URL: -# For license information, see LICENSE.TXT - -""" -Tokenizer Interface -""" - -from abc import ABC, abstractmethod -from typing import Iterator, List, Tuple - -from nltk.internals import overridden -from nltk.tokenize.util import string_span_tokenize - - -class TokenizerI(ABC): - """ - A processing interface for tokenizing a string. - Subclasses must define ``tokenize()`` or ``tokenize_sents()`` (or both). - """ - - @abstractmethod - def tokenize(self, s: str) -> List[str]: - """ - Return a tokenized copy of *s*. - - :rtype: List[str] - """ - if overridden(self.tokenize_sents): - return self.tokenize_sents([s])[0] - - def span_tokenize(self, s: str) -> Iterator[Tuple[int, int]]: - """ - Identify the tokens using integer offsets ``(start_i, end_i)``, - where ``s[start_i:end_i]`` is the corresponding token. - - :rtype: Iterator[Tuple[int, int]] - """ - raise NotImplementedError() - - def tokenize_sents(self, strings: List[str]) -> List[List[str]]: - """ - Apply ``self.tokenize()`` to each element of ``strings``. I.e.: - - return [self.tokenize(s) for s in strings] - - :rtype: List[List[str]] - """ - return [self.tokenize(s) for s in strings] - - def span_tokenize_sents( - self, strings: List[str] - ) -> Iterator[List[Tuple[int, int]]]: - """ - Apply ``self.span_tokenize()`` to each element of ``strings``. I.e.: - - return [self.span_tokenize(s) for s in strings] - - :yield: List[Tuple[int, int]] - """ - for s in strings: - yield list(self.span_tokenize(s)) - - -class StringTokenizer(TokenizerI): - """A tokenizer that divides a string into substrings by splitting - on the specified string (defined in subclasses). 
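As a sketch of how small a concrete subclass can be, the hypothetical ``CommaTokenizer`` below mirrors how ``SpaceTokenizer`` and ``TabTokenizer`` are defined; it assumes upstream ``nltk.tokenize.api`` is importable:

from nltk.tokenize.api import StringTokenizer

class CommaTokenizer(StringTokenizer):
    # the only thing a StringTokenizer subclass must supply is the split string
    _string = ","

tok = CommaTokenizer()
print(tok.tokenize("a,b,c"))              # ['a', 'b', 'c']
print(list(tok.span_tokenize("a,b,c")))   # token spans: [(0, 1), (2, 3), (4, 5)]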
- """ - - @property - @abstractmethod - def _string(self): - raise NotImplementedError - - def tokenize(self, s): - return s.split(self._string) - - def span_tokenize(self, s): - yield from string_span_tokenize(s, self._string) diff --git a/pipeline/nltk/tokenize/casual.py b/pipeline/nltk/tokenize/casual.py deleted file mode 100644 index d0545abe50530c20903f8aeaa29fbfc55094e70e..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tokenize/casual.py +++ /dev/null @@ -1,458 +0,0 @@ -# -# Natural Language Toolkit: Twitter Tokenizer -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Christopher Potts -# Ewan Klein (modifications) -# Pierpaolo Pantone <> (modifications) -# Tom Aarsen <> (modifications) -# URL: -# For license information, see LICENSE.TXT -# - - -""" -Twitter-aware tokenizer, designed to be flexible and easy to adapt to new -domains and tasks. The basic logic is this: - -1. The tuple REGEXPS defines a list of regular expression - strings. - -2. The REGEXPS strings are put, in order, into a compiled - regular expression object called WORD_RE, under the TweetTokenizer - class. - -3. The tokenization is done by WORD_RE.findall(s), where s is the - user-supplied string, inside the tokenize() method of the class - TweetTokenizer. - -4. When instantiating Tokenizer objects, there are several options: - * preserve_case. By default, it is set to True. If it is set to - False, then the tokenizer will downcase everything except for - emoticons. - * reduce_len. By default, it is set to False. It specifies whether - to replace repeated character sequences of length 3 or greater - with sequences of length 3. - * strip_handles. By default, it is set to False. It specifies - whether to remove Twitter handles of text used in the - `tokenize` method. - * match_phone_numbers. By default, it is set to True. It indicates - whether the `tokenize` method should look for phone numbers. -""" - - -###################################################################### - -import html -from typing import List - -import regex # https://github.com/nltk/nltk/issues/2409 - -from nltk.tokenize.api import TokenizerI - -###################################################################### -# The following strings are components in the regular expression -# that is used for tokenizing. It's important that phone_number -# appears first in the final regex (since it can contain whitespace). -# It also could matter that tags comes after emoticons, due to the -# possibility of having text like -# -# <:| and some text >:) -# -# Most importantly, the final element should always be last, since it -# does a last ditch whitespace-based tokenization of whatever is left. - -# ToDo: Update with https://en.wikipedia.org/wiki/List_of_emoticons ? - -# This particular element is used in a couple ways, so we define it -# with a name: -EMOTICONS = r""" - (?: - [<>]? - [:;=8] # eyes - [\-o\*\']? # optional nose - [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth - | - [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth - [\-o\*\']? # optional nose - [:;=8] # eyes - [<>]? - | - {}\[\]]+ # Run of non-space, non-()<>{}[] - | # or - \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...) - | - \([^\s]+?\) # balanced parens, non-recursive: (...) - )+ - (?: # End with: - \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...) - | - \([^\s]+?\) # balanced parens, non-recursive: (...) 
- | # or - [^\s`!()\[\]{};:'".,<>?«»“”‘’] # not a space or one of these punct chars - ) - | # OR, the following to match naked domains: - (?: - (?\s]+>""", - # ASCII Arrows - r"""[\-]+>|<[\-]+""", - # Twitter username: - r"""(?:@[\w_]+)""", - # Twitter hashtags: - r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""", - # email addresses - r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""", - # Zero-Width-Joiner and Skin tone modifier emojis - """.(?: - [\U0001F3FB-\U0001F3FF]?(?:\u200d.[\U0001F3FB-\U0001F3FF]?)+ - | - [\U0001F3FB-\U0001F3FF] - )""", - # flags - FLAGS, - # Remaining word types: - r""" - (?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes. - | - (?:[+\-]?\d+[,/.:-]\d+[+\-]?) # Numbers, including fractions, decimals. - | - (?:[\w_]+) # Words without apostrophes or dashes. - | - (?:\.(?:\s*\.){1,}) # Ellipsis dots. - | - (?:\S) # Everything else that isn't whitespace. - """, -) - -# Take the main components and add a phone regex as the second parameter -REGEXPS_PHONE = (REGEXPS[0], PHONE_REGEX, *REGEXPS[1:]) - -###################################################################### -# TweetTokenizer.WORD_RE and TweetTokenizer.PHONE_WORD_RE represent -# the core tokenizing regexes. They are compiled lazily. - -# WORD_RE performs poorly on these patterns: -HANG_RE = regex.compile(r"([^a-zA-Z0-9])\1{3,}") - -# The emoticon string gets its own regex so that we can preserve case for -# them as needed: -EMOTICON_RE = regex.compile(EMOTICONS, regex.VERBOSE | regex.I | regex.UNICODE) - -# These are for regularizing HTML entities to Unicode: -ENT_RE = regex.compile(r"&(#?(x?))([^&;\s]+);") - -# For stripping away handles from a tweet: -HANDLES_RE = regex.compile( - r"(?>> from nltk.tokenize.casual import _replace_html_entities - >>> _replace_html_entities(b'Price: £100') - 'Price: \\xa3100' - >>> print(_replace_html_entities(b'Price: £100')) - Price: £100 - >>> - """ - - def _convert_entity(match): - entity_body = match.group(3) - if match.group(1): - try: - if match.group(2): - number = int(entity_body, 16) - else: - number = int(entity_body, 10) - # Numeric character references in the 80-9F range are typically - # interpreted by browsers as representing the characters mapped - # to bytes 80-9F in the Windows-1252 encoding. For more info - # see: https://en.wikipedia.org/wiki/ISO/IEC_8859-1#Similar_character_sets - if 0x80 <= number <= 0x9F: - return bytes((number,)).decode("cp1252") - except ValueError: - number = None - else: - if entity_body in keep: - return match.group(0) - number = html.entities.name2codepoint.get(entity_body) - if number is not None: - try: - return chr(number) - except (ValueError, OverflowError): - pass - - return "" if remove_illegal else match.group(0) - - return ENT_RE.sub(_convert_entity, _str_to_unicode(text, encoding)) - - -###################################################################### - - -class TweetTokenizer(TokenizerI): - r""" - Tokenizer for tweets. - - >>> from nltk.tokenize import TweetTokenizer - >>> tknzr = TweetTokenizer() - >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--" - >>> tknzr.tokenize(s0) # doctest: +NORMALIZE_WHITESPACE - ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', - '<--'] - - Examples using `strip_handles` and `reduce_len parameters`: - - >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True) - >>> s1 = '@remy: This is waaaaayyyy too much for you!!!!!!' 
- >>> tknzr.tokenize(s1) - [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!'] - """ - - # Values used to lazily compile WORD_RE and PHONE_WORD_RE, - # which are the core tokenizing regexes. - _WORD_RE = None - _PHONE_WORD_RE = None - - ###################################################################### - - def __init__( - self, - preserve_case=True, - reduce_len=False, - strip_handles=False, - match_phone_numbers=True, - ): - """ - Create a `TweetTokenizer` instance with settings for use in the `tokenize` method. - - :param preserve_case: Flag indicating whether to preserve the casing (capitalisation) - of text used in the `tokenize` method. Defaults to True. - :type preserve_case: bool - :param reduce_len: Flag indicating whether to replace repeated character sequences - of length 3 or greater with sequences of length 3. Defaults to False. - :type reduce_len: bool - :param strip_handles: Flag indicating whether to remove Twitter handles of text used - in the `tokenize` method. Defaults to False. - :type strip_handles: bool - :param match_phone_numbers: Flag indicating whether the `tokenize` method should look - for phone numbers. Defaults to True. - :type match_phone_numbers: bool - """ - self.preserve_case = preserve_case - self.reduce_len = reduce_len - self.strip_handles = strip_handles - self.match_phone_numbers = match_phone_numbers - - def tokenize(self, text: str) -> List[str]: - """Tokenize the input text. - - :param text: str - :rtype: list(str) - :return: a tokenized list of strings; joining this list returns\ - the original string if `preserve_case=False`. - """ - # Fix HTML character entities: - text = _replace_html_entities(text) - # Remove username handles - if self.strip_handles: - text = remove_handles(text) - # Normalize word lengthening - if self.reduce_len: - text = reduce_lengthening(text) - # Shorten problematic sequences of characters - safe_text = HANG_RE.sub(r"\1\1\1", text) - # Recognise phone numbers during tokenization - if self.match_phone_numbers: - words = self.PHONE_WORD_RE.findall(safe_text) - else: - words = self.WORD_RE.findall(safe_text) - # Possibly alter the case, but avoid changing emoticons like :D into :d: - if not self.preserve_case: - words = list( - map((lambda x: x if EMOTICON_RE.search(x) else x.lower()), words) - ) - return words - - @property - def WORD_RE(self) -> "regex.Pattern": - """Core TweetTokenizer regex""" - # Compiles the regex for this and all future instantiations of TweetTokenizer. - if not type(self)._WORD_RE: - type(self)._WORD_RE = regex.compile( - f"({'|'.join(REGEXPS)})", - regex.VERBOSE | regex.I | regex.UNICODE, - ) - return type(self)._WORD_RE - - @property - def PHONE_WORD_RE(self) -> "regex.Pattern": - """Secondary core TweetTokenizer regex""" - # Compiles the regex for this and all future instantiations of TweetTokenizer. - if not type(self)._PHONE_WORD_RE: - type(self)._PHONE_WORD_RE = regex.compile( - f"({'|'.join(REGEXPS_PHONE)})", - regex.VERBOSE | regex.I | regex.UNICODE, - ) - return type(self)._PHONE_WORD_RE - - -###################################################################### -# Normalization Functions -###################################################################### - - -def reduce_lengthening(text): - """ - Replace repeated character sequences of length 3 or greater with sequences - of length 3. - """ - pattern = regex.compile(r"(.)\1{2,}") - return pattern.sub(r"\1\1\1", text) - - -def remove_handles(text): - """ - Remove Twitter username handles from text. 
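The ``WORD_RE`` / ``PHONE_WORD_RE`` properties above use a small lazy-compilation idiom: the compiled pattern is cached on the class rather than the instance, so every tokenizer shares a single compile. A standalone sketch of the same idiom (using the stdlib ``re`` module; the pattern is illustrative, not the real ``REGEXPS`` union):

import re

class LazyWordMatcher:
    _WORD_RE = None  # shared across instances, compiled on first use

    @property
    def WORD_RE(self):
        if type(self)._WORD_RE is None:
            type(self)._WORD_RE = re.compile(r"\w+|\S", re.UNICODE)
        return type(self)._WORD_RE

a, b = LazyWordMatcher(), LazyWordMatcher()
assert a.WORD_RE is b.WORD_RE   # one shared compiled pattern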
- """ - # Substitute handles with ' ' to ensure that text on either side of removed handles are tokenized correctly - return HANDLES_RE.sub(" ", text) - - -###################################################################### -# Tokenization Function -###################################################################### - - -def casual_tokenize( - text, - preserve_case=True, - reduce_len=False, - strip_handles=False, - match_phone_numbers=True, -): - """ - Convenience function for wrapping the tokenizer. - """ - return TweetTokenizer( - preserve_case=preserve_case, - reduce_len=reduce_len, - strip_handles=strip_handles, - match_phone_numbers=match_phone_numbers, - ).tokenize(text) - - -############################################################################### diff --git a/pipeline/nltk/tokenize/destructive.py b/pipeline/nltk/tokenize/destructive.py deleted file mode 100644 index 4beb395dde57bf73082dfa91f65ad625d199bc31..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tokenize/destructive.py +++ /dev/null @@ -1,233 +0,0 @@ -# Natural Language Toolkit: NLTK's very own tokenizer. -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Liling Tan -# Tom Aarsen <> (modifications) -# URL: -# For license information, see LICENSE.TXT - - -import re -import warnings -from typing import Iterator, List, Tuple - -from nltk.tokenize.api import TokenizerI -from nltk.tokenize.util import align_tokens - - -class MacIntyreContractions: - """ - List of contractions adapted from Robert MacIntyre's tokenizer. - """ - - CONTRACTIONS2 = [ - r"(?i)\b(can)(?#X)(not)\b", - r"(?i)\b(d)(?#X)('ye)\b", - r"(?i)\b(gim)(?#X)(me)\b", - r"(?i)\b(gon)(?#X)(na)\b", - r"(?i)\b(got)(?#X)(ta)\b", - r"(?i)\b(lem)(?#X)(me)\b", - r"(?i)\b(more)(?#X)('n)\b", - r"(?i)\b(wan)(?#X)(na)(?=\s)", - ] - CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"] - CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"] - - -class NLTKWordTokenizer(TokenizerI): - """ - The NLTK tokenizer that has improved upon the TreebankWordTokenizer. - - This is the method that is invoked by ``word_tokenize()``. It assumes that the - text has already been segmented into sentences, e.g. using ``sent_tokenize()``. - - The tokenizer is "destructive" such that the regexes applied will munge the - input string to a state beyond re-construction. It is possible to apply - `TreebankWordDetokenizer.detokenize` to the tokenized outputs of - `NLTKDestructiveWordTokenizer.tokenize` but there's no guarantees to - revert to the original string. - """ - - # Starting quotes. - STARTING_QUOTES = [ - (re.compile("([«“‘„]|[`]+)", re.U), r" \1 "), - (re.compile(r"^\""), r"``"), - (re.compile(r"(``)"), r" \1 "), - (re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "), - (re.compile(r"(?i)(\')(?!re|ve|ll|m|t|s|d|n)(\w)\b", re.U), r"\1 \2"), - ] - - # Ending quotes. - ENDING_QUOTES = [ - (re.compile("([»”’])", re.U), r" \1 "), - (re.compile(r"''"), " '' "), - (re.compile(r'"'), " '' "), - (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "), - (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "), - ] - - # For improvements for starting/closing quotes from TreebankWordTokenizer, - # see discussion on https://github.com/nltk/nltk/pull/1437 - # Adding to TreebankWordTokenizer, nltk.word_tokenize now splits on - # - chervon quotes u'\xab' and u'\xbb' . 
- # - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d' - # See https://github.com/nltk/nltk/issues/1995#issuecomment-376741608 - # Also, behavior of splitting on clitics now follows Stanford CoreNLP - # - clitics covered (?!re|ve|ll|m|t|s|d)(\w)\b - - # Punctuation. - PUNCTUATION = [ - (re.compile(r'([^\.])(\.)([\]\)}>"\'' "»”’ " r"]*)\s*$", re.U), r"\1 \2 \3 "), - (re.compile(r"([:,])([^\d])"), r" \1 \2"), - (re.compile(r"([:,])$"), r" \1 "), - ( - re.compile(r"\.{2,}", re.U), - r" \g<0> ", - ), # See https://github.com/nltk/nltk/pull/2322 - (re.compile(r"[;@#$%&]"), r" \g<0> "), - ( - re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'), - r"\1 \2\3 ", - ), # Handles the final period. - (re.compile(r"[?!]"), r" \g<0> "), - (re.compile(r"([^'])' "), r"\1 ' "), - ( - re.compile(r"[*]", re.U), - r" \g<0> ", - ), # See https://github.com/nltk/nltk/pull/2322 - ] - - # Pads parentheses - PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ") - - # Optionally: Convert parentheses, brackets and converts them to PTB symbols. - CONVERT_PARENTHESES = [ - (re.compile(r"\("), "-LRB-"), - (re.compile(r"\)"), "-RRB-"), - (re.compile(r"\["), "-LSB-"), - (re.compile(r"\]"), "-RSB-"), - (re.compile(r"\{"), "-LCB-"), - (re.compile(r"\}"), "-RCB-"), - ] - - DOUBLE_DASHES = (re.compile(r"--"), r" -- ") - - # List of contractions adapted from Robert MacIntyre's tokenizer. - _contractions = MacIntyreContractions() - CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2)) - CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3)) - - def tokenize( - self, text: str, convert_parentheses: bool = False, return_str: bool = False - ) -> List[str]: - r"""Return a tokenized copy of `text`. - - >>> from nltk.tokenize import NLTKWordTokenizer - >>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York. Please buy me\ntwo of them.\nThanks.''' - >>> NLTKWordTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE - ['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36', - 'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', - 'of', 'them.', 'Thanks', '.'] - >>> NLTKWordTokenizer().tokenize(s, convert_parentheses=True) # doctest: +NORMALIZE_WHITESPACE - ['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36', - 'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', - 'of', 'them.', 'Thanks', '.'] - - - :param text: A string with a sentence or sentences. - :type text: str - :param convert_parentheses: if True, replace parentheses to PTB symbols, - e.g. `(` to `-LRB-`. Defaults to False. - :type convert_parentheses: bool, optional - :param return_str: If True, return tokens as space-separated string, - defaults to False. - :type return_str: bool, optional - :return: List of tokens from `text`. - :rtype: List[str] - """ - if return_str: - warnings.warn( - "Parameter 'return_str' has been deprecated and should no " - "longer be used.", - category=DeprecationWarning, - stacklevel=2, - ) - - for regexp, substitution in self.STARTING_QUOTES: - text = regexp.sub(substitution, text) - - for regexp, substitution in self.PUNCTUATION: - text = regexp.sub(substitution, text) - - # Handles parentheses. - regexp, substitution = self.PARENS_BRACKETS - text = regexp.sub(substitution, text) - # Optionally convert parentheses - if convert_parentheses: - for regexp, substitution in self.CONVERT_PARENTHESES: - text = regexp.sub(substitution, text) - - # Handles double dash. 
- regexp, substitution = self.DOUBLE_DASHES - text = regexp.sub(substitution, text) - - # add extra space to make things easier - text = " " + text + " " - - for regexp, substitution in self.ENDING_QUOTES: - text = regexp.sub(substitution, text) - - for regexp in self.CONTRACTIONS2: - text = regexp.sub(r" \1 \2 ", text) - for regexp in self.CONTRACTIONS3: - text = regexp.sub(r" \1 \2 ", text) - - # We are not using CONTRACTIONS4 since - # they are also commented out in the SED scripts - # for regexp in self._contractions.CONTRACTIONS4: - # text = regexp.sub(r' \1 \2 \3 ', text) - - return text.split() - - def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]: - r""" - Returns the spans of the tokens in ``text``. - Uses the post-hoc nltk.tokens.align_tokens to return the offset spans. - - >>> from nltk.tokenize import NLTKWordTokenizer - >>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).''' - >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23), - ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38), - ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59), - ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)] - >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected - True - >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in', - ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')', - ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.'] - >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected - True - - :param text: A string with a sentence or sentences. - :type text: str - :yield: Tuple[int, int] - """ - raw_tokens = self.tokenize(text) - - # Convert converted quotes back to original double quotes - # Do this only if original text contains double quote(s) or double - # single-quotes (because '' might be transformed to `` if it is - # treated as starting quotes). - if ('"' in text) or ("''" in text): - # Find double quotes and converted quotes - matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)] - - # Replace converted quotes back to double quotes - tokens = [ - matched.pop(0) if tok in ['"', "``", "''"] else tok - for tok in raw_tokens - ] - else: - tokens = raw_tokens - - yield from align_tokens(tokens, text) diff --git a/pipeline/nltk/tokenize/legality_principle.py b/pipeline/nltk/tokenize/legality_principle.py deleted file mode 100644 index 547827cefe1af65209e1f44237b7ac160b167920..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tokenize/legality_principle.py +++ /dev/null @@ -1,147 +0,0 @@ -# Natural Language Toolkit: Tokenizers -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Christopher Hench -# Alex Estes -# URL: -# For license information, see LICENSE.TXT - -""" -The Legality Principle is a language agnostic principle maintaining that syllable -onsets and codas (the beginning and ends of syllables not including the vowel) -are only legal if they are found as word onsets or codas in the language. The English -word ''admit'' must then be syllabified as ''ad-mit'' since ''dm'' is not found -word-initially in the English language (Bartlett et al.). This principle was first proposed -in Daniel Kahn's 1976 dissertation, ''Syllable-based generalizations in English phonology''. 
- -Kahn further argues that there is a ''strong tendency to syllabify in such a way that -initial clusters are of maximal length, consistent with the general constraints on -word-initial consonant clusters.'' Consequently, in addition to being legal onsets, -the longest legal onset is preferable---''Onset Maximization''. - -The default implementation assumes an English vowel set, but the `vowels` attribute -can be set to IPA or any other alphabet's vowel set for the use-case. -Both a valid set of vowels as well as a text corpus of words in the language -are necessary to determine legal onsets and subsequently syllabify words. - -The legality principle with onset maximization is a universal syllabification algorithm, -but that does not mean it performs equally across languages. Bartlett et al. (2009) -is a good benchmark for English accuracy if utilizing IPA (pg. 311). - -References: - -- Otto Jespersen. 1904. Lehrbuch der Phonetik. - Leipzig, Teubner. Chapter 13, Silbe, pp. 185-203. -- Theo Vennemann, ''On the Theory of Syllabic Phonology,'' 1972, p. 11. -- Daniel Kahn, ''Syllable-based generalizations in English phonology'', (PhD diss., MIT, 1976). -- Elisabeth Selkirk. 1984. On the major class features and syllable theory. - In Aronoff & Oehrle (eds.) Language Sound Structure: Studies in Phonology. - Cambridge, MIT Press. pp. 107-136. -- Jeremy Goslin and Ulrich Frauenfelder. 2001. A comparison of theoretical and human syllabification. Language and Speech, 44:409–436. -- Susan Bartlett, et al. 2009. On the Syllabification of Phonemes. - In HLT-NAACL. pp. 308-316. -- Christopher Hench. 2017. Resonances in Middle High German: New Methodologies in Prosody. UC Berkeley. -""" - -from collections import Counter - -from nltk.tokenize.api import TokenizerI - - -class LegalitySyllableTokenizer(TokenizerI): - """ - Syllabifies words based on the Legality Principle and Onset Maximization. - - >>> from nltk.tokenize import LegalitySyllableTokenizer - >>> from nltk import word_tokenize - >>> from nltk.corpus import words - >>> text = "This is a wonderful sentence." - >>> text_words = word_tokenize(text) - >>> LP = LegalitySyllableTokenizer(words.words()) - >>> [LP.tokenize(word) for word in text_words] - [['This'], ['is'], ['a'], ['won', 'der', 'ful'], ['sen', 'ten', 'ce'], ['.']] - """ - - def __init__( - self, tokenized_source_text, vowels="aeiouy", legal_frequency_threshold=0.001 - ): - """ - :param tokenized_source_text: List of valid tokens in the language - :type tokenized_source_text: list(str) - :param vowels: Valid vowels in language or IPA representation - :type vowels: str - :param legal_frequency_threshold: Lowest frequency of all onsets to be considered a legal onset - :type legal_frequency_threshold: float - """ - self.legal_frequency_threshold = legal_frequency_threshold - self.vowels = vowels - self.legal_onsets = self.find_legal_onsets(tokenized_source_text) - - def find_legal_onsets(self, words): - """ - Gathers all onsets and then return only those above the frequency threshold - - :param words: List of words in a language - :type words: list(str) - :return: Set of legal onsets - :rtype: set(str) - """ - onsets = [self.onset(word) for word in words] - legal_onsets = [ - k - for k, v in Counter(onsets).items() - if (v / len(onsets)) > self.legal_frequency_threshold - ] - return set(legal_onsets) - - def onset(self, word): - """ - Returns consonant cluster of word, i.e. all characters until the first vowel. 
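# A hedged, standalone sketch of the onset-frequency idea implemented by
# find_legal_onsets() above: take the consonant cluster before the first vowel
# of every word in a corpus, then keep only clusters whose relative frequency
# clears a threshold. The word list and the 0.25 threshold below are toy
# values; the class default threshold is 0.001 over a real word corpus.

from collections import Counter

VOWELS = "aeiouy"

def onset_of(word):
    cluster = ""
    for ch in word.lower():
        if ch in VOWELS:
            return cluster
        cluster += ch
    return cluster

words = ["street", "string", "strong", "mitten", "mad", "admit"]
onsets = [onset_of(w) for w in words]
counts = Counter(onsets)
legal = {o for o, c in counts.items() if c / len(onsets) > 0.25}
print(sorted(legal))   # ['m', 'str']
# 'dm' never occurs word-initially, so it can never become a legal onset;
# that is what forces the syllabification "ad-mit" rather than "a-dmit".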
- - :param word: Single word or token - :type word: str - :return: String of characters of onset - :rtype: str - """ - onset = "" - for c in word.lower(): - if c in self.vowels: - return onset - else: - onset += c - return onset - - def tokenize(self, token): - """ - Apply the Legality Principle in combination with - Onset Maximization to return a list of syllables. - - :param token: Single word or token - :type token: str - :return syllable_list: Single word or token broken up into syllables. - :rtype: list(str) - """ - syllables = [] - syllable, current_onset = "", "" - vowel, onset = False, False - for char in token[::-1]: - char_lower = char.lower() - if not vowel: - syllable += char - vowel = bool(char_lower in self.vowels) - else: - if char_lower + current_onset[::-1] in self.legal_onsets: - syllable += char - current_onset += char_lower - onset = True - elif char_lower in self.vowels and not onset: - syllable += char - current_onset += char_lower - else: - syllables.append(syllable) - syllable = char - current_onset = "" - vowel = bool(char_lower in self.vowels) - syllables.append(syllable) - syllables_ordered = [syllable[::-1] for syllable in syllables][::-1] - return syllables_ordered diff --git a/pipeline/nltk/tokenize/mwe.py b/pipeline/nltk/tokenize/mwe.py deleted file mode 100644 index c39244c7b1c7a9be96331548150c60ce9aaae8be..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tokenize/mwe.py +++ /dev/null @@ -1,124 +0,0 @@ -# Multi-Word Expression tokenizer -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Rob Malouf -# URL: -# For license information, see LICENSE.TXT - -""" -Multi-Word Expression Tokenizer - -A ``MWETokenizer`` takes a string which has already been divided into tokens and -retokenizes it, merging multi-word expressions into single tokens, using a lexicon -of MWEs: - - - >>> from nltk.tokenize import MWETokenizer - - >>> tokenizer = MWETokenizer([('a', 'little'), ('a', 'little', 'bit'), ('a', 'lot')]) - >>> tokenizer.add_mwe(('in', 'spite', 'of')) - - >>> tokenizer.tokenize('Testing testing testing one two three'.split()) - ['Testing', 'testing', 'testing', 'one', 'two', 'three'] - - >>> tokenizer.tokenize('This is a test in spite'.split()) - ['This', 'is', 'a', 'test', 'in', 'spite'] - - >>> tokenizer.tokenize('In a little or a little bit or a lot in spite of'.split()) - ['In', 'a_little', 'or', 'a_little_bit', 'or', 'a_lot', 'in_spite_of'] - -""" -from nltk.tokenize.api import TokenizerI -from nltk.util import Trie - - -class MWETokenizer(TokenizerI): - """A tokenizer that processes tokenized text and merges multi-word expressions - into single tokens. - """ - - def __init__(self, mwes=None, separator="_"): - """Initialize the multi-word tokenizer with a list of expressions and a - separator - - :type mwes: list(list(str)) - :param mwes: A sequence of multi-word expressions to be merged, where - each MWE is a sequence of strings. - :type separator: str - :param separator: String that should be inserted between words in a multi-word - expression token. (Default is '_') - - """ - if not mwes: - mwes = [] - self._mwes = Trie(mwes) - self._separator = separator - - def add_mwe(self, mwe): - """Add a multi-word expression to the lexicon (stored as a word trie) - - We use ``util.Trie`` to represent the trie. Its form is a dict of dicts. - The key True marks the end of a valid MWE. 
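# A hedged sketch of the longest-match merge that MWETokenizer.tokenize()
# (shown just below) performs over its word trie. Here the trie is a plain
# nested dict in which the key True marks the end of a registered expression,
# matching the shape described for add_mwe() above.

def build_trie(mwes):
    trie = {}
    for mwe in mwes:
        node = trie
        for word in mwe:
            node = node.setdefault(word, {})
        node[True] = None                     # end-of-expression marker
    return trie

def merge_mwes(tokens, trie, separator="_"):
    out, i = [], 0
    while i < len(tokens):
        node, j, last = trie, i, -1
        while j < len(tokens) and tokens[j] in node:
            node = node[tokens[j]]
            j += 1
            if True in node:
                last = j                      # remember the longest full match
        if last > -1:
            out.append(separator.join(tokens[i:last]))
            i = last
        else:
            out.append(tokens[i])
            i += 1
    return out

trie = build_trie([("a", "little"), ("a", "little", "bit"), ("in", "spite", "of")])
print(merge_mwes("a little bit more in spite of it".split(), trie))
# ['a_little_bit', 'more', 'in_spite_of', 'it']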
- - :param mwe: The multi-word expression we're adding into the word trie - :type mwe: tuple(str) or list(str) - - :Example: - - >>> tokenizer = MWETokenizer() - >>> tokenizer.add_mwe(('a', 'b')) - >>> tokenizer.add_mwe(('a', 'b', 'c')) - >>> tokenizer.add_mwe(('a', 'x')) - >>> expected = {'a': {'x': {True: None}, 'b': {True: None, 'c': {True: None}}}} - >>> tokenizer._mwes == expected - True - - """ - self._mwes.insert(mwe) - - def tokenize(self, text): - """ - - :param text: A list containing tokenized text - :type text: list(str) - :return: A list of the tokenized text with multi-words merged together - :rtype: list(str) - - :Example: - - >>> tokenizer = MWETokenizer([('hors', "d'oeuvre")], separator='+') - >>> tokenizer.tokenize("An hors d'oeuvre tonight, sir?".split()) - ['An', "hors+d'oeuvre", 'tonight,', 'sir?'] - - """ - i = 0 - n = len(text) - result = [] - - while i < n: - if text[i] in self._mwes: - # possible MWE match - j = i - trie = self._mwes - last_match = -1 - while j < n and text[j] in trie: # and len(trie[text[j]]) > 0 : - trie = trie[text[j]] - j = j + 1 - if Trie.LEAF in trie: - last_match = j - else: - if last_match > -1: - j = last_match - - if Trie.LEAF in trie or last_match > -1: - # success! - result.append(self._separator.join(text[i:j])) - i = j - else: - # no match, so backtrack - result.append(text[i]) - i += 1 - else: - result.append(text[i]) - i += 1 - return result diff --git a/pipeline/nltk/tokenize/nist.py b/pipeline/nltk/tokenize/nist.py deleted file mode 100644 index b9e13dad28b81d91891a838d89bcdf5a0c1ad086..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tokenize/nist.py +++ /dev/null @@ -1,179 +0,0 @@ -# Natural Language Toolkit: Python port of the mteval-v14.pl tokenizer. -# -# Copyright (C) 2001-2015 NLTK Project -# Author: Liling Tan (ported from ftp://jaguar.ncsl.nist.gov/mt/resources/mteval-v14.pl) -# Contributors: Ozan Caglayan, Wiktor Stribizew -# -# URL: -# For license information, see LICENSE.TXT - -""" -This is a NLTK port of the tokenizer used in the NIST BLEU evaluation script, -https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L926 -which was also ported into Python in -https://github.com/lium-lst/nmtpy/blob/master/nmtpy/metrics/mtevalbleu.py#L162 -""" - - -import io -import re - -from nltk.corpus import perluniprops -from nltk.tokenize.api import TokenizerI -from nltk.tokenize.util import xml_unescape - - -class NISTTokenizer(TokenizerI): - """ - This NIST tokenizer is sentence-based instead of the original - paragraph-based tokenization from mteval-14.pl; The sentence-based - tokenization is consistent with the other tokenizers available in NLTK. - - >>> from nltk.tokenize.nist import NISTTokenizer - >>> nist = NISTTokenizer() - >>> s = "Good muffins cost $3.88 in New York." - >>> expected_lower = [u'good', u'muffins', u'cost', u'$', u'3.88', u'in', u'new', u'york', u'.'] - >>> expected_cased = [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York', u'.'] - >>> nist.tokenize(s, lowercase=False) == expected_cased - True - >>> nist.tokenize(s, lowercase=True) == expected_lower # Lowercased. - True - - The international_tokenize() is the preferred function when tokenizing - non-european text, e.g. - - >>> from nltk.tokenize.nist import NISTTokenizer - >>> nist = NISTTokenizer() - - # Input strings. - >>> albb = u'Alibaba Group Holding Limited (Chinese: 阿里巴巴集团控股 有限公司) us a Chinese e-commerce company...' - >>> amz = u'Amazon.com, Inc. 
(/ˈæməzɒn/) is an American electronic commerce...' - >>> rkt = u'Rakuten, Inc. (楽天株式会社 Rakuten Kabushiki-gaisha) is a Japanese electronic commerce and Internet company based in Tokyo.' - - # Expected tokens. - >>> expected_albb = [u'Alibaba', u'Group', u'Holding', u'Limited', u'(', u'Chinese', u':', u'\u963f\u91cc\u5df4\u5df4\u96c6\u56e2\u63a7\u80a1', u'\u6709\u9650\u516c\u53f8', u')'] - >>> expected_amz = [u'Amazon', u'.', u'com', u',', u'Inc', u'.', u'(', u'/', u'\u02c8\xe6', u'm'] - >>> expected_rkt = [u'Rakuten', u',', u'Inc', u'.', u'(', u'\u697d\u5929\u682a\u5f0f\u4f1a\u793e', u'Rakuten', u'Kabushiki', u'-', u'gaisha'] - - >>> nist.international_tokenize(albb)[:10] == expected_albb - True - >>> nist.international_tokenize(amz)[:10] == expected_amz - True - >>> nist.international_tokenize(rkt)[:10] == expected_rkt - True - - # Doctest for patching issue #1926 - >>> sent = u'this is a foo\u2604sentence.' - >>> expected_sent = [u'this', u'is', u'a', u'foo', u'\u2604', u'sentence', u'.'] - >>> nist.international_tokenize(sent) == expected_sent - True - """ - - # Strip "skipped" tags - STRIP_SKIP = re.compile(""), "" - # Strip end-of-line hyphenation and join lines - STRIP_EOL_HYPHEN = re.compile("\u2028"), " " - # Tokenize punctuation. - PUNCT = re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), " \\1 " - # Tokenize period and comma unless preceded by a digit. - PERIOD_COMMA_PRECEED = re.compile(r"([^0-9])([\.,])"), "\\1 \\2 " - # Tokenize period and comma unless followed by a digit. - PERIOD_COMMA_FOLLOW = re.compile(r"([\.,])([^0-9])"), " \\1 \\2" - # Tokenize dash when preceded by a digit - DASH_PRECEED_DIGIT = re.compile("([0-9])(-)"), "\\1 \\2 " - - LANG_DEPENDENT_REGEXES = [ - PUNCT, - PERIOD_COMMA_PRECEED, - PERIOD_COMMA_FOLLOW, - DASH_PRECEED_DIGIT, - ] - - # Perluniprops characters used in NIST tokenizer. - pup_number = str("".join(set(perluniprops.chars("Number")))) # i.e. \p{N} - pup_punct = str("".join(set(perluniprops.chars("Punctuation")))) # i.e. \p{P} - pup_symbol = str("".join(set(perluniprops.chars("Symbol")))) # i.e. \p{S} - - # Python regexes needs to escape some special symbols, see - # see https://stackoverflow.com/q/45670950/610569 - number_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_number) - punct_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_punct) - symbol_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_symbol) - - # Note: In the original perl implementation, \p{Z} and \p{Zl} were used to - # (i) strip trailing and heading spaces and - # (ii) de-deuplicate spaces. - # In Python, this would do: ' '.join(str.strip().split()) - # Thus, the next two lines were commented out. - # Line_Separator = str(''.join(perluniprops.chars('Line_Separator'))) # i.e. \p{Zl} - # Separator = str(''.join(perluniprops.chars('Separator'))) # i.e. \p{Z} - - # Pads non-ascii strings with space. - NONASCII = re.compile("([\x00-\x7f]+)"), r" \1 " - # Tokenize any punctuation unless followed AND preceded by a digit. - PUNCT_1 = ( - re.compile(f"([{number_regex}])([{punct_regex}])"), - "\\1 \\2 ", - ) - PUNCT_2 = ( - re.compile(f"([{punct_regex}])([{number_regex}])"), - " \\1 \\2", - ) - # Tokenize symbols - SYMBOLS = re.compile(f"([{symbol_regex}])"), " \\1 " - - INTERNATIONAL_REGEXES = [NONASCII, PUNCT_1, PUNCT_2, SYMBOLS] - - def lang_independent_sub(self, text): - """Performs the language independent string substituitions.""" - # It's a strange order of regexes. - # It'll be better to unescape after STRIP_EOL_HYPHEN - # but let's keep it close to the original NIST implementation. 
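# Hedged illustration, using just the two period/comma rules defined above:
# a period or comma is only detached when it is not both preceded and followed
# by a digit, which keeps "3.88" intact while splitting the final period off
# "York". (The full LANG_DEPENDENT_REGEXES pass would also detach "$", as in
# the doctest above; only these two rules are applied here.)

import re

PERIOD_COMMA_PRECEED = re.compile(r"([^0-9])([\.,])"), "\\1 \\2 "
PERIOD_COMMA_FOLLOW = re.compile(r"([\.,])([^0-9])"), " \\1 \\2"

text = " Good muffins cost $3.88 in New York. "
for regexp, substitution in (PERIOD_COMMA_PRECEED, PERIOD_COMMA_FOLLOW):
    text = regexp.sub(substitution, text)
print(text.split())
# ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.']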
- regexp, substitution = self.STRIP_SKIP - text = regexp.sub(substitution, text) - text = xml_unescape(text) - regexp, substitution = self.STRIP_EOL_HYPHEN - text = regexp.sub(substitution, text) - return text - - def tokenize(self, text, lowercase=False, western_lang=True, return_str=False): - text = str(text) - # Language independent regex. - text = self.lang_independent_sub(text) - # Language dependent regex. - if western_lang: - # Pad string with whitespace. - text = " " + text + " " - if lowercase: - text = text.lower() - for regexp, substitution in self.LANG_DEPENDENT_REGEXES: - text = regexp.sub(substitution, text) - # Remove contiguous whitespaces. - text = " ".join(text.split()) - # Finally, strips heading and trailing spaces - # and converts output string into unicode. - text = str(text.strip()) - return text if return_str else text.split() - - def international_tokenize( - self, text, lowercase=False, split_non_ascii=True, return_str=False - ): - text = str(text) - # Different from the 'normal' tokenize(), STRIP_EOL_HYPHEN is applied - # first before unescaping. - regexp, substitution = self.STRIP_SKIP - text = regexp.sub(substitution, text) - regexp, substitution = self.STRIP_EOL_HYPHEN - text = regexp.sub(substitution, text) - text = xml_unescape(text) - - if lowercase: - text = text.lower() - - for regexp, substitution in self.INTERNATIONAL_REGEXES: - text = regexp.sub(substitution, text) - - # Make sure that there's only one space only between words. - # Strip leading and trailing spaces. - text = " ".join(text.strip().split()) - return text if return_str else text.split() diff --git a/pipeline/nltk/tokenize/punkt.py b/pipeline/nltk/tokenize/punkt.py deleted file mode 100644 index 129bd49c270c301d97a44eec5e58d7e19f15cabe..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tokenize/punkt.py +++ /dev/null @@ -1,1767 +0,0 @@ -# Natural Language Toolkit: Punkt sentence tokenizer -# -# Copyright (C) 2001-2023 NLTK Project -# Algorithm: Kiss & Strunk (2006) -# Author: Willy (original Python port) -# Steven Bird (additions) -# Edward Loper (rewrite) -# Joel Nothman (almost rewrite) -# Arthur Darcet (fixes) -# Tom Aarsen <> (tackle ReDoS & performance issues) -# URL: -# For license information, see LICENSE.TXT - -r""" -Punkt Sentence Tokenizer - -This tokenizer divides a text into a list of sentences -by using an unsupervised algorithm to build a model for abbreviation -words, collocations, and words that start sentences. It must be -trained on a large collection of plaintext in the target language -before it can be used. - -The NLTK data package includes a pre-trained Punkt tokenizer for -English. - - >>> import nltk.data - >>> text = ''' - ... Punkt knows that the periods in Mr. Smith and Johann S. Bach - ... do not mark sentence boundaries. And sometimes sentences - ... can start with non-capitalized words. i is a good variable - ... name. - ... ''' - >>> sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') - >>> print('\n-----\n'.join(sent_detector.tokenize(text.strip()))) - Punkt knows that the periods in Mr. Smith and Johann S. Bach - do not mark sentence boundaries. - ----- - And sometimes sentences - can start with non-capitalized words. - ----- - i is a good variable - name. - -(Note that whitespace from the original text, including newlines, is -retained in the output.) - -Punctuation following sentences is also included by default -(from NLTK 3.0 onwards). It can be excluded with the realign_boundaries -flag. - - >>> text = ''' - ... 
(How does it deal with this parenthesis?) "It should be part of the - ... previous sentence." "(And the same with this one.)" ('And this one!') - ... "('(And (this)) '?)" [(and this. )] - ... ''' - >>> print('\n-----\n'.join( - ... sent_detector.tokenize(text.strip()))) - (How does it deal with this parenthesis?) - ----- - "It should be part of the - previous sentence." - ----- - "(And the same with this one.)" - ----- - ('And this one!') - ----- - "('(And (this)) '?)" - ----- - [(and this. )] - >>> print('\n-----\n'.join( - ... sent_detector.tokenize(text.strip(), realign_boundaries=False))) - (How does it deal with this parenthesis? - ----- - ) "It should be part of the - previous sentence. - ----- - " "(And the same with this one. - ----- - )" ('And this one! - ----- - ') - "('(And (this)) '? - ----- - )" [(and this. - ----- - )] - -However, Punkt is designed to learn parameters (a list of abbreviations, etc.) -unsupervised from a corpus similar to the target domain. The pre-packaged models -may therefore be unsuitable: use ``PunktSentenceTokenizer(text)`` to learn -parameters from the given text. - -:class:`.PunktTrainer` learns parameters such as a list of abbreviations -(without supervision) from portions of text. Using a ``PunktTrainer`` directly -allows for incremental training and modification of the hyper-parameters used -to decide what is considered an abbreviation, etc. - -The algorithm for this tokenizer is described in:: - - Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence - Boundary Detection. Computational Linguistics 32: 485-525. -""" - -# TODO: Make orthographic heuristic less susceptible to overtraining -# TODO: Frequent sentence starters optionally exclude always-capitalised words -# FIXME: Problem with ending string with e.g. '!!!' -> '!! !' - -import math -import re -import string -from collections import defaultdict -from typing import Any, Dict, Iterator, List, Match, Optional, Tuple, Union - -from nltk.probability import FreqDist -from nltk.tokenize.api import TokenizerI - -###################################################################### -# { Orthographic Context Constants -###################################################################### -# The following constants are used to describe the orthographic -# contexts in which a word can occur. BEG=beginning, MID=middle, -# UNK=unknown, UC=uppercase, LC=lowercase, NC=no case. 
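# A hedged sketch of how these orthographic-context flags are combined further
# down (PunktParameters.add_ortho_context and the orthographic heuristics):
# each word type accumulates a bit mask of the contexts it has been seen in,
# and later decisions test individual bits of that mask.

from collections import defaultdict

BEG_UC, MID_UC, UNK_UC = 1 << 1, 1 << 2, 1 << 3
BEG_LC, MID_LC, UNK_LC = 1 << 4, 1 << 5, 1 << 6

ortho_context = defaultdict(int)
ortho_context["bank"] |= MID_LC    # "bank" seen lower-case, sentence-internally
ortho_context["bank"] |= BEG_UC    # "Bank" seen capitalised, sentence-initially

mask = ortho_context["bank"]
print(bool(mask & BEG_UC), bool(mask & MID_UC))   # True False
# A type that is capitalised sentence-initially but never sentence-internally
# is orthographic evidence that a preceding period was a real sentence break.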
- -_ORTHO_BEG_UC = 1 << 1 -"""Orthographic context: beginning of a sentence with upper case.""" - -_ORTHO_MID_UC = 1 << 2 -"""Orthographic context: middle of a sentence with upper case.""" - -_ORTHO_UNK_UC = 1 << 3 -"""Orthographic context: unknown position in a sentence with upper case.""" - -_ORTHO_BEG_LC = 1 << 4 -"""Orthographic context: beginning of a sentence with lower case.""" - -_ORTHO_MID_LC = 1 << 5 -"""Orthographic context: middle of a sentence with lower case.""" - -_ORTHO_UNK_LC = 1 << 6 -"""Orthographic context: unknown position in a sentence with lower case.""" - -_ORTHO_UC = _ORTHO_BEG_UC + _ORTHO_MID_UC + _ORTHO_UNK_UC -"""Orthographic context: occurs with upper case.""" - -_ORTHO_LC = _ORTHO_BEG_LC + _ORTHO_MID_LC + _ORTHO_UNK_LC -"""Orthographic context: occurs with lower case.""" - -_ORTHO_MAP = { - ("initial", "upper"): _ORTHO_BEG_UC, - ("internal", "upper"): _ORTHO_MID_UC, - ("unknown", "upper"): _ORTHO_UNK_UC, - ("initial", "lower"): _ORTHO_BEG_LC, - ("internal", "lower"): _ORTHO_MID_LC, - ("unknown", "lower"): _ORTHO_UNK_LC, -} -"""A map from context position and first-letter case to the -appropriate orthographic context flag.""" - -# } (end orthographic context constants) -###################################################################### - -###################################################################### -# { Decision reasons for debugging -###################################################################### - -REASON_DEFAULT_DECISION = "default decision" -REASON_KNOWN_COLLOCATION = "known collocation (both words)" -REASON_ABBR_WITH_ORTHOGRAPHIC_HEURISTIC = "abbreviation + orthographic heuristic" -REASON_ABBR_WITH_SENTENCE_STARTER = "abbreviation + frequent sentence starter" -REASON_INITIAL_WITH_ORTHOGRAPHIC_HEURISTIC = "initial + orthographic heuristic" -REASON_NUMBER_WITH_ORTHOGRAPHIC_HEURISTIC = "initial + orthographic heuristic" -REASON_INITIAL_WITH_SPECIAL_ORTHOGRAPHIC_HEURISTIC = ( - "initial + special orthographic heuristic" -) - - -# } (end decision reasons for debugging) -###################################################################### - -###################################################################### -# { Language-dependent variables -###################################################################### - - -class PunktLanguageVars: - """ - Stores variables, mostly regular expressions, which may be - language-dependent for correct application of the algorithm. - An extension of this class may modify its properties to suit - a language other than English; an instance can then be passed - as an argument to PunktSentenceTokenizer and PunktTrainer - constructors. - """ - - __slots__ = ("_re_period_context", "_re_word_tokenizer") - - def __getstate__(self): - # All modifications to the class are performed by inheritance. - # Non-default parameters to be pickled must be defined in the inherited - # class. - return 1 - - def __setstate__(self, state): - return 1 - - sent_end_chars = (".", "?", "!") - """Characters which are candidates for sentence boundaries""" - - @property - def _re_sent_end_chars(self): - return "[%s]" % re.escape("".join(self.sent_end_chars)) - - internal_punctuation = ",:;" # might want to extend this.. 
- """sentence internal punctuation, which indicates an abbreviation if - preceded by a period-final token.""" - - re_boundary_realignment = re.compile(r'["\')\]}]+?(?:\s+|(?=--)|$)', re.MULTILINE) - """Used to realign punctuation that should be included in a sentence - although it follows the period (or ?, !).""" - - _re_word_start = r"[^\(\"\`{\[:;&\#\*@\)}\]\-,]" - """Excludes some characters from starting word tokens""" - - @property - def _re_non_word_chars(self): - return r"(?:[)\";}\]\*:@\'\({\[%s])" % re.escape( - "".join(set(self.sent_end_chars) - {"."}) - ) - - """Characters that cannot appear within words""" - - _re_multi_char_punct = r"(?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)" - """Hyphen and ellipsis are multi-character punctuation""" - - _word_tokenize_fmt = r"""( - %(MultiChar)s - | - (?=%(WordStart)s)\S+? # Accept word characters until end is found - (?= # Sequences marking a word's end - \s| # White-space - $| # End-of-string - %(NonWord)s|%(MultiChar)s| # Punctuation - ,(?=$|\s|%(NonWord)s|%(MultiChar)s) # Comma if at end of word - ) - | - \S - )""" - """Format of a regular expression to split punctuation from words, - excluding period.""" - - def _word_tokenizer_re(self): - """Compiles and returns a regular expression for word tokenization""" - try: - return self._re_word_tokenizer - except AttributeError: - self._re_word_tokenizer = re.compile( - self._word_tokenize_fmt - % { - "NonWord": self._re_non_word_chars, - "MultiChar": self._re_multi_char_punct, - "WordStart": self._re_word_start, - }, - re.UNICODE | re.VERBOSE, - ) - return self._re_word_tokenizer - - def word_tokenize(self, s): - """Tokenize a string to split off punctuation other than periods""" - return self._word_tokenizer_re().findall(s) - - _period_context_fmt = r""" - %(SentEndChars)s # a potential sentence ending - (?=(?P - %(NonWord)s # either other punctuation - | - \s+(?P\S+) # or whitespace and some other token - ))""" - """Format of a regular expression to find contexts including possible - sentence boundaries. Matches token which the possible sentence boundary - ends, and matches the following token within a lookahead expression.""" - - def period_context_re(self): - """Compiles and returns a regular expression to find contexts - including possible sentence boundaries.""" - try: - return self._re_period_context - except: - self._re_period_context = re.compile( - self._period_context_fmt - % { - "NonWord": self._re_non_word_chars, - "SentEndChars": self._re_sent_end_chars, - }, - re.UNICODE | re.VERBOSE, - ) - return self._re_period_context - - -_re_non_punct = re.compile(r"[^\W\d]", re.UNICODE) -"""Matches token types that are not merely punctuation. (Types for -numeric tokens are changed to ##number## and hence contain alpha.)""" - - -# } -###################################################################### - - -# //////////////////////////////////////////////////////////// -# { Helper Functions -# //////////////////////////////////////////////////////////// - - -def _pair_iter(iterator): - """ - Yields pairs of tokens from the given iterator such that each input - token will appear as the first element in a yielded tuple. The last - pair will have None as its second element. 
- """ - iterator = iter(iterator) - try: - prev = next(iterator) - except StopIteration: - return - for el in iterator: - yield (prev, el) - prev = el - yield (prev, None) - - -###################################################################### -# { Punkt Parameters -###################################################################### - - -class PunktParameters: - """Stores data used to perform sentence boundary detection with Punkt.""" - - def __init__(self): - self.abbrev_types = set() - """A set of word types for known abbreviations.""" - - self.collocations = set() - """A set of word type tuples for known common collocations - where the first word ends in a period. E.g., ('S.', 'Bach') - is a common collocation in a text that discusses 'Johann - S. Bach'. These count as negative evidence for sentence - boundaries.""" - - self.sent_starters = set() - """A set of word types for words that often appear at the - beginning of sentences.""" - - self.ortho_context = defaultdict(int) - """A dictionary mapping word types to the set of orthographic - contexts that word type appears in. Contexts are represented - by adding orthographic context flags: ...""" - - def clear_abbrevs(self): - self.abbrev_types = set() - - def clear_collocations(self): - self.collocations = set() - - def clear_sent_starters(self): - self.sent_starters = set() - - def clear_ortho_context(self): - self.ortho_context = defaultdict(int) - - def add_ortho_context(self, typ, flag): - self.ortho_context[typ] |= flag - - def _debug_ortho_context(self, typ): - context = self.ortho_context[typ] - if context & _ORTHO_BEG_UC: - yield "BEG-UC" - if context & _ORTHO_MID_UC: - yield "MID-UC" - if context & _ORTHO_UNK_UC: - yield "UNK-UC" - if context & _ORTHO_BEG_LC: - yield "BEG-LC" - if context & _ORTHO_MID_LC: - yield "MID-LC" - if context & _ORTHO_UNK_LC: - yield "UNK-LC" - - -###################################################################### -# { PunktToken -###################################################################### - - -class PunktToken: - """Stores a token of text with annotations produced during - sentence boundary detection.""" - - _properties = ["parastart", "linestart", "sentbreak", "abbr", "ellipsis"] - __slots__ = ["tok", "type", "period_final"] + _properties - - def __init__(self, tok, **params): - self.tok = tok - self.type = self._get_type(tok) - self.period_final = tok.endswith(".") - - for prop in self._properties: - setattr(self, prop, None) - for k in params: - setattr(self, k, params[k]) - - # //////////////////////////////////////////////////////////// - # { Regular expressions for properties - # //////////////////////////////////////////////////////////// - # Note: [A-Za-z] is approximated by [^\W\d] in the general case. - _RE_ELLIPSIS = re.compile(r"\.\.+$") - _RE_NUMERIC = re.compile(r"^-?[\.,]?\d[\d,\.-]*\.?$") - _RE_INITIAL = re.compile(r"[^\W\d]\.$", re.UNICODE) - _RE_ALPHA = re.compile(r"[^\W\d]+$", re.UNICODE) - - # //////////////////////////////////////////////////////////// - # { Derived properties - # //////////////////////////////////////////////////////////// - - def _get_type(self, tok): - """Returns a case-normalized representation of the token.""" - return self._RE_NUMERIC.sub("##number##", tok.lower()) - - @property - def type_no_period(self): - """ - The type with its final period removed if it has one. 
- """ - if len(self.type) > 1 and self.type[-1] == ".": - return self.type[:-1] - return self.type - - @property - def type_no_sentperiod(self): - """ - The type with its final period removed if it is marked as a - sentence break. - """ - if self.sentbreak: - return self.type_no_period - return self.type - - @property - def first_upper(self): - """True if the token's first character is uppercase.""" - return self.tok[0].isupper() - - @property - def first_lower(self): - """True if the token's first character is lowercase.""" - return self.tok[0].islower() - - @property - def first_case(self): - if self.first_lower: - return "lower" - if self.first_upper: - return "upper" - return "none" - - @property - def is_ellipsis(self): - """True if the token text is that of an ellipsis.""" - return self._RE_ELLIPSIS.match(self.tok) - - @property - def is_number(self): - """True if the token text is that of a number.""" - return self.type.startswith("##number##") - - @property - def is_initial(self): - """True if the token text is that of an initial.""" - return self._RE_INITIAL.match(self.tok) - - @property - def is_alpha(self): - """True if the token text is all alphabetic.""" - return self._RE_ALPHA.match(self.tok) - - @property - def is_non_punct(self): - """True if the token is either a number or is alphabetic.""" - return _re_non_punct.search(self.type) - - # //////////////////////////////////////////////////////////// - # { String representation - # //////////////////////////////////////////////////////////// - - def __repr__(self): - """ - A string representation of the token that can reproduce it - with eval(), which lists all the token's non-default - annotations. - """ - typestr = " type=%s," % repr(self.type) if self.type != self.tok else "" - - propvals = ", ".join( - f"{p}={repr(getattr(self, p))}" - for p in self._properties - if getattr(self, p) - ) - - return "{}({},{} {})".format( - self.__class__.__name__, - repr(self.tok), - typestr, - propvals, - ) - - def __str__(self): - """ - A string representation akin to that used by Kiss and Strunk. - """ - res = self.tok - if self.abbr: - res += "" - if self.ellipsis: - res += "" - if self.sentbreak: - res += "" - return res - - -###################################################################### -# { Punkt base class -###################################################################### - - -class PunktBaseClass: - """ - Includes common components of PunktTrainer and PunktSentenceTokenizer. - """ - - def __init__(self, lang_vars=None, token_cls=PunktToken, params=None): - if lang_vars is None: - lang_vars = PunktLanguageVars() - if params is None: - params = PunktParameters() - self._params = params - self._lang_vars = lang_vars - self._Token = token_cls - """The collection of parameters that determines the behavior - of the punkt tokenizer.""" - - # //////////////////////////////////////////////////////////// - # { Word tokenization - # //////////////////////////////////////////////////////////// - - def _tokenize_words(self, plaintext): - """ - Divide the given text into tokens, using the punkt word - segmentation regular expression, and generate the resulting list - of tokens augmented as three-tuples with two boolean values for whether - the given token occurs at the start of a paragraph or a new line, - respectively. 
- """ - parastart = False - for line in plaintext.split("\n"): - if line.strip(): - line_toks = iter(self._lang_vars.word_tokenize(line)) - - try: - tok = next(line_toks) - except StopIteration: - continue - - yield self._Token(tok, parastart=parastart, linestart=True) - parastart = False - - for tok in line_toks: - yield self._Token(tok) - else: - parastart = True - - # //////////////////////////////////////////////////////////// - # { Annotation Procedures - # //////////////////////////////////////////////////////////// - - def _annotate_first_pass( - self, tokens: Iterator[PunktToken] - ) -> Iterator[PunktToken]: - """ - Perform the first pass of annotation, which makes decisions - based purely based on the word type of each word: - - - '?', '!', and '.' are marked as sentence breaks. - - sequences of two or more periods are marked as ellipsis. - - any word ending in '.' that's a known abbreviation is - marked as an abbreviation. - - any other word ending in '.' is marked as a sentence break. - - Return these annotations as a tuple of three sets: - - - sentbreak_toks: The indices of all sentence breaks. - - abbrev_toks: The indices of all abbreviations. - - ellipsis_toks: The indices of all ellipsis marks. - """ - for aug_tok in tokens: - self._first_pass_annotation(aug_tok) - yield aug_tok - - def _first_pass_annotation(self, aug_tok: PunktToken) -> None: - """ - Performs type-based annotation on a single token. - """ - - tok = aug_tok.tok - - if tok in self._lang_vars.sent_end_chars: - aug_tok.sentbreak = True - elif aug_tok.is_ellipsis: - aug_tok.ellipsis = True - elif aug_tok.period_final and not tok.endswith(".."): - if ( - tok[:-1].lower() in self._params.abbrev_types - or tok[:-1].lower().split("-")[-1] in self._params.abbrev_types - ): - - aug_tok.abbr = True - else: - aug_tok.sentbreak = True - - return - - -###################################################################### -# { Punkt Trainer -###################################################################### - - -class PunktTrainer(PunktBaseClass): - """Learns parameters used in Punkt sentence boundary detection.""" - - def __init__( - self, train_text=None, verbose=False, lang_vars=None, token_cls=PunktToken - ): - - PunktBaseClass.__init__(self, lang_vars=lang_vars, token_cls=token_cls) - - self._type_fdist = FreqDist() - """A frequency distribution giving the frequency of each - case-normalized token type in the training data.""" - - self._num_period_toks = 0 - """The number of words ending in period in the training data.""" - - self._collocation_fdist = FreqDist() - """A frequency distribution giving the frequency of all - bigrams in the training data where the first word ends in a - period. Bigrams are encoded as tuples of word types. - Especially common collocations are extracted from this - frequency distribution, and stored in - ``_params``.``collocations ``.""" - - self._sent_starter_fdist = FreqDist() - """A frequency distribution giving the frequency of all words - that occur at the training data at the beginning of a sentence - (after the first pass of annotation). Especially common - sentence starters are extracted from this frequency - distribution, and stored in ``_params.sent_starters``. 
- """ - - self._sentbreak_count = 0 - """The total number of sentence breaks identified in training, used for - calculating the frequent sentence starter heuristic.""" - - self._finalized = True - """A flag as to whether the training has been finalized by finding - collocations and sentence starters, or whether finalize_training() - still needs to be called.""" - - if train_text: - self.train(train_text, verbose, finalize=True) - - def get_params(self): - """ - Calculates and returns parameters for sentence boundary detection as - derived from training.""" - if not self._finalized: - self.finalize_training() - return self._params - - # //////////////////////////////////////////////////////////// - # { Customization Variables - # //////////////////////////////////////////////////////////// - - ABBREV = 0.3 - """cut-off value whether a 'token' is an abbreviation""" - - IGNORE_ABBREV_PENALTY = False - """allows the disabling of the abbreviation penalty heuristic, which - exponentially disadvantages words that are found at times without a - final period.""" - - ABBREV_BACKOFF = 5 - """upper cut-off for Mikheev's(2002) abbreviation detection algorithm""" - - COLLOCATION = 7.88 - """minimal log-likelihood value that two tokens need to be considered - as a collocation""" - - SENT_STARTER = 30 - """minimal log-likelihood value that a token requires to be considered - as a frequent sentence starter""" - - INCLUDE_ALL_COLLOCS = False - """this includes as potential collocations all word pairs where the first - word ends in a period. It may be useful in corpora where there is a lot - of variation that makes abbreviations like Mr difficult to identify.""" - - INCLUDE_ABBREV_COLLOCS = False - """this includes as potential collocations all word pairs where the first - word is an abbreviation. Such collocations override the orthographic - heuristic, but not the sentence starter heuristic. This is overridden by - INCLUDE_ALL_COLLOCS, and if both are false, only collocations with initials - and ordinals are considered.""" - """""" - - MIN_COLLOC_FREQ = 1 - """this sets a minimum bound on the number of times a bigram needs to - appear before it can be considered a collocation, in addition to log - likelihood statistics. This is useful when INCLUDE_ALL_COLLOCS is True.""" - - # //////////////////////////////////////////////////////////// - # { Training.. - # //////////////////////////////////////////////////////////// - - def train(self, text, verbose=False, finalize=True): - """ - Collects training data from a given text. If finalize is True, it - will determine all the parameters for sentence boundary detection. If - not, this will be delayed until get_params() or finalize_training() is - called. If verbose is True, abbreviations found will be listed. - """ - # Break the text into tokens; record which token indices correspond to - # line starts and paragraph starts; and determine their types. - self._train_tokens(self._tokenize_words(text), verbose) - if finalize: - self.finalize_training(verbose) - - def train_tokens(self, tokens, verbose=False, finalize=True): - """ - Collects training data from a given list of tokens. - """ - self._train_tokens((self._Token(t) for t in tokens), verbose) - if finalize: - self.finalize_training(verbose) - - def _train_tokens(self, tokens, verbose): - self._finalized = False - - # Ensure tokens are a list - tokens = list(tokens) - - # Find the frequency of each case-normalized type. (Don't - # strip off final periods.) 
Also keep track of the number of - # tokens that end in periods. - for aug_tok in tokens: - self._type_fdist[aug_tok.type] += 1 - if aug_tok.period_final: - self._num_period_toks += 1 - - # Look for new abbreviations, and for types that no longer are - unique_types = self._unique_types(tokens) - for abbr, score, is_add in self._reclassify_abbrev_types(unique_types): - if score >= self.ABBREV: - if is_add: - self._params.abbrev_types.add(abbr) - if verbose: - print(f" Abbreviation: [{score:6.4f}] {abbr}") - else: - if not is_add: - self._params.abbrev_types.remove(abbr) - if verbose: - print(f" Removed abbreviation: [{score:6.4f}] {abbr}") - - # Make a preliminary pass through the document, marking likely - # sentence breaks, abbreviations, and ellipsis tokens. - tokens = list(self._annotate_first_pass(tokens)) - - # Check what contexts each word type can appear in, given the - # case of its first letter. - self._get_orthography_data(tokens) - - # We need total number of sentence breaks to find sentence starters - self._sentbreak_count += self._get_sentbreak_count(tokens) - - # The remaining heuristics relate to pairs of tokens where the first - # ends in a period. - for aug_tok1, aug_tok2 in _pair_iter(tokens): - if not aug_tok1.period_final or not aug_tok2: - continue - - # Is the first token a rare abbreviation? - if self._is_rare_abbrev_type(aug_tok1, aug_tok2): - self._params.abbrev_types.add(aug_tok1.type_no_period) - if verbose: - print(" Rare Abbrev: %s" % aug_tok1.type) - - # Does second token have a high likelihood of starting a sentence? - if self._is_potential_sent_starter(aug_tok2, aug_tok1): - self._sent_starter_fdist[aug_tok2.type] += 1 - - # Is this bigram a potential collocation? - if self._is_potential_collocation(aug_tok1, aug_tok2): - self._collocation_fdist[ - (aug_tok1.type_no_period, aug_tok2.type_no_sentperiod) - ] += 1 - - def _unique_types(self, tokens): - return {aug_tok.type for aug_tok in tokens} - - def finalize_training(self, verbose=False): - """ - Uses data that has been gathered in training to determine likely - collocations and sentence starters. - """ - self._params.clear_sent_starters() - for typ, log_likelihood in self._find_sent_starters(): - self._params.sent_starters.add(typ) - if verbose: - print(f" Sent Starter: [{log_likelihood:6.4f}] {typ!r}") - - self._params.clear_collocations() - for (typ1, typ2), log_likelihood in self._find_collocations(): - self._params.collocations.add((typ1, typ2)) - if verbose: - print(f" Collocation: [{log_likelihood:6.4f}] {typ1!r}+{typ2!r}") - - self._finalized = True - - # //////////////////////////////////////////////////////////// - # { Overhead reduction - # //////////////////////////////////////////////////////////// - - def freq_threshold( - self, ortho_thresh=2, type_thresh=2, colloc_thres=2, sentstart_thresh=2 - ): - """ - Allows memory use to be reduced after much training by removing data - about rare tokens that are unlikely to have a statistical effect with - further training. Entries occurring above the given thresholds will be - retained. 
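# Hedged usage sketch of the pruning described above; the two training strings
# here are tiny stand-ins for the large corpora this is intended for.

from nltk.tokenize.punkt import PunktTrainer

part_one = "Dr. Smith arrived early. He waited in the hall."
part_two = "The meeting began at noon. Everyone attended."

trainer = PunktTrainer()
trainer.train(part_one, finalize=False)
trainer.freq_threshold()            # drop statistics about rarely-seen types
trainer.train(part_two, finalize=False)
params = trainer.get_params()       # finalizes collocations / sentence starters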
- """ - if ortho_thresh > 1: - old_oc = self._params.ortho_context - self._params.clear_ortho_context() - for tok in self._type_fdist: - count = self._type_fdist[tok] - if count >= ortho_thresh: - self._params.ortho_context[tok] = old_oc[tok] - - self._type_fdist = self._freq_threshold(self._type_fdist, type_thresh) - self._collocation_fdist = self._freq_threshold( - self._collocation_fdist, colloc_thres - ) - self._sent_starter_fdist = self._freq_threshold( - self._sent_starter_fdist, sentstart_thresh - ) - - def _freq_threshold(self, fdist, threshold): - """ - Returns a FreqDist containing only data with counts below a given - threshold, as well as a mapping (None -> count_removed). - """ - # We assume that there is more data below the threshold than above it - # and so create a new FreqDist rather than working in place. - res = FreqDist() - num_removed = 0 - for tok in fdist: - count = fdist[tok] - if count < threshold: - num_removed += 1 - else: - res[tok] += count - res[None] += num_removed - return res - - # //////////////////////////////////////////////////////////// - # { Orthographic data - # //////////////////////////////////////////////////////////// - - def _get_orthography_data(self, tokens): - """ - Collect information about whether each token type occurs - with different case patterns (i) overall, (ii) at - sentence-initial positions, and (iii) at sentence-internal - positions. - """ - # 'initial' or 'internal' or 'unknown' - context = "internal" - tokens = list(tokens) - - for aug_tok in tokens: - # If we encounter a paragraph break, then it's a good sign - # that it's a sentence break. But err on the side of - # caution (by not positing a sentence break) if we just - # saw an abbreviation. - if aug_tok.parastart and context != "unknown": - context = "initial" - - # If we're at the beginning of a line, then we can't decide - # between 'internal' and 'initial'. - if aug_tok.linestart and context == "internal": - context = "unknown" - - # Find the case-normalized type of the token. If it's a - # sentence-final token, strip off the period. - typ = aug_tok.type_no_sentperiod - - # Update the orthographic context table. - flag = _ORTHO_MAP.get((context, aug_tok.first_case), 0) - if flag: - self._params.add_ortho_context(typ, flag) - - # Decide whether the next word is at a sentence boundary. - if aug_tok.sentbreak: - if not (aug_tok.is_number or aug_tok.is_initial): - context = "initial" - else: - context = "unknown" - elif aug_tok.ellipsis or aug_tok.abbr: - context = "unknown" - else: - context = "internal" - - # //////////////////////////////////////////////////////////// - # { Abbreviations - # //////////////////////////////////////////////////////////// - - def _reclassify_abbrev_types(self, types): - """ - (Re)classifies each given token if - - it is period-final and not a known abbreviation; or - - it is not period-final and is otherwise a known abbreviation - by checking whether its previous classification still holds according - to the heuristics of section 3. - Yields triples (abbr, score, is_add) where abbr is the type in question, - score is its log-likelihood with penalties applied, and is_add specifies - whether the present type is a candidate for inclusion or exclusion as an - abbreviation, such that: - - (is_add and score >= 0.3) suggests a new abbreviation; and - - (not is_add and score < 0.3) suggests excluding an abbreviation. 
- """ - # (While one could recalculate abbreviations from all .-final tokens at - # every iteration, in cases requiring efficiency, the number of tokens - # in the present training document will be much less.) - - for typ in types: - # Check some basic conditions, to rule out words that are - # clearly not abbrev_types. - if not _re_non_punct.search(typ) or typ == "##number##": - continue - - if typ.endswith("."): - if typ in self._params.abbrev_types: - continue - typ = typ[:-1] - is_add = True - else: - if typ not in self._params.abbrev_types: - continue - is_add = False - - # Count how many periods & nonperiods are in the - # candidate. - num_periods = typ.count(".") + 1 - num_nonperiods = len(typ) - num_periods + 1 - - # Let be the candidate without the period, and - # be the period. Find a log likelihood ratio that - # indicates whether occurs as a single unit (high - # value of log_likelihood), or as two independent units and - # (low value of log_likelihood). - count_with_period = self._type_fdist[typ + "."] - count_without_period = self._type_fdist[typ] - log_likelihood = self._dunning_log_likelihood( - count_with_period + count_without_period, - self._num_period_toks, - count_with_period, - self._type_fdist.N(), - ) - - # Apply three scaling factors to 'tweak' the basic log - # likelihood ratio: - # F_length: long word -> less likely to be an abbrev - # F_periods: more periods -> more likely to be an abbrev - # F_penalty: penalize occurrences w/o a period - f_length = math.exp(-num_nonperiods) - f_periods = num_periods - f_penalty = int(self.IGNORE_ABBREV_PENALTY) or math.pow( - num_nonperiods, -count_without_period - ) - score = log_likelihood * f_length * f_periods * f_penalty - - yield typ, score, is_add - - def find_abbrev_types(self): - """ - Recalculates abbreviations given type frequencies, despite no prior - determination of abbreviations. - This fails to include abbreviations otherwise found as "rare". - """ - self._params.clear_abbrevs() - tokens = (typ for typ in self._type_fdist if typ and typ.endswith(".")) - for abbr, score, _is_add in self._reclassify_abbrev_types(tokens): - if score >= self.ABBREV: - self._params.abbrev_types.add(abbr) - - # This function combines the work done by the original code's - # functions `count_orthography_context`, `get_orthography_count`, - # and `get_rare_abbreviations`. - def _is_rare_abbrev_type(self, cur_tok, next_tok): - """ - A word type is counted as a rare abbreviation if... - - it's not already marked as an abbreviation - - it occurs fewer than ABBREV_BACKOFF times - - either it is followed by a sentence-internal punctuation - mark, *or* it is followed by a lower-case word that - sometimes appears with upper case, but never occurs with - lower case at the beginning of sentences. - """ - if cur_tok.abbr or not cur_tok.sentbreak: - return False - - # Find the case-normalized type of the token. If it's - # a sentence-final token, strip off the period. - typ = cur_tok.type_no_sentperiod - - # Proceed only if the type hasn't been categorized as an - # abbreviation already, and is sufficiently rare... - count = self._type_fdist[typ] + self._type_fdist[typ[:-1]] - if typ in self._params.abbrev_types or count >= self.ABBREV_BACKOFF: - return False - - # Record this token as an abbreviation if the next - # token is a sentence-internal punctuation mark. - # [XX] :1 or check the whole thing?? - if next_tok.tok[:1] in self._lang_vars.internal_punctuation: - return True - - # Record this type as an abbreviation if the next - # token... 
(i) starts with a lower case letter, - # (ii) sometimes occurs with an uppercase letter, - # and (iii) never occus with an uppercase letter - # sentence-internally. - # [xx] should the check for (ii) be modified?? - if next_tok.first_lower: - typ2 = next_tok.type_no_sentperiod - typ2ortho_context = self._params.ortho_context[typ2] - if (typ2ortho_context & _ORTHO_BEG_UC) and not ( - typ2ortho_context & _ORTHO_MID_UC - ): - return True - - # //////////////////////////////////////////////////////////// - # { Log Likelihoods - # //////////////////////////////////////////////////////////// - - # helper for _reclassify_abbrev_types: - @staticmethod - def _dunning_log_likelihood(count_a, count_b, count_ab, N): - """ - A function that calculates the modified Dunning log-likelihood - ratio scores for abbreviation candidates. The details of how - this works is available in the paper. - """ - p1 = count_b / N - p2 = 0.99 - - null_hypo = count_ab * math.log(p1) + (count_a - count_ab) * math.log(1.0 - p1) - alt_hypo = count_ab * math.log(p2) + (count_a - count_ab) * math.log(1.0 - p2) - - likelihood = null_hypo - alt_hypo - - return -2.0 * likelihood - - @staticmethod - def _col_log_likelihood(count_a, count_b, count_ab, N): - """ - A function that will just compute log-likelihood estimate, in - the original paper it's described in algorithm 6 and 7. - - This *should* be the original Dunning log-likelihood values, - unlike the previous log_l function where it used modified - Dunning log-likelihood values - """ - p = count_b / N - p1 = count_ab / count_a - try: - p2 = (count_b - count_ab) / (N - count_a) - except ZeroDivisionError: - p2 = 1 - - try: - summand1 = count_ab * math.log(p) + (count_a - count_ab) * math.log(1.0 - p) - except ValueError: - summand1 = 0 - - try: - summand2 = (count_b - count_ab) * math.log(p) + ( - N - count_a - count_b + count_ab - ) * math.log(1.0 - p) - except ValueError: - summand2 = 0 - - if count_a == count_ab or p1 <= 0 or p1 >= 1: - summand3 = 0 - else: - summand3 = count_ab * math.log(p1) + (count_a - count_ab) * math.log( - 1.0 - p1 - ) - - if count_b == count_ab or p2 <= 0 or p2 >= 1: - summand4 = 0 - else: - summand4 = (count_b - count_ab) * math.log(p2) + ( - N - count_a - count_b + count_ab - ) * math.log(1.0 - p2) - - likelihood = summand1 + summand2 - summand3 - summand4 - - return -2.0 * likelihood - - # //////////////////////////////////////////////////////////// - # { Collocation Finder - # //////////////////////////////////////////////////////////// - - def _is_potential_collocation(self, aug_tok1, aug_tok2): - """ - Returns True if the pair of tokens may form a collocation given - log-likelihood statistics. - """ - return ( - ( - self.INCLUDE_ALL_COLLOCS - or (self.INCLUDE_ABBREV_COLLOCS and aug_tok1.abbr) - or (aug_tok1.sentbreak and (aug_tok1.is_number or aug_tok1.is_initial)) - ) - and aug_tok1.is_non_punct - and aug_tok2.is_non_punct - ) - - def _find_collocations(self): - """ - Generates likely collocations and their log-likelihood. 
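# A hedged worked example of the abbreviation score assembled in
# _reclassify_abbrev_types() above, using made-up counts for the candidate
# type "etc": the modified Dunning log-likelihood ratio is scaled by a length
# factor, a period-count factor, and a penalty for period-less occurrences.

import math

def dunning_log_likelihood(count_a, count_b, count_ab, N):
    p1 = count_b / N
    p2 = 0.99
    null_hypo = count_ab * math.log(p1) + (count_a - count_ab) * math.log(1.0 - p1)
    alt_hypo = count_ab * math.log(p2) + (count_a - count_ab) * math.log(1.0 - p2)
    return -2.0 * (null_hypo - alt_hypo)

typ = "etc"
count_with_period, count_without_period = 40, 2   # hypothetical corpus counts
num_period_toks, N = 500, 10_000                  # hypothetical corpus totals

log_likelihood = dunning_log_likelihood(
    count_with_period + count_without_period, num_period_toks, count_with_period, N
)
num_periods = typ.count(".") + 1                  # 1
num_nonperiods = len(typ) - num_periods + 1       # 3

f_length = math.exp(-num_nonperiods)              # longer word -> less likely abbrev
f_periods = num_periods                           # internal periods -> more likely
f_penalty = num_nonperiods ** -count_without_period   # period-less occurrences hurt

score = log_likelihood * f_length * f_periods * f_penalty
print(round(score, 2))   # roughly 1.22 with these counts, above the 0.3 ABBREV cut-off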
- """ - for types in self._collocation_fdist: - try: - typ1, typ2 = types - except TypeError: - # types may be None after calling freq_threshold() - continue - if typ2 in self._params.sent_starters: - continue - - col_count = self._collocation_fdist[types] - typ1_count = self._type_fdist[typ1] + self._type_fdist[typ1 + "."] - typ2_count = self._type_fdist[typ2] + self._type_fdist[typ2 + "."] - if ( - typ1_count > 1 - and typ2_count > 1 - and self.MIN_COLLOC_FREQ < col_count <= min(typ1_count, typ2_count) - ): - - log_likelihood = self._col_log_likelihood( - typ1_count, typ2_count, col_count, self._type_fdist.N() - ) - # Filter out the not-so-collocative - if log_likelihood >= self.COLLOCATION and ( - self._type_fdist.N() / typ1_count > typ2_count / col_count - ): - yield (typ1, typ2), log_likelihood - - # //////////////////////////////////////////////////////////// - # { Sentence-Starter Finder - # //////////////////////////////////////////////////////////// - - def _is_potential_sent_starter(self, cur_tok, prev_tok): - """ - Returns True given a token and the token that precedes it if it - seems clear that the token is beginning a sentence. - """ - # If a token (i) is preceded by a sentece break that is - # not a potential ordinal number or initial, and (ii) is - # alphabetic, then it is a a sentence-starter. - return ( - prev_tok.sentbreak - and not (prev_tok.is_number or prev_tok.is_initial) - and cur_tok.is_alpha - ) - - def _find_sent_starters(self): - """ - Uses collocation heuristics for each candidate token to - determine if it frequently starts sentences. - """ - for typ in self._sent_starter_fdist: - if not typ: - continue - - typ_at_break_count = self._sent_starter_fdist[typ] - typ_count = self._type_fdist[typ] + self._type_fdist[typ + "."] - if typ_count < typ_at_break_count: - # needed after freq_threshold - continue - - log_likelihood = self._col_log_likelihood( - self._sentbreak_count, - typ_count, - typ_at_break_count, - self._type_fdist.N(), - ) - - if ( - log_likelihood >= self.SENT_STARTER - and self._type_fdist.N() / self._sentbreak_count - > typ_count / typ_at_break_count - ): - yield typ, log_likelihood - - def _get_sentbreak_count(self, tokens): - """ - Returns the number of sentence breaks marked in a given set of - augmented tokens. - """ - return sum(1 for aug_tok in tokens if aug_tok.sentbreak) - - -###################################################################### -# { Punkt Sentence Tokenizer -###################################################################### - - -class PunktSentenceTokenizer(PunktBaseClass, TokenizerI): - """ - A sentence tokenizer which uses an unsupervised algorithm to build - a model for abbreviation words, collocations, and words that start - sentences; and then uses that model to find sentence boundaries. - This approach has been shown to work well for many European - languages. - """ - - def __init__( - self, train_text=None, verbose=False, lang_vars=None, token_cls=PunktToken - ): - """ - train_text can either be the sole training text for this sentence - boundary detector, or can be a PunktParameters object. - """ - PunktBaseClass.__init__(self, lang_vars=lang_vars, token_cls=token_cls) - - if train_text: - self._params = self.train(train_text, verbose) - - def train(self, train_text, verbose=False): - """ - Derives parameters from a given training text, or uses the parameters - given. Repeated calls to this method destroy previous parameters. For - incremental training, instantiate a separate PunktTrainer instance. 
- """ - if not isinstance(train_text, str): - return train_text - return PunktTrainer( - train_text, lang_vars=self._lang_vars, token_cls=self._Token - ).get_params() - - # //////////////////////////////////////////////////////////// - # { Tokenization - # //////////////////////////////////////////////////////////// - - def tokenize(self, text: str, realign_boundaries: bool = True) -> List[str]: - """ - Given a text, returns a list of the sentences in that text. - """ - return list(self.sentences_from_text(text, realign_boundaries)) - - def debug_decisions(self, text: str) -> Iterator[Dict[str, Any]]: - """ - Classifies candidate periods as sentence breaks, yielding a dict for - each that may be used to understand why the decision was made. - - See format_debug_decision() to help make this output readable. - """ - - for match, decision_text in self._match_potential_end_contexts(text): - tokens = self._tokenize_words(decision_text) - tokens = list(self._annotate_first_pass(tokens)) - while tokens and not tokens[0].tok.endswith(self._lang_vars.sent_end_chars): - tokens.pop(0) - yield { - "period_index": match.end() - 1, - "text": decision_text, - "type1": tokens[0].type, - "type2": tokens[1].type, - "type1_in_abbrs": bool(tokens[0].abbr), - "type1_is_initial": bool(tokens[0].is_initial), - "type2_is_sent_starter": tokens[1].type_no_sentperiod - in self._params.sent_starters, - "type2_ortho_heuristic": self._ortho_heuristic(tokens[1]), - "type2_ortho_contexts": set( - self._params._debug_ortho_context(tokens[1].type_no_sentperiod) - ), - "collocation": ( - tokens[0].type_no_sentperiod, - tokens[1].type_no_sentperiod, - ) - in self._params.collocations, - "reason": self._second_pass_annotation(tokens[0], tokens[1]) - or REASON_DEFAULT_DECISION, - "break_decision": tokens[0].sentbreak, - } - - def span_tokenize( - self, text: str, realign_boundaries: bool = True - ) -> Iterator[Tuple[int, int]]: - """ - Given a text, generates (start, end) spans of sentences - in the text. - """ - slices = self._slices_from_text(text) - if realign_boundaries: - slices = self._realign_boundaries(text, slices) - for sentence in slices: - yield (sentence.start, sentence.stop) - - def sentences_from_text( - self, text: str, realign_boundaries: bool = True - ) -> List[str]: - """ - Given a text, generates the sentences in that text by only - testing candidate sentence breaks. If realign_boundaries is - True, includes in the sentence closing punctuation that - follows the period. - """ - return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)] - - def _get_last_whitespace_index(self, text: str) -> int: - """ - Given a text, find the index of the *last* occurrence of *any* - whitespace character, i.e. " ", "\n", "\t", "\r", etc. - If none is found, return 0. - """ - for i in range(len(text) - 1, -1, -1): - if text[i] in string.whitespace: - return i - return 0 - - def _match_potential_end_contexts(self, text: str) -> Iterator[Tuple[Match, str]]: - """ - Given a text, find the matches of potential sentence breaks, - alongside the contexts surrounding these sentence breaks. - - Since the fix for the ReDOS discovered in issue #2866, we no longer match - the word before a potential end of sentence token. Instead, we use a separate - regex for this. As a consequence, `finditer`'s desire to find non-overlapping - matches no longer aids us in finding the single longest match. - Where previously, we could use:: - - >>> pst = PunktSentenceTokenizer() - >>> text = "Very bad acting!!! I promise." 
- >>> list(pst._lang_vars.period_context_re().finditer(text)) # doctest: +SKIP - [] - - Now we have to find the word before (i.e. 'acting') separately, and `finditer` - returns:: - - >>> pst = PunktSentenceTokenizer() - >>> text = "Very bad acting!!! I promise." - >>> list(pst._lang_vars.period_context_re().finditer(text)) # doctest: +NORMALIZE_WHITESPACE - [, - , - ] - - So, we need to find the word before the match from right to left, and then manually remove - the overlaps. That is what this method does:: - - >>> pst = PunktSentenceTokenizer() - >>> text = "Very bad acting!!! I promise." - >>> list(pst._match_potential_end_contexts(text)) - [(, 'acting!!! I')] - - :param text: String of one or more sentences - :type text: str - :return: Generator of match-context tuples. - :rtype: Iterator[Tuple[Match, str]] - """ - previous_slice = slice(0, 0) - previous_match = None - for match in self._lang_vars.period_context_re().finditer(text): - - # Get the slice of the previous word - before_text = text[previous_slice.stop : match.start()] - index_after_last_space = self._get_last_whitespace_index(before_text) - if index_after_last_space: - # + 1 to exclude the space itself - index_after_last_space += previous_slice.stop + 1 - else: - index_after_last_space = previous_slice.start - prev_word_slice = slice(index_after_last_space, match.start()) - - # If the previous slice does not overlap with this slice, then - # we can yield the previous match and slice. If there is an overlap, - # then we do not yield the previous match and slice. - if previous_match and previous_slice.stop <= prev_word_slice.start: - yield ( - previous_match, - text[previous_slice] - + previous_match.group() - + previous_match.group("after_tok"), - ) - previous_match = match - previous_slice = prev_word_slice - - # Yield the last match and context, if it exists - if previous_match: - yield ( - previous_match, - text[previous_slice] - + previous_match.group() - + previous_match.group("after_tok"), - ) - - def _slices_from_text(self, text: str) -> Iterator[slice]: - last_break = 0 - for match, context in self._match_potential_end_contexts(text): - if self.text_contains_sentbreak(context): - yield slice(last_break, match.end()) - if match.group("next_tok"): - # next sentence starts after whitespace - last_break = match.start("next_tok") - else: - # next sentence starts at following punctuation - last_break = match.end() - # The last sentence should not contain trailing whitespace. - yield slice(last_break, len(text.rstrip())) - - def _realign_boundaries( - self, text: str, slices: Iterator[slice] - ) -> Iterator[slice]: - """ - Attempts to realign punctuation that falls after the period but - should otherwise be included in the same sentence. - - For example: "(Sent1.) Sent2." will otherwise be split as:: - - ["(Sent1.", ") Sent1."]. - - This method will produce:: - - ["(Sent1.)", "Sent2."]. - """ - realign = 0 - for sentence1, sentence2 in _pair_iter(slices): - sentence1 = slice(sentence1.start + realign, sentence1.stop) - if not sentence2: - if text[sentence1]: - yield sentence1 - continue - - m = self._lang_vars.re_boundary_realignment.match(text[sentence2]) - if m: - yield slice(sentence1.start, sentence2.start + len(m.group(0).rstrip())) - realign = m.end() - else: - realign = 0 - if text[sentence1]: - yield sentence1 - - def text_contains_sentbreak(self, text: str) -> bool: - """ - Returns True if the given text includes a sentence break. 
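The effect of realign_boundaries is easiest to see side by side; a small sketch with an untrained tokenizer (the example text is illustrative):

from nltk.tokenize.punkt import PunktSentenceTokenizer

pst = PunktSentenceTokenizer()
text = "(How are you?) I am fine."

# With realignment, closing punctuation after the '?' is pulled back into the first sentence
print(pst.tokenize(text, realign_boundaries=True))
# Without it, the ')' is left at the start of the second sentence
print(pst.tokenize(text, realign_boundaries=False))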
- """ - found = False # used to ignore last token - for tok in self._annotate_tokens(self._tokenize_words(text)): - if found: - return True - if tok.sentbreak: - found = True - return False - - def sentences_from_text_legacy(self, text: str) -> Iterator[str]: - """ - Given a text, generates the sentences in that text. Annotates all - tokens, rather than just those with possible sentence breaks. Should - produce the same results as ``sentences_from_text``. - """ - tokens = self._annotate_tokens(self._tokenize_words(text)) - return self._build_sentence_list(text, tokens) - - def sentences_from_tokens( - self, tokens: Iterator[PunktToken] - ) -> Iterator[PunktToken]: - """ - Given a sequence of tokens, generates lists of tokens, each list - corresponding to a sentence. - """ - tokens = iter(self._annotate_tokens(self._Token(t) for t in tokens)) - sentence = [] - for aug_tok in tokens: - sentence.append(aug_tok.tok) - if aug_tok.sentbreak: - yield sentence - sentence = [] - if sentence: - yield sentence - - def _annotate_tokens(self, tokens: Iterator[PunktToken]) -> Iterator[PunktToken]: - """ - Given a set of tokens augmented with markers for line-start and - paragraph-start, returns an iterator through those tokens with full - annotation including predicted sentence breaks. - """ - # Make a preliminary pass through the document, marking likely - # sentence breaks, abbreviations, and ellipsis tokens. - tokens = self._annotate_first_pass(tokens) - - # Make a second pass through the document, using token context - # information to change our preliminary decisions about where - # sentence breaks, abbreviations, and ellipsis occurs. - tokens = self._annotate_second_pass(tokens) - - ## [XX] TESTING - # tokens = list(tokens) - # self.dump(tokens) - - return tokens - - def _build_sentence_list( - self, text: str, tokens: Iterator[PunktToken] - ) -> Iterator[str]: - """ - Given the original text and the list of augmented word tokens, - construct and return a tokenized list of sentence strings. - """ - # Most of the work here is making sure that we put the right - # pieces of whitespace back in all the right places. - - # Our position in the source text, used to keep track of which - # whitespace to add: - pos = 0 - - # A regular expression that finds pieces of whitespace: - white_space_regexp = re.compile(r"\s*") - - sentence = "" - for aug_tok in tokens: - tok = aug_tok.tok - - # Find the whitespace before this token, and update pos. - white_space = white_space_regexp.match(text, pos).group() - pos += len(white_space) - - # Some of the rules used by the punkt word tokenizer - # strip whitespace out of the text, resulting in tokens - # that contain whitespace in the source text. If our - # token doesn't match, see if adding whitespace helps. - # If so, then use the version with whitespace. - if text[pos : pos + len(tok)] != tok: - pat = r"\s*".join(re.escape(c) for c in tok) - m = re.compile(pat).match(text, pos) - if m: - tok = m.group() - - # Move our position pointer to the end of the token. - assert text[pos : pos + len(tok)] == tok - pos += len(tok) - - # Add this token. If it's not at the beginning of the - # sentence, then include any whitespace that separated it - # from the previous token. - if sentence: - sentence += white_space - sentence += tok - - # If we're at a sentence break, then start a new sentence. - if aug_tok.sentbreak: - yield sentence - sentence = "" - - # If the last sentence is empty, discard it. 
- if sentence: - yield sentence - - # [XX] TESTING - def dump(self, tokens: Iterator[PunktToken]) -> None: - print("writing to /tmp/punkt.new...") - with open("/tmp/punkt.new", "w") as outfile: - for aug_tok in tokens: - if aug_tok.parastart: - outfile.write("\n\n") - elif aug_tok.linestart: - outfile.write("\n") - else: - outfile.write(" ") - - outfile.write(str(aug_tok)) - - # //////////////////////////////////////////////////////////// - # { Customization Variables - # //////////////////////////////////////////////////////////// - - PUNCTUATION = tuple(";:,.!?") - - # //////////////////////////////////////////////////////////// - # { Annotation Procedures - # //////////////////////////////////////////////////////////// - - def _annotate_second_pass( - self, tokens: Iterator[PunktToken] - ) -> Iterator[PunktToken]: - """ - Performs a token-based classification (section 4) over the given - tokens, making use of the orthographic heuristic (4.1.1), collocation - heuristic (4.1.2) and frequent sentence starter heuristic (4.1.3). - """ - for token1, token2 in _pair_iter(tokens): - self._second_pass_annotation(token1, token2) - yield token1 - - def _second_pass_annotation( - self, aug_tok1: PunktToken, aug_tok2: Optional[PunktToken] - ) -> Optional[str]: - """ - Performs token-based classification over a pair of contiguous tokens - updating the first. - """ - # Is it the last token? We can't do anything then. - if not aug_tok2: - return - - if not aug_tok1.period_final: - # We only care about words ending in periods. - return - typ = aug_tok1.type_no_period - next_typ = aug_tok2.type_no_sentperiod - tok_is_initial = aug_tok1.is_initial - - # [4.1.2. Collocation Heuristic] If there's a - # collocation between the word before and after the - # period, then label tok as an abbreviation and NOT - # a sentence break. Note that collocations with - # frequent sentence starters as their second word are - # excluded in training. - if (typ, next_typ) in self._params.collocations: - aug_tok1.sentbreak = False - aug_tok1.abbr = True - return REASON_KNOWN_COLLOCATION - - # [4.2. Token-Based Reclassification of Abbreviations] If - # the token is an abbreviation or an ellipsis, then decide - # whether we should *also* classify it as a sentbreak. - if (aug_tok1.abbr or aug_tok1.ellipsis) and (not tok_is_initial): - # [4.1.1. Orthographic Heuristic] Check if there's - # orthogrpahic evidence about whether the next word - # starts a sentence or not. - is_sent_starter = self._ortho_heuristic(aug_tok2) - if is_sent_starter == True: - aug_tok1.sentbreak = True - return REASON_ABBR_WITH_ORTHOGRAPHIC_HEURISTIC - - # [4.1.3. Frequent Sentence Starter Heruistic] If the - # next word is capitalized, and is a member of the - # frequent-sentence-starters list, then label tok as a - # sentence break. - if aug_tok2.first_upper and next_typ in self._params.sent_starters: - aug_tok1.sentbreak = True - return REASON_ABBR_WITH_SENTENCE_STARTER - - # [4.3. Token-Based Detection of Initials and Ordinals] - # Check if any initials or ordinals tokens that are marked - # as sentbreaks should be reclassified as abbreviations. - if tok_is_initial or typ == "##number##": - - # [4.1.1. Orthographic Heuristic] Check if there's - # orthogrpahic evidence about whether the next word - # starts a sentence or not. 
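The second pass walks over (token, following token) pairs; a close standalone equivalent of the _pair_iter helper used above, for non-empty input, where the final token is paired with None:

from itertools import tee, zip_longest

def pair_iter(iterable):
    # (t1, t2), (t2, t3), ..., (tn, None)
    first, second = tee(iterable)
    next(second, None)
    return zip_longest(first, second)

print(list(pair_iter(["He", "left", "early", "."])))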
- is_sent_starter = self._ortho_heuristic(aug_tok2) - - if is_sent_starter == False: - aug_tok1.sentbreak = False - aug_tok1.abbr = True - if tok_is_initial: - return REASON_INITIAL_WITH_ORTHOGRAPHIC_HEURISTIC - return REASON_NUMBER_WITH_ORTHOGRAPHIC_HEURISTIC - - # Special heuristic for initials: if orthogrpahic - # heuristic is unknown, and next word is always - # capitalized, then mark as abbrev (eg: J. Bach). - if ( - is_sent_starter == "unknown" - and tok_is_initial - and aug_tok2.first_upper - and not (self._params.ortho_context[next_typ] & _ORTHO_LC) - ): - aug_tok1.sentbreak = False - aug_tok1.abbr = True - return REASON_INITIAL_WITH_SPECIAL_ORTHOGRAPHIC_HEURISTIC - - return - - def _ortho_heuristic(self, aug_tok: PunktToken) -> Union[bool, str]: - """ - Decide whether the given token is the first token in a sentence. - """ - # Sentences don't start with punctuation marks: - if aug_tok.tok in self.PUNCTUATION: - return False - - ortho_context = self._params.ortho_context[aug_tok.type_no_sentperiod] - - # If the word is capitalized, occurs at least once with a - # lower case first letter, and never occurs with an upper case - # first letter sentence-internally, then it's a sentence starter. - if ( - aug_tok.first_upper - and (ortho_context & _ORTHO_LC) - and not (ortho_context & _ORTHO_MID_UC) - ): - return True - - # If the word is lower case, and either (a) we've seen it used - # with upper case, or (b) we've never seen it used - # sentence-initially with lower case, then it's not a sentence - # starter. - if aug_tok.first_lower and ( - (ortho_context & _ORTHO_UC) or not (ortho_context & _ORTHO_BEG_LC) - ): - return False - - # Otherwise, we're not sure. - return "unknown" - - -DEBUG_DECISION_FMT = """Text: {text!r} (at offset {period_index}) -Sentence break? {break_decision} ({reason}) -Collocation? {collocation} -{type1!r}: - known abbreviation: {type1_in_abbrs} - is initial: {type1_is_initial} -{type2!r}: - known sentence starter: {type2_is_sent_starter} - orthographic heuristic suggests is a sentence starter? {type2_ortho_heuristic} - orthographic contexts in training: {type2_ortho_contexts} -""" - - -def format_debug_decision(d): - return DEBUG_DECISION_FMT.format(**d) - - -def demo(text, tok_cls=PunktSentenceTokenizer, train_cls=PunktTrainer): - """Builds a punkt model and applies it to the same text""" - cleanup = ( - lambda s: re.compile(r"(?:\r|^\s+)", re.MULTILINE).sub("", s).replace("\n", " ") - ) - trainer = train_cls() - trainer.INCLUDE_ALL_COLLOCS = True - trainer.train(text) - sbd = tok_cls(trainer.get_params()) - for sentence in sbd.sentences_from_text(text): - print(cleanup(sentence)) diff --git a/pipeline/nltk/tokenize/regexp.py b/pipeline/nltk/tokenize/regexp.py deleted file mode 100644 index e3875b1447ba2843b7e6f186de24b4e67baf8844..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tokenize/regexp.py +++ /dev/null @@ -1,220 +0,0 @@ -# Natural Language Toolkit: Tokenizers -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird -# Trevor Cohn -# URL: -# For license information, see LICENSE.TXT - -r""" -Regular-Expression Tokenizers - -A ``RegexpTokenizer`` splits a string into substrings using a regular expression. -For example, the following tokenizer forms tokens out of alphabetic sequences, -money expressions, and any other non-whitespace sequences: - - >>> from nltk.tokenize import RegexpTokenizer - >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks." 
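The orthographic heuristic above reads bit flags out of ortho_context; a small sketch of inspecting those flags after training, assuming the module-level _ORTHO_* constants referenced in the code (the training string is illustrative):

from nltk.tokenize.punkt import PunktTrainer, _ORTHO_LC, _ORTHO_MID_UC, _ORTHO_UC

trainer = PunktTrainer("The cat sat. The dog ran. Later the end came quickly.")
ctx = trainer.get_params().ortho_context["the"]

print("seen with a lowercase first letter:", bool(ctx & _ORTHO_LC))
print("seen capitalized mid-sentence:", bool(ctx & _ORTHO_MID_UC))
print("seen capitalized anywhere:", bool(ctx & _ORTHO_UC))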
- >>> tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+') - >>> tokenizer.tokenize(s) # doctest: +NORMALIZE_WHITESPACE - ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.', - 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] - -A ``RegexpTokenizer`` can use its regexp to match delimiters instead: - - >>> tokenizer = RegexpTokenizer(r'\s+', gaps=True) - >>> tokenizer.tokenize(s) # doctest: +NORMALIZE_WHITESPACE - ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', - 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.'] - -Note that empty tokens are not returned when the delimiter appears at -the start or end of the string. - -The material between the tokens is discarded. For example, -the following tokenizer selects just the capitalized words: - - >>> capword_tokenizer = RegexpTokenizer(r'[A-Z]\w+') - >>> capword_tokenizer.tokenize(s) - ['Good', 'New', 'York', 'Please', 'Thanks'] - -This module contains several subclasses of ``RegexpTokenizer`` -that use pre-defined regular expressions. - - >>> from nltk.tokenize import BlanklineTokenizer - >>> # Uses '\s*\n\s*\n\s*': - >>> BlanklineTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE - ['Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.', - 'Thanks.'] - -All of the regular expression tokenizers are also available as functions: - - >>> from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize - >>> regexp_tokenize(s, pattern=r'\w+|\$[\d\.]+|\S+') # doctest: +NORMALIZE_WHITESPACE - ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.', - 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] - >>> wordpunct_tokenize(s) # doctest: +NORMALIZE_WHITESPACE - ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', - '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] - >>> blankline_tokenize(s) - ['Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.', 'Thanks.'] - -Caution: The function ``regexp_tokenize()`` takes the text as its -first argument, and the regular expression pattern as its second -argument. This differs from the conventions used by Python's -``re`` functions, where the pattern is always the first argument. -(This is for consistency with the other NLTK tokenizers.) -""" - -import re - -from nltk.tokenize.api import TokenizerI -from nltk.tokenize.util import regexp_span_tokenize - - -class RegexpTokenizer(TokenizerI): - r""" - A tokenizer that splits a string using a regular expression, which - matches either the tokens or the separators between tokens. - - >>> tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+') - - :type pattern: str - :param pattern: The pattern used to build this tokenizer. - (This pattern must not contain capturing parentheses; - Use non-capturing parentheses, e.g. (?:...), instead) - :type gaps: bool - :param gaps: True if this tokenizer's pattern should be used - to find separators between tokens; False if this - tokenizer's pattern should be used to find the tokens - themselves. - :type discard_empty: bool - :param discard_empty: True if any empty tokens `''` - generated by the tokenizer should be discarded. Empty - tokens can only be generated if `_gaps == True`. - :type flags: int - :param flags: The regexp flags used to compile this - tokenizer's pattern. By default, the following flags are - used: `re.UNICODE | re.MULTILINE | re.DOTALL`. 
- - """ - - def __init__( - self, - pattern, - gaps=False, - discard_empty=True, - flags=re.UNICODE | re.MULTILINE | re.DOTALL, - ): - # If they gave us a regexp object, extract the pattern. - pattern = getattr(pattern, "pattern", pattern) - - self._pattern = pattern - self._gaps = gaps - self._discard_empty = discard_empty - self._flags = flags - self._regexp = None - - def _check_regexp(self): - if self._regexp is None: - self._regexp = re.compile(self._pattern, self._flags) - - def tokenize(self, text): - self._check_regexp() - # If our regexp matches gaps, use re.split: - if self._gaps: - if self._discard_empty: - return [tok for tok in self._regexp.split(text) if tok] - else: - return self._regexp.split(text) - - # If our regexp matches tokens, use re.findall: - else: - return self._regexp.findall(text) - - def span_tokenize(self, text): - self._check_regexp() - - if self._gaps: - for left, right in regexp_span_tokenize(text, self._regexp): - if not (self._discard_empty and left == right): - yield left, right - else: - for m in re.finditer(self._regexp, text): - yield m.span() - - def __repr__(self): - return "{}(pattern={!r}, gaps={!r}, discard_empty={!r}, flags={!r})".format( - self.__class__.__name__, - self._pattern, - self._gaps, - self._discard_empty, - self._flags, - ) - - -class WhitespaceTokenizer(RegexpTokenizer): - r""" - Tokenize a string on whitespace (space, tab, newline). - In general, users should use the string ``split()`` method instead. - - >>> from nltk.tokenize import WhitespaceTokenizer - >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks." - >>> WhitespaceTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE - ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', - 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.'] - """ - - def __init__(self): - RegexpTokenizer.__init__(self, r"\s+", gaps=True) - - -class BlanklineTokenizer(RegexpTokenizer): - """ - Tokenize a string, treating any sequence of blank lines as a delimiter. - Blank lines are defined as lines containing no characters, except for - space or tab characters. - """ - - def __init__(self): - RegexpTokenizer.__init__(self, r"\s*\n\s*\n\s*", gaps=True) - - -class WordPunctTokenizer(RegexpTokenizer): - r""" - Tokenize a text into a sequence of alphabetic and - non-alphabetic characters, using the regexp ``\w+|[^\w\s]+``. - - >>> from nltk.tokenize import WordPunctTokenizer - >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks." - >>> WordPunctTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE - ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', - '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] - """ - - def __init__(self): - RegexpTokenizer.__init__(self, r"\w+|[^\w\s]+") - - -###################################################################### -# { Tokenization Functions -###################################################################### - - -def regexp_tokenize( - text, - pattern, - gaps=False, - discard_empty=True, - flags=re.UNICODE | re.MULTILINE | re.DOTALL, -): - """ - Return a tokenized copy of *text*. See :class:`.RegexpTokenizer` - for descriptions of the arguments. 
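Beyond tokenize, the class also exposes span_tokenize; a quick sketch with a gap-matching pattern, where each span indexes back into the original string:

from nltk.tokenize import RegexpTokenizer

s = "Good muffins cost $3.88 in New York."
tokenizer = RegexpTokenizer(r"\s+", gaps=True)

for start, end in tokenizer.span_tokenize(s):
    print((start, end), repr(s[start:end]))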
- """ - tokenizer = RegexpTokenizer(pattern, gaps, discard_empty, flags) - return tokenizer.tokenize(text) - - -blankline_tokenize = BlanklineTokenizer().tokenize -wordpunct_tokenize = WordPunctTokenizer().tokenize diff --git a/pipeline/nltk/tokenize/repp.py b/pipeline/nltk/tokenize/repp.py deleted file mode 100644 index 6e0740a94645f14ec6162814cdb3c92167f503bb..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tokenize/repp.py +++ /dev/null @@ -1,149 +0,0 @@ -# Natural Language Toolkit: Interface to the Repp Tokenizer -# -# Copyright (C) 2001-2015 NLTK Project -# Authors: Rebecca Dridan and Stephan Oepen -# Contributors: Liling Tan -# -# URL: -# For license information, see LICENSE.TXT - -import os -import re -import subprocess -import sys -import tempfile - -from nltk.data import ZipFilePathPointer -from nltk.internals import find_dir -from nltk.tokenize.api import TokenizerI - - -class ReppTokenizer(TokenizerI): - """ - A class for word tokenization using the REPP parser described in - Rebecca Dridan and Stephan Oepen (2012) Tokenization: Returning to a - Long Solved Problem - A Survey, Contrastive Experiment, Recommendations, - and Toolkit. In ACL. http://anthology.aclweb.org/P/P12/P12-2.pdf#page=406 - - >>> sents = ['Tokenization is widely regarded as a solved problem due to the high accuracy that rulebased tokenizers achieve.' , - ... 'But rule-based tokenizers are hard to maintain and their rules language specific.' , - ... 'We evaluated our method on three languages and obtained error rates of 0.27% (English), 0.35% (Dutch) and 0.76% (Italian) for our best models.' - ... ] - >>> tokenizer = ReppTokenizer('/home/alvas/repp/') # doctest: +SKIP - >>> for sent in sents: # doctest: +SKIP - ... tokenizer.tokenize(sent) # doctest: +SKIP - ... - (u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.') - (u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.') - (u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.') - - >>> for sent in tokenizer.tokenize_sents(sents): # doctest: +SKIP - ... print(sent) # doctest: +SKIP - ... - (u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.') - (u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.') - (u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.') - >>> for sent in tokenizer.tokenize_sents(sents, keep_token_positions=True): # doctest: +SKIP - ... print(sent) # doctest: +SKIP - ... 
- [(u'Tokenization', 0, 12), (u'is', 13, 15), (u'widely', 16, 22), (u'regarded', 23, 31), (u'as', 32, 34), (u'a', 35, 36), (u'solved', 37, 43), (u'problem', 44, 51), (u'due', 52, 55), (u'to', 56, 58), (u'the', 59, 62), (u'high', 63, 67), (u'accuracy', 68, 76), (u'that', 77, 81), (u'rulebased', 82, 91), (u'tokenizers', 92, 102), (u'achieve', 103, 110), (u'.', 110, 111)] - [(u'But', 0, 3), (u'rule-based', 4, 14), (u'tokenizers', 15, 25), (u'are', 26, 29), (u'hard', 30, 34), (u'to', 35, 37), (u'maintain', 38, 46), (u'and', 47, 50), (u'their', 51, 56), (u'rules', 57, 62), (u'language', 63, 71), (u'specific', 72, 80), (u'.', 80, 81)] - [(u'We', 0, 2), (u'evaluated', 3, 12), (u'our', 13, 16), (u'method', 17, 23), (u'on', 24, 26), (u'three', 27, 32), (u'languages', 33, 42), (u'and', 43, 46), (u'obtained', 47, 55), (u'error', 56, 61), (u'rates', 62, 67), (u'of', 68, 70), (u'0.27', 71, 75), (u'%', 75, 76), (u'(', 77, 78), (u'English', 78, 85), (u')', 85, 86), (u',', 86, 87), (u'0.35', 88, 92), (u'%', 92, 93), (u'(', 94, 95), (u'Dutch', 95, 100), (u')', 100, 101), (u'and', 102, 105), (u'0.76', 106, 110), (u'%', 110, 111), (u'(', 112, 113), (u'Italian', 113, 120), (u')', 120, 121), (u'for', 122, 125), (u'our', 126, 129), (u'best', 130, 134), (u'models', 135, 141), (u'.', 141, 142)] - """ - - def __init__(self, repp_dir, encoding="utf8"): - self.repp_dir = self.find_repptokenizer(repp_dir) - # Set a directory to store the temporary files. - self.working_dir = tempfile.gettempdir() - # Set an encoding for the input strings. - self.encoding = encoding - - def tokenize(self, sentence): - """ - Use Repp to tokenize a single sentence. - - :param sentence: A single sentence string. - :type sentence: str - :return: A tuple of tokens. - :rtype: tuple(str) - """ - return next(self.tokenize_sents([sentence])) - - def tokenize_sents(self, sentences, keep_token_positions=False): - """ - Tokenize multiple sentences using Repp. - - :param sentences: A list of sentence strings. - :type sentences: list(str) - :return: A list of tuples of tokens - :rtype: iter(tuple(str)) - """ - with tempfile.NamedTemporaryFile( - prefix="repp_input.", dir=self.working_dir, mode="w", delete=False - ) as input_file: - # Write sentences to temporary input file. - for sent in sentences: - input_file.write(str(sent) + "\n") - input_file.close() - # Generate command to run REPP. - cmd = self.generate_repp_command(input_file.name) - # Decode the stdout and strips the ending newline. - repp_output = self._execute(cmd).decode(self.encoding).strip() - for tokenized_sent in self.parse_repp_outputs(repp_output): - if not keep_token_positions: - # Removes token position information. - tokenized_sent, starts, ends = zip(*tokenized_sent) - yield tokenized_sent - - def generate_repp_command(self, inputfilename): - """ - This module generates the REPP command to be used at the terminal. - - :param inputfilename: path to the input file - :type inputfilename: str - """ - cmd = [self.repp_dir + "/src/repp"] - cmd += ["-c", self.repp_dir + "/erg/repp.set"] - cmd += ["--format", "triple"] - cmd += [inputfilename] - return cmd - - @staticmethod - def _execute(cmd): - p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout, stderr = p.communicate() - return stdout - - @staticmethod - def parse_repp_outputs(repp_output): - """ - This module parses the tri-tuple format that REPP outputs using the - "--format triple" option and returns an generator with tuple of string - tokens. 
- - :param repp_output: - :type repp_output: type - :return: an iterable of the tokenized sentences as tuples of strings - :rtype: iter(tuple) - """ - line_regex = re.compile(r"^\((\d+), (\d+), (.+)\)$", re.MULTILINE) - for section in repp_output.split("\n\n"): - words_with_positions = [ - (token, int(start), int(end)) - for start, end, token in line_regex.findall(section) - ] - words = tuple(t[2] for t in words_with_positions) - yield words_with_positions - - def find_repptokenizer(self, repp_dirname): - """ - A module to find REPP tokenizer binary and its *repp.set* config file. - """ - if os.path.exists(repp_dirname): # If a full path is given. - _repp_dir = repp_dirname - else: # Try to find path to REPP directory in environment variables. - _repp_dir = find_dir(repp_dirname, env_vars=("REPP_TOKENIZER",)) - # Checks for the REPP binary and erg/repp.set config file. - assert os.path.exists(_repp_dir + "/src/repp") - assert os.path.exists(_repp_dir + "/erg/repp.set") - return _repp_dir diff --git a/pipeline/nltk/tokenize/sexpr.py b/pipeline/nltk/tokenize/sexpr.py deleted file mode 100644 index 0776642fbd2759c3f37352a97b18d915198cc20c..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tokenize/sexpr.py +++ /dev/null @@ -1,140 +0,0 @@ -# Natural Language Toolkit: Tokenizers -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Yoav Goldberg -# Steven Bird (minor edits) -# URL: -# For license information, see LICENSE.TXT - -""" -S-Expression Tokenizer - -``SExprTokenizer`` is used to find parenthesized expressions in a -string. In particular, it divides a string into a sequence of -substrings that are either parenthesized expressions (including any -nested parenthesized expressions), or other whitespace-separated -tokens. - - >>> from nltk.tokenize import SExprTokenizer - >>> SExprTokenizer().tokenize('(a b (c d)) e f (g)') - ['(a b (c d))', 'e', 'f', '(g)'] - -By default, `SExprTokenizer` will raise a ``ValueError`` exception if -used to tokenize an expression with non-matching parentheses: - - >>> SExprTokenizer().tokenize('c) d) e (f (g') - Traceback (most recent call last): - ... - ValueError: Un-matched close paren at char 1 - -The ``strict`` argument can be set to False to allow for -non-matching parentheses. Any unmatched close parentheses will be -listed as their own s-expression; and the last partial sexpr with -unmatched open parentheses will be listed as its own sexpr: - - >>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g') - ['c', ')', 'd', ')', 'e', '(f (g'] - -The characters used for open and close parentheses may be customized -using the ``parens`` argument to the `SExprTokenizer` constructor: - - >>> SExprTokenizer(parens='{}').tokenize('{a b {c d}} e f {g}') - ['{a b {c d}}', 'e', 'f', '{g}'] - -The s-expression tokenizer is also available as a function: - - >>> from nltk.tokenize import sexpr_tokenize - >>> sexpr_tokenize('(a b (c d)) e f (g)') - ['(a b (c d))', 'e', 'f', '(g)'] - -""" - -import re - -from nltk.tokenize.api import TokenizerI - - -class SExprTokenizer(TokenizerI): - """ - A tokenizer that divides strings into s-expressions. - An s-expresion can be either: - - - a parenthesized expression, including any nested parenthesized - expressions, or - - a sequence of non-whitespace non-parenthesis characters. - - For example, the string ``(a (b c)) d e (f)`` consists of four - s-expressions: ``(a (b c))``, ``d``, ``e``, and ``(f)``. 
- - By default, the characters ``(`` and ``)`` are treated as open and - close parentheses, but alternative strings may be specified. - - :param parens: A two-element sequence specifying the open and close parentheses - that should be used to find sexprs. This will typically be either a - two-character string, or a list of two strings. - :type parens: str or list - :param strict: If true, then raise an exception when tokenizing an ill-formed sexpr. - """ - - def __init__(self, parens="()", strict=True): - if len(parens) != 2: - raise ValueError("parens must contain exactly two strings") - self._strict = strict - self._open_paren = parens[0] - self._close_paren = parens[1] - self._paren_regexp = re.compile( - f"{re.escape(parens[0])}|{re.escape(parens[1])}" - ) - - def tokenize(self, text): - """ - Return a list of s-expressions extracted from *text*. - For example: - - >>> SExprTokenizer().tokenize('(a b (c d)) e f (g)') - ['(a b (c d))', 'e', 'f', '(g)'] - - All parentheses are assumed to mark s-expressions. - (No special processing is done to exclude parentheses that occur - inside strings, or following backslash characters.) - - If the given expression contains non-matching parentheses, - then the behavior of the tokenizer depends on the ``strict`` - parameter to the constructor. If ``strict`` is ``True``, then - raise a ``ValueError``. If ``strict`` is ``False``, then any - unmatched close parentheses will be listed as their own - s-expression; and the last partial s-expression with unmatched open - parentheses will be listed as its own s-expression: - - >>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g') - ['c', ')', 'd', ')', 'e', '(f (g'] - - :param text: the string to be tokenized - :type text: str or iter(str) - :rtype: iter(str) - """ - result = [] - pos = 0 - depth = 0 - for m in self._paren_regexp.finditer(text): - paren = m.group() - if depth == 0: - result += text[pos : m.start()].split() - pos = m.start() - if paren == self._open_paren: - depth += 1 - if paren == self._close_paren: - if self._strict and depth == 0: - raise ValueError("Un-matched close paren at char %d" % m.start()) - depth = max(0, depth - 1) - if depth == 0: - result.append(text[pos : m.end()]) - pos = m.end() - if self._strict and depth > 0: - raise ValueError("Un-matched open paren at char %d" % pos) - if pos < len(text): - result.append(text[pos:]) - return result - - -sexpr_tokenize = SExprTokenizer().tokenize diff --git a/pipeline/nltk/tokenize/simple.py b/pipeline/nltk/tokenize/simple.py deleted file mode 100644 index f87b60a274c8121303ff60f203e1f3b991da1547..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tokenize/simple.py +++ /dev/null @@ -1,137 +0,0 @@ -# Natural Language Toolkit: Simple Tokenizers -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird -# URL: -# For license information, see LICENSE.TXT - -r""" -Simple Tokenizers - -These tokenizers divide strings into substrings using the string -``split()`` method. -When tokenizing using a particular delimiter string, use -the string ``split()`` method directly, as this is more efficient. - -The simple tokenizers are *not* available as separate functions; -instead, you should just use the string ``split()`` method directly: - - >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks." 
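Since tokenize only splits the top level, nested structure can be recovered by re-applying it to a parenthesized result with its outer parentheses stripped; a short sketch (the expression is illustrative):

from nltk.tokenize import sexpr_tokenize

expr = "(define (square x) (* x x)) 42 (+ 1 2)"
top_level = sexpr_tokenize(expr)
print(top_level)

# Recurse into the first s-expression by stripping its outer parentheses
print(sexpr_tokenize(top_level[0][1:-1]))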
- >>> s.split() # doctest: +NORMALIZE_WHITESPACE - ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', - 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.'] - >>> s.split(' ') # doctest: +NORMALIZE_WHITESPACE - ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '', - 'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.'] - >>> s.split('\n') # doctest: +NORMALIZE_WHITESPACE - ['Good muffins cost $3.88', 'in New York. Please buy me', - 'two of them.', '', 'Thanks.'] - -The simple tokenizers are mainly useful because they follow the -standard ``TokenizerI`` interface, and so can be used with any code -that expects a tokenizer. For example, these tokenizers can be used -to specify the tokenization conventions when building a `CorpusReader`. - -""" - -from nltk.tokenize.api import StringTokenizer, TokenizerI -from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize - - -class SpaceTokenizer(StringTokenizer): - r"""Tokenize a string using the space character as a delimiter, - which is the same as ``s.split(' ')``. - - >>> from nltk.tokenize import SpaceTokenizer - >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks." - >>> SpaceTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE - ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '', - 'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.'] - """ - - _string = " " - - -class TabTokenizer(StringTokenizer): - r"""Tokenize a string use the tab character as a delimiter, - the same as ``s.split('\t')``. - - >>> from nltk.tokenize import TabTokenizer - >>> TabTokenizer().tokenize('a\tb c\n\t d') - ['a', 'b c\n', ' d'] - """ - - _string = "\t" - - -class CharTokenizer(StringTokenizer): - """Tokenize a string into individual characters. If this functionality - is ever required directly, use ``for char in string``. - """ - - def tokenize(self, s): - return list(s) - - def span_tokenize(self, s): - yield from enumerate(range(1, len(s) + 1)) - - -class LineTokenizer(TokenizerI): - r"""Tokenize a string into its lines, optionally discarding blank lines. - This is similar to ``s.split('\n')``. - - >>> from nltk.tokenize import LineTokenizer - >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks." - >>> LineTokenizer(blanklines='keep').tokenize(s) # doctest: +NORMALIZE_WHITESPACE - ['Good muffins cost $3.88', 'in New York. Please buy me', - 'two of them.', '', 'Thanks.'] - >>> # same as [l for l in s.split('\n') if l.strip()]: - >>> LineTokenizer(blanklines='discard').tokenize(s) # doctest: +NORMALIZE_WHITESPACE - ['Good muffins cost $3.88', 'in New York. Please buy me', - 'two of them.', 'Thanks.'] - - :param blanklines: Indicates how blank lines should be handled. Valid values are: - - - ``discard``: strip blank lines out of the token list before returning it. - A line is considered blank if it contains only whitespace characters. - - ``keep``: leave all blank lines in the token list. - - ``discard-eof``: if the string ends with a newline, then do not generate - a corresponding token ``''`` after that newline. - """ - - def __init__(self, blanklines="discard"): - valid_blanklines = ("discard", "keep", "discard-eof") - if blanklines not in valid_blanklines: - raise ValueError( - "Blank lines must be one of: %s" % " ".join(valid_blanklines) - ) - - self._blanklines = blanklines - - def tokenize(self, s): - lines = s.splitlines() - # If requested, strip off blank lines. 
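A brief sketch of the line- and character-level tokenizers described above, covering the blanklines options and the span interface:

from nltk.tokenize import LineTokenizer
from nltk.tokenize.simple import CharTokenizer

s = "Good muffins cost $3.88\nin New York.\n\nThanks."

print(LineTokenizer(blanklines="keep").tokenize(s))      # blank line kept as ''
print(LineTokenizer(blanklines="discard").tokenize(s))   # blank line dropped
print(list(CharTokenizer().span_tokenize("abc")))        # [(0, 1), (1, 2), (2, 3)]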
- if self._blanklines == "discard": - lines = [l for l in lines if l.rstrip()] - elif self._blanklines == "discard-eof": - if lines and not lines[-1].strip(): - lines.pop() - return lines - - # discard-eof not implemented - def span_tokenize(self, s): - if self._blanklines == "keep": - yield from string_span_tokenize(s, r"\n") - else: - yield from regexp_span_tokenize(s, r"\n(\s+\n)*") - - -###################################################################### -# { Tokenization Functions -###################################################################### -# XXX: it is stated in module docs that there is no function versions - - -def line_tokenize(text, blanklines="discard"): - return LineTokenizer(blanklines).tokenize(text) diff --git a/pipeline/nltk/tokenize/sonority_sequencing.py b/pipeline/nltk/tokenize/sonority_sequencing.py deleted file mode 100644 index 24e43caae2dae6e3c76e66704fa9b856a6dc348c..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tokenize/sonority_sequencing.py +++ /dev/null @@ -1,194 +0,0 @@ -# Natural Language Toolkit: Tokenizers -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Christopher Hench -# Alex Estes -# URL: -# For license information, see LICENSE.TXT - -""" -The Sonority Sequencing Principle (SSP) is a language agnostic algorithm proposed -by Otto Jesperson in 1904. The sonorous quality of a phoneme is judged by the -openness of the lips. Syllable breaks occur before troughs in sonority. For more -on the SSP see Selkirk (1984). - -The default implementation uses the English alphabet, but the `sonority_hiearchy` -can be modified to IPA or any other alphabet for the use-case. The SSP is a -universal syllabification algorithm, but that does not mean it performs equally -across languages. Bartlett et al. (2009) is a good benchmark for English accuracy -if utilizing IPA (pg. 311). - -Importantly, if a custom hierarchy is supplied and vowels span across more than -one level, they should be given separately to the `vowels` class attribute. - -References: - -- Otto Jespersen. 1904. Lehrbuch der Phonetik. - Leipzig, Teubner. Chapter 13, Silbe, pp. 185-203. -- Elisabeth Selkirk. 1984. On the major class features and syllable theory. - In Aronoff & Oehrle (eds.) Language Sound Structure: Studies in Phonology. - Cambridge, MIT Press. pp. 107-136. -- Susan Bartlett, et al. 2009. On the Syllabification of Phonemes. - In HLT-NAACL. pp. 308-316. -""" - -import re -import warnings -from string import punctuation - -from nltk.tokenize.api import TokenizerI -from nltk.util import ngrams - - -class SyllableTokenizer(TokenizerI): - """ - Syllabifies words based on the Sonority Sequencing Principle (SSP). - - >>> from nltk.tokenize import SyllableTokenizer - >>> from nltk import word_tokenize - >>> SSP = SyllableTokenizer() - >>> SSP.tokenize('justification') - ['jus', 'ti', 'fi', 'ca', 'tion'] - >>> text = "This is a foobar-like sentence." - >>> [SSP.tokenize(token) for token in word_tokenize(text)] - [['This'], ['is'], ['a'], ['foo', 'bar', '-', 'li', 'ke'], ['sen', 'ten', 'ce'], ['.']] - """ - - def __init__(self, lang="en", sonority_hierarchy=False): - """ - :param lang: Language parameter, default is English, 'en' - :type lang: str - :param sonority_hierarchy: Sonority hierarchy according to the - Sonority Sequencing Principle. - :type sonority_hierarchy: list(str) - """ - # Sonority hierarchy should be provided in descending order. 
- # If vowels are spread across multiple levels, they should be - # passed assigned self.vowels var together, otherwise should be - # placed in first index of hierarchy. - if not sonority_hierarchy and lang == "en": - sonority_hierarchy = [ - "aeiouy", # vowels. - "lmnrw", # nasals. - "zvsf", # fricatives. - "bcdgtkpqxhj", # stops. - ] - - self.vowels = sonority_hierarchy[0] - self.phoneme_map = {} - for i, level in enumerate(sonority_hierarchy): - for c in level: - sonority_level = len(sonority_hierarchy) - i - self.phoneme_map[c] = sonority_level - self.phoneme_map[c.upper()] = sonority_level - - def assign_values(self, token): - """ - Assigns each phoneme its value from the sonority hierarchy. - Note: Sentence/text has to be tokenized first. - - :param token: Single word or token - :type token: str - :return: List of tuples, first element is character/phoneme and - second is the soronity value. - :rtype: list(tuple(str, int)) - """ - syllables_values = [] - for c in token: - try: - syllables_values.append((c, self.phoneme_map[c])) - except KeyError: - if c not in "0123456789" and c not in punctuation: - warnings.warn( - "Character not defined in sonority_hierarchy," - " assigning as vowel: '{}'".format(c) - ) - syllables_values.append((c, max(self.phoneme_map.values()))) - if c not in self.vowels: - self.vowels += c - else: # If it's a punctuation or numbers, assign -1. - syllables_values.append((c, -1)) - return syllables_values - - def validate_syllables(self, syllable_list): - """ - Ensures each syllable has at least one vowel. - If the following syllable doesn't have vowel, add it to the current one. - - :param syllable_list: Single word or token broken up into syllables. - :type syllable_list: list(str) - :return: Single word or token broken up into syllables - (with added syllables if necessary) - :rtype: list(str) - """ - valid_syllables = [] - front = "" - vowel_pattern = re.compile("|".join(self.vowels)) - for i, syllable in enumerate(syllable_list): - if syllable in punctuation: - valid_syllables.append(syllable) - continue - if not vowel_pattern.search(syllable): - if len(valid_syllables) == 0: - front += syllable - else: - valid_syllables = valid_syllables[:-1] + [ - valid_syllables[-1] + syllable - ] - else: - if len(valid_syllables) == 0: - valid_syllables.append(front + syllable) - else: - valid_syllables.append(syllable) - - return valid_syllables - - def tokenize(self, token): - """ - Apply the SSP to return a list of syllables. - Note: Sentence/text has to be tokenized first. - - :param token: Single word or token - :type token: str - :return syllable_list: Single word or token broken up into syllables. - :rtype: list(str) - """ - # assign values from hierarchy - syllables_values = self.assign_values(token) - - # if only one vowel return word - if sum(token.count(x) for x in self.vowels) <= 1: - return [token] - - syllable_list = [] - syllable = syllables_values[0][0] # start syllable with first phoneme - for trigram in ngrams(syllables_values, n=3): - phonemes, values = zip(*trigram) - # Sonority of previous, focal and following phoneme - prev_value, focal_value, next_value = values - # Focal phoneme. - focal_phoneme = phonemes[1] - - # These cases trigger syllable break. - if focal_value == -1: # If it's a punctuation, just break. 
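The sonority values that drive the trough detection can be inspected directly; a small sketch using the default English hierarchy (the word is illustrative):

from nltk.tokenize import SyllableTokenizer

ssp = SyllableTokenizer()

# Per-character sonority levels (vowels highest), then the resulting syllables
print(ssp.assign_values("window"))
print(ssp.tokenize("window"))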
- syllable_list.append(syllable) - syllable_list.append(focal_phoneme) - syllable = "" - elif prev_value >= focal_value == next_value: - syllable += focal_phoneme - syllable_list.append(syllable) - syllable = "" - - elif prev_value > focal_value < next_value: - syllable_list.append(syllable) - syllable = "" - syllable += focal_phoneme - - # no syllable break - else: - syllable += focal_phoneme - - syllable += syllables_values[-1][0] # append last phoneme - syllable_list.append(syllable) - - return self.validate_syllables(syllable_list) diff --git a/pipeline/nltk/tokenize/stanford.py b/pipeline/nltk/tokenize/stanford.py deleted file mode 100644 index 81a2d8584aee1d4c39042af6a150bd41c838ee14..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tokenize/stanford.py +++ /dev/null @@ -1,115 +0,0 @@ -# Natural Language Toolkit: Interface to the Stanford Tokenizer -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Xu -# -# URL: -# For license information, see LICENSE.TXT - -import json -import os -import tempfile -import warnings -from subprocess import PIPE - -from nltk.internals import _java_options, config_java, find_jar, java -from nltk.parse.corenlp import CoreNLPParser -from nltk.tokenize.api import TokenizerI - -_stanford_url = "https://nlp.stanford.edu/software/tokenizer.shtml" - - -class StanfordTokenizer(TokenizerI): - r""" - Interface to the Stanford Tokenizer - - >>> from nltk.tokenize.stanford import StanfordTokenizer - >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks." - >>> StanfordTokenizer().tokenize(s) # doctest: +SKIP - ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] - >>> s = "The colour of the wall is blue." - >>> StanfordTokenizer(options={"americanize": True}).tokenize(s) # doctest: +SKIP - ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.'] - """ - - _JAR = "stanford-postagger.jar" - - def __init__( - self, - path_to_jar=None, - encoding="utf8", - options=None, - verbose=False, - java_options="-mx1000m", - ): - # Raise deprecation warning. - warnings.warn( - str( - "\nThe StanfordTokenizer will " - "be deprecated in version 3.2.5.\n" - "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead.'" - ), - DeprecationWarning, - stacklevel=2, - ) - - self._stanford_jar = find_jar( - self._JAR, - path_to_jar, - env_vars=("STANFORD_POSTAGGER",), - searchpath=(), - url=_stanford_url, - verbose=verbose, - ) - - self._encoding = encoding - self.java_options = java_options - - options = {} if options is None else options - self._options_cmd = ",".join(f"{key}={val}" for key, val in options.items()) - - @staticmethod - def _parse_tokenized_output(s): - return s.splitlines() - - def tokenize(self, s): - """ - Use stanford tokenizer's PTBTokenizer to tokenize multiple sentences. - """ - cmd = ["edu.stanford.nlp.process.PTBTokenizer"] - return self._parse_tokenized_output(self._execute(cmd, s)) - - def _execute(self, cmd, input_, verbose=False): - encoding = self._encoding - cmd.extend(["-charset", encoding]) - _options_cmd = self._options_cmd - if _options_cmd: - cmd.extend(["-options", self._options_cmd]) - - default_options = " ".join(_java_options) - - # Configure java. - config_java(options=self.java_options, verbose=verbose) - - # Windows is incompatible with NamedTemporaryFile() without passing in delete=False. 
- with tempfile.NamedTemporaryFile(mode="wb", delete=False) as input_file: - # Write the actual sentences to the temporary input file - if isinstance(input_, str) and encoding: - input_ = input_.encode(encoding) - input_file.write(input_) - input_file.flush() - - cmd.append(input_file.name) - - # Run the tagger and get the output. - stdout, stderr = java( - cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE - ) - stdout = stdout.decode(encoding) - - os.unlink(input_file.name) - - # Return java configurations to their default values. - config_java(options=default_options, verbose=False) - - return stdout diff --git a/pipeline/nltk/tokenize/stanford_segmenter.py b/pipeline/nltk/tokenize/stanford_segmenter.py deleted file mode 100644 index ff3f16621e3a3c38ee0265e817b04c655856dd70..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tokenize/stanford_segmenter.py +++ /dev/null @@ -1,292 +0,0 @@ -#!/usr/bin/env python -# Natural Language Toolkit: Interface to the Stanford Segmenter -# for Chinese and Arabic -# -# Copyright (C) 2001-2023 NLTK Project -# Author: 52nlp <52nlpcn@gmail.com> -# Casper Lehmann-Strøm -# Alex Constantin -# -# URL: -# For license information, see LICENSE.TXT - -import json -import os -import tempfile -import warnings -from subprocess import PIPE - -from nltk.internals import ( - _java_options, - config_java, - find_dir, - find_file, - find_jar, - java, -) -from nltk.tokenize.api import TokenizerI - -_stanford_url = "https://nlp.stanford.edu/software" - - -class StanfordSegmenter(TokenizerI): - """Interface to the Stanford Segmenter - - If stanford-segmenter version is older than 2016-10-31, then path_to_slf4j - should be provieded, for example:: - - seg = StanfordSegmenter(path_to_slf4j='/YOUR_PATH/slf4j-api.jar') - - >>> from nltk.tokenize.stanford_segmenter import StanfordSegmenter - >>> seg = StanfordSegmenter() # doctest: +SKIP - >>> seg.default_config('zh') # doctest: +SKIP - >>> sent = u'这是斯坦福中文分词器测试' - >>> print(seg.segment(sent)) # doctest: +SKIP - \u8fd9 \u662f \u65af\u5766\u798f \u4e2d\u6587 \u5206\u8bcd\u5668 \u6d4b\u8bd5 - - >>> seg.default_config('ar') # doctest: +SKIP - >>> sent = u'هذا هو تصنيف ستانفورد العربي للكلمات' - >>> print(seg.segment(sent.split())) # doctest: +SKIP - \u0647\u0630\u0627 \u0647\u0648 \u062a\u0635\u0646\u064a\u0641 \u0633\u062a\u0627\u0646\u0641\u0648\u0631\u062f \u0627\u0644\u0639\u0631\u0628\u064a \u0644 \u0627\u0644\u0643\u0644\u0645\u0627\u062a - - """ - - _JAR = "stanford-segmenter.jar" - - def __init__( - self, - path_to_jar=None, - path_to_slf4j=None, - java_class=None, - path_to_model=None, - path_to_dict=None, - path_to_sihan_corpora_dict=None, - sihan_post_processing="false", - keep_whitespaces="false", - encoding="UTF-8", - options=None, - verbose=False, - java_options="-mx2g", - ): - # Raise deprecation warning. 
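As the deprecation warning above says, the supported route is the CoreNLP client; a hedged sketch of the replacement, assuming a CoreNLP server is already running at the default local address (the URL is an assumption, and nothing here starts the server for you):

from nltk.parse.corenlp import CoreNLPParser

# Assumes a CoreNLP server is already listening, e.g. started with something like
# `java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000`;
# otherwise this call will fail to connect.
parser = CoreNLPParser(url="http://localhost:9000")
print(list(parser.tokenize("Good muffins cost $3.88 in New York.")))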
- warnings.simplefilter("always", DeprecationWarning) - warnings.warn( - str( - "\nThe StanfordTokenizer will " - "be deprecated in version 3.2.5.\n" - "Please use \033[91mnltk.parse.corenlp.CoreNLPTokenizer\033[0m instead.'" - ), - DeprecationWarning, - stacklevel=2, - ) - warnings.simplefilter("ignore", DeprecationWarning) - - stanford_segmenter = find_jar( - self._JAR, - path_to_jar, - env_vars=("STANFORD_SEGMENTER",), - searchpath=(), - url=_stanford_url, - verbose=verbose, - ) - if path_to_slf4j is not None: - slf4j = find_jar( - "slf4j-api.jar", - path_to_slf4j, - env_vars=("SLF4J", "STANFORD_SEGMENTER"), - searchpath=(), - url=_stanford_url, - verbose=verbose, - ) - else: - slf4j = None - - # This is passed to java as the -cp option, the old version of segmenter needs slf4j. - # The new version of stanford-segmenter-2016-10-31 doesn't need slf4j - self._stanford_jar = os.pathsep.join( - _ for _ in [stanford_segmenter, slf4j] if _ is not None - ) - - self._java_class = java_class - self._model = path_to_model - self._sihan_corpora_dict = path_to_sihan_corpora_dict - self._sihan_post_processing = sihan_post_processing - self._keep_whitespaces = keep_whitespaces - self._dict = path_to_dict - - self._encoding = encoding - self.java_options = java_options - options = {} if options is None else options - self._options_cmd = ",".join( - f"{key}={json.dumps(val)}" for key, val in options.items() - ) - - def default_config(self, lang): - """ - Attempt to initialize Stanford Word Segmenter for the specified language - using the STANFORD_SEGMENTER and STANFORD_MODELS environment variables - """ - - search_path = () - if os.environ.get("STANFORD_SEGMENTER"): - search_path = {os.path.join(os.environ.get("STANFORD_SEGMENTER"), "data")} - - # init for Chinese-specific files - self._dict = None - self._sihan_corpora_dict = None - self._sihan_post_processing = "false" - - if lang == "ar": - self._java_class = ( - "edu.stanford.nlp.international.arabic.process.ArabicSegmenter" - ) - model = "arabic-segmenter-atb+bn+arztrain.ser.gz" - - elif lang == "zh": - self._java_class = "edu.stanford.nlp.ie.crf.CRFClassifier" - model = "pku.gz" - self._sihan_post_processing = "true" - - path_to_dict = "dict-chris6.ser.gz" - try: - self._dict = find_file( - path_to_dict, - searchpath=search_path, - url=_stanford_url, - verbose=False, - env_vars=("STANFORD_MODELS",), - ) - except LookupError as e: - raise LookupError( - "Could not find '%s' (tried using env. " - "variables STANFORD_MODELS and /data/)" - % path_to_dict - ) from e - - sihan_dir = "./data/" - try: - path_to_sihan_dir = find_dir( - sihan_dir, - url=_stanford_url, - verbose=False, - env_vars=("STANFORD_SEGMENTER",), - ) - self._sihan_corpora_dict = os.path.join(path_to_sihan_dir, sihan_dir) - except LookupError as e: - raise LookupError( - "Could not find '%s' (tried using the " - "STANFORD_SEGMENTER environment variable)" % sihan_dir - ) from e - else: - raise LookupError(f"Unsupported language {lang}") - - try: - self._model = find_file( - model, - searchpath=search_path, - url=_stanford_url, - verbose=False, - env_vars=("STANFORD_MODELS", "STANFORD_SEGMENTER"), - ) - except LookupError as e: - raise LookupError( - "Could not find '%s' (tried using env. 
" - "variables STANFORD_MODELS and /data/)" % model - ) from e - - def tokenize(self, s): - super().tokenize(s) - - def segment_file(self, input_file_path): - """ """ - cmd = [ - self._java_class, - "-loadClassifier", - self._model, - "-keepAllWhitespaces", - self._keep_whitespaces, - "-textFile", - input_file_path, - ] - if self._sihan_corpora_dict is not None: - cmd.extend( - [ - "-serDictionary", - self._dict, - "-sighanCorporaDict", - self._sihan_corpora_dict, - "-sighanPostProcessing", - self._sihan_post_processing, - ] - ) - - stdout = self._execute(cmd) - - return stdout - - def segment(self, tokens): - return self.segment_sents([tokens]) - - def segment_sents(self, sentences): - """ """ - encoding = self._encoding - # Create a temporary input file - _input_fh, self._input_file_path = tempfile.mkstemp(text=True) - - # Write the actural sentences to the temporary input file - _input_fh = os.fdopen(_input_fh, "wb") - _input = "\n".join(" ".join(x) for x in sentences) - if isinstance(_input, str) and encoding: - _input = _input.encode(encoding) - _input_fh.write(_input) - _input_fh.close() - - cmd = [ - self._java_class, - "-loadClassifier", - self._model, - "-keepAllWhitespaces", - self._keep_whitespaces, - "-textFile", - self._input_file_path, - ] - if self._sihan_corpora_dict is not None: - cmd.extend( - [ - "-serDictionary", - self._dict, - "-sighanCorporaDict", - self._sihan_corpora_dict, - "-sighanPostProcessing", - self._sihan_post_processing, - ] - ) - - stdout = self._execute(cmd) - - # Delete the temporary file - os.unlink(self._input_file_path) - - return stdout - - def _execute(self, cmd, verbose=False): - encoding = self._encoding - cmd.extend(["-inputEncoding", encoding]) - _options_cmd = self._options_cmd - if _options_cmd: - cmd.extend(["-options", self._options_cmd]) - - default_options = " ".join(_java_options) - - # Configure java. - config_java(options=self.java_options, verbose=verbose) - - stdout, _stderr = java( - cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE - ) - stdout = stdout.decode(encoding) - - # Return java configurations to their default values. - config_java(options=default_options, verbose=False) - - return stdout diff --git a/pipeline/nltk/tokenize/texttiling.py b/pipeline/nltk/tokenize/texttiling.py deleted file mode 100644 index b5b770b2d08a998538d85803126e74cc13139d11..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tokenize/texttiling.py +++ /dev/null @@ -1,475 +0,0 @@ -# Natural Language Toolkit: TextTiling -# -# Copyright (C) 2001-2023 NLTK Project -# Author: George Boutsioukis -# -# URL: -# For license information, see LICENSE.TXT - -import math -import re - -try: - import numpy -except ImportError: - pass - -from nltk.tokenize.api import TokenizerI - -BLOCK_COMPARISON, VOCABULARY_INTRODUCTION = 0, 1 -LC, HC = 0, 1 -DEFAULT_SMOOTHING = [0] - - -class TextTilingTokenizer(TokenizerI): - """Tokenize a document into topical sections using the TextTiling algorithm. - This algorithm detects subtopic shifts based on the analysis of lexical - co-occurrence patterns. - - The process starts by tokenizing the text into pseudosentences of - a fixed size w. Then, depending on the method used, similarity - scores are assigned at sentence gaps. The algorithm proceeds by - detecting the peak differences between these scores and marking - them as boundaries. The boundaries are normalized to the closest - paragraph break and the segmented text is returned. 
- - :param w: Pseudosentence size - :type w: int - :param k: Size (in sentences) of the block used in the block comparison method - :type k: int - :param similarity_method: The method used for determining similarity scores: - `BLOCK_COMPARISON` (default) or `VOCABULARY_INTRODUCTION`. - :type similarity_method: constant - :param stopwords: A list of stopwords that are filtered out (defaults to NLTK's stopwords corpus) - :type stopwords: list(str) - :param smoothing_method: The method used for smoothing the score plot: - `DEFAULT_SMOOTHING` (default) - :type smoothing_method: constant - :param smoothing_width: The width of the window used by the smoothing method - :type smoothing_width: int - :param smoothing_rounds: The number of smoothing passes - :type smoothing_rounds: int - :param cutoff_policy: The policy used to determine the number of boundaries: - `HC` (default) or `LC` - :type cutoff_policy: constant - - >>> from nltk.corpus import brown - >>> tt = TextTilingTokenizer(demo_mode=True) - >>> text = brown.raw()[:4000] - >>> s, ss, d, b = tt.tokenize(text) - >>> b - [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0] - """ - - def __init__( - self, - w=20, - k=10, - similarity_method=BLOCK_COMPARISON, - stopwords=None, - smoothing_method=DEFAULT_SMOOTHING, - smoothing_width=2, - smoothing_rounds=1, - cutoff_policy=HC, - demo_mode=False, - ): - - if stopwords is None: - from nltk.corpus import stopwords - - stopwords = stopwords.words("english") - self.__dict__.update(locals()) - del self.__dict__["self"] - - def tokenize(self, text): - """Return a tokenized copy of *text*, where each "token" represents - a separate topic.""" - - lowercase_text = text.lower() - paragraph_breaks = self._mark_paragraph_breaks(text) - text_length = len(lowercase_text) - - # Tokenization step starts here - - # Remove punctuation - nopunct_text = "".join( - c for c in lowercase_text if re.match(r"[a-z\-' \n\t]", c) - ) - nopunct_par_breaks = self._mark_paragraph_breaks(nopunct_text) - - tokseqs = self._divide_to_tokensequences(nopunct_text) - - # The morphological stemming step mentioned in the TextTile - # paper is not implemented. A comment in the original C - # implementation states that it offers no benefit to the - # process. It might be interesting to test the existing - # stemmers though. 
- # words = _stem_words(words) - - # Filter stopwords - for ts in tokseqs: - ts.wrdindex_list = [ - wi for wi in ts.wrdindex_list if wi[0] not in self.stopwords - ] - - token_table = self._create_token_table(tokseqs, nopunct_par_breaks) - # End of the Tokenization step - - # Lexical score determination - if self.similarity_method == BLOCK_COMPARISON: - gap_scores = self._block_comparison(tokseqs, token_table) - elif self.similarity_method == VOCABULARY_INTRODUCTION: - raise NotImplementedError("Vocabulary introduction not implemented") - else: - raise ValueError( - f"Similarity method {self.similarity_method} not recognized" - ) - - if self.smoothing_method == DEFAULT_SMOOTHING: - smooth_scores = self._smooth_scores(gap_scores) - else: - raise ValueError(f"Smoothing method {self.smoothing_method} not recognized") - # End of Lexical score Determination - - # Boundary identification - depth_scores = self._depth_scores(smooth_scores) - segment_boundaries = self._identify_boundaries(depth_scores) - - normalized_boundaries = self._normalize_boundaries( - text, segment_boundaries, paragraph_breaks - ) - # End of Boundary Identification - segmented_text = [] - prevb = 0 - - for b in normalized_boundaries: - if b == 0: - continue - segmented_text.append(text[prevb:b]) - prevb = b - - if prevb < text_length: # append any text that may be remaining - segmented_text.append(text[prevb:]) - - if not segmented_text: - segmented_text = [text] - - if self.demo_mode: - return gap_scores, smooth_scores, depth_scores, segment_boundaries - return segmented_text - - def _block_comparison(self, tokseqs, token_table): - """Implements the block comparison method""" - - def blk_frq(tok, block): - ts_occs = filter(lambda o: o[0] in block, token_table[tok].ts_occurences) - freq = sum(tsocc[1] for tsocc in ts_occs) - return freq - - gap_scores = [] - numgaps = len(tokseqs) - 1 - - for curr_gap in range(numgaps): - score_dividend, score_divisor_b1, score_divisor_b2 = 0.0, 0.0, 0.0 - score = 0.0 - # adjust window size for boundary conditions - if curr_gap < self.k - 1: - window_size = curr_gap + 1 - elif curr_gap > numgaps - self.k: - window_size = numgaps - curr_gap - else: - window_size = self.k - - b1 = [ts.index for ts in tokseqs[curr_gap - window_size + 1 : curr_gap + 1]] - b2 = [ts.index for ts in tokseqs[curr_gap + 1 : curr_gap + window_size + 1]] - - for t in token_table: - score_dividend += blk_frq(t, b1) * blk_frq(t, b2) - score_divisor_b1 += blk_frq(t, b1) ** 2 - score_divisor_b2 += blk_frq(t, b2) ** 2 - try: - score = score_dividend / math.sqrt(score_divisor_b1 * score_divisor_b2) - except ZeroDivisionError: - pass # score += 0.0 - - gap_scores.append(score) - - return gap_scores - - def _smooth_scores(self, gap_scores): - "Wraps the smooth function from the SciPy Cookbook" - return list( - smooth(numpy.array(gap_scores[:]), window_len=self.smoothing_width + 1) - ) - - def _mark_paragraph_breaks(self, text): - """Identifies indented text or line breaks as the beginning of - paragraphs""" - MIN_PARAGRAPH = 100 - pattern = re.compile("[ \t\r\f\v]*\n[ \t\r\f\v]*\n[ \t\r\f\v]*") - matches = pattern.finditer(text) - - last_break = 0 - pbreaks = [0] - for pb in matches: - if pb.start() - last_break < MIN_PARAGRAPH: - continue - else: - pbreaks.append(pb.start()) - last_break = pb.start() - - return pbreaks - - def _divide_to_tokensequences(self, text): - "Divides the text into pseudosentences of fixed size" - w = self.w - wrdindex_list = [] - matches = re.finditer(r"\w+", text) - for match in matches: - 
wrdindex_list.append((match.group(), match.start())) - return [ - TokenSequence(i / w, wrdindex_list[i : i + w]) - for i in range(0, len(wrdindex_list), w) - ] - - def _create_token_table(self, token_sequences, par_breaks): - "Creates a table of TokenTableFields" - token_table = {} - current_par = 0 - current_tok_seq = 0 - pb_iter = par_breaks.__iter__() - current_par_break = next(pb_iter) - if current_par_break == 0: - try: - current_par_break = next(pb_iter) # skip break at 0 - except StopIteration as e: - raise ValueError( - "No paragraph breaks were found(text too short perhaps?)" - ) from e - for ts in token_sequences: - for word, index in ts.wrdindex_list: - try: - while index > current_par_break: - current_par_break = next(pb_iter) - current_par += 1 - except StopIteration: - # hit bottom - pass - - if word in token_table: - token_table[word].total_count += 1 - - if token_table[word].last_par != current_par: - token_table[word].last_par = current_par - token_table[word].par_count += 1 - - if token_table[word].last_tok_seq != current_tok_seq: - token_table[word].last_tok_seq = current_tok_seq - token_table[word].ts_occurences.append([current_tok_seq, 1]) - else: - token_table[word].ts_occurences[-1][1] += 1 - else: # new word - token_table[word] = TokenTableField( - first_pos=index, - ts_occurences=[[current_tok_seq, 1]], - total_count=1, - par_count=1, - last_par=current_par, - last_tok_seq=current_tok_seq, - ) - - current_tok_seq += 1 - - return token_table - - def _identify_boundaries(self, depth_scores): - """Identifies boundaries at the peaks of similarity score - differences""" - - boundaries = [0 for x in depth_scores] - - avg = sum(depth_scores) / len(depth_scores) - stdev = numpy.std(depth_scores) - - if self.cutoff_policy == LC: - cutoff = avg - stdev - else: - cutoff = avg - stdev / 2.0 - - depth_tuples = sorted(zip(depth_scores, range(len(depth_scores)))) - depth_tuples.reverse() - hp = list(filter(lambda x: x[0] > cutoff, depth_tuples)) - - for dt in hp: - boundaries[dt[1]] = 1 - for dt2 in hp: # undo if there is a boundary close already - if ( - dt[1] != dt2[1] - and abs(dt2[1] - dt[1]) < 4 - and boundaries[dt2[1]] == 1 - ): - boundaries[dt[1]] = 0 - return boundaries - - def _depth_scores(self, scores): - """Calculates the depth of each gap, i.e. the average difference - between the left and right peaks and the gap's score""" - - depth_scores = [0 for x in scores] - # clip boundaries: this holds on the rule of thumb(my thumb) - # that a section shouldn't be smaller than at least 2 - # pseudosentences for small texts and around 5 for larger ones. 
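# Editorial aside (not from the deleted file): a worked example of the depth
# score computed just below, on made-up smoothed gap scores. The real loop walks
# outward from the gap while scores keep rising; max() happens to give the same
# peaks for this short toy sequence.
example_scores = [0.5, 0.4, 0.2, 0.45, 0.5]
example_lpeak = max(example_scores[:3])   # left peak around gap index 2 -> 0.5
example_rpeak = max(example_scores[2:])   # right peak around gap index 2 -> 0.5
print(example_lpeak + example_rpeak - 2 * example_scores[2])  # 0.6, a deep valley and a likely boundary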
- - clip = min(max(len(scores) // 10, 2), 5) - index = clip - - for gapscore in scores[clip:-clip]: - lpeak = gapscore - for score in scores[index::-1]: - if score >= lpeak: - lpeak = score - else: - break - rpeak = gapscore - for score in scores[index:]: - if score >= rpeak: - rpeak = score - else: - break - depth_scores[index] = lpeak + rpeak - 2 * gapscore - index += 1 - - return depth_scores - - def _normalize_boundaries(self, text, boundaries, paragraph_breaks): - """Normalize the boundaries identified to the original text's - paragraph breaks""" - - norm_boundaries = [] - char_count, word_count, gaps_seen = 0, 0, 0 - seen_word = False - - for char in text: - char_count += 1 - if char in " \t\n" and seen_word: - seen_word = False - word_count += 1 - if char not in " \t\n" and not seen_word: - seen_word = True - if gaps_seen < len(boundaries) and word_count > ( - max(gaps_seen * self.w, self.w) - ): - if boundaries[gaps_seen] == 1: - # find closest paragraph break - best_fit = len(text) - for br in paragraph_breaks: - if best_fit > abs(br - char_count): - best_fit = abs(br - char_count) - bestbr = br - else: - break - if bestbr not in norm_boundaries: # avoid duplicates - norm_boundaries.append(bestbr) - gaps_seen += 1 - - return norm_boundaries - - -class TokenTableField: - """A field in the token table holding parameters for each token, - used later in the process""" - - def __init__( - self, - first_pos, - ts_occurences, - total_count=1, - par_count=1, - last_par=0, - last_tok_seq=None, - ): - self.__dict__.update(locals()) - del self.__dict__["self"] - - -class TokenSequence: - "A token list with its original length and its index" - - def __init__(self, index, wrdindex_list, original_length=None): - original_length = original_length or len(wrdindex_list) - self.__dict__.update(locals()) - del self.__dict__["self"] - - -# Pasted from the SciPy cookbook: https://www.scipy.org/Cookbook/SignalSmooth -def smooth(x, window_len=11, window="flat"): - """smooth the data using a window with requested size. - - This method is based on the convolution of a scaled window with the signal. - The signal is prepared by introducing reflected copies of the signal - (with the window size) in both ends so that transient parts are minimized - in the beginning and end part of the output signal. - - :param x: the input signal - :param window_len: the dimension of the smoothing window; should be an odd integer - :param window: the type of window from 'flat', 'hanning', 'hamming', 'bartlett', 'blackman' - flat window will produce a moving average smoothing. - - :return: the smoothed signal - - example:: - - t=linspace(-2,2,0.1) - x=sin(t)+randn(len(t))*0.1 - y=smooth(x) - - :see also: numpy.hanning, numpy.hamming, numpy.bartlett, numpy.blackman, numpy.convolve, - scipy.signal.lfilter - - TODO: the window parameter could be the window itself if an array instead of a string - """ - - if x.ndim != 1: - raise ValueError("smooth only accepts 1 dimension arrays.") - - if x.size < window_len: - raise ValueError("Input vector needs to be bigger than window size.") - - if window_len < 3: - return x - - if window not in ["flat", "hanning", "hamming", "bartlett", "blackman"]: - raise ValueError( - "Window is on of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'" - ) - - s = numpy.r_[2 * x[0] - x[window_len:1:-1], x, 2 * x[-1] - x[-1:-window_len:-1]] - - # print(len(s)) - if window == "flat": # moving average - w = numpy.ones(window_len, "d") - else: - w = eval("numpy." 
+ window + "(window_len)") - - y = numpy.convolve(w / w.sum(), s, mode="same") - - return y[window_len - 1 : -window_len + 1] - - -def demo(text=None): - from matplotlib import pylab - - from nltk.corpus import brown - - tt = TextTilingTokenizer(demo_mode=True) - if text is None: - text = brown.raw()[:10000] - s, ss, d, b = tt.tokenize(text) - pylab.xlabel("Sentence Gap index") - pylab.ylabel("Gap Scores") - pylab.plot(range(len(s)), s, label="Gap Scores") - pylab.plot(range(len(ss)), ss, label="Smoothed Gap scores") - pylab.plot(range(len(d)), d, label="Depth scores") - pylab.stem(range(len(b)), b) - pylab.legend() - pylab.show() diff --git a/pipeline/nltk/tokenize/toktok.py b/pipeline/nltk/tokenize/toktok.py deleted file mode 100644 index 4229a7327743ad9788449a82c8d2350b9c8db392..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tokenize/toktok.py +++ /dev/null @@ -1,179 +0,0 @@ -# Natural Language Toolkit: Python port of the tok-tok.pl tokenizer. -# -# Copyright (C) 2001-2015 NLTK Project -# Author: Jon Dehdari -# Contributors: Liling Tan, Selcuk Ayguney, ikegami, Martijn Pieters -# -# URL: -# For license information, see LICENSE.TXT - -""" -The tok-tok tokenizer is a simple, general tokenizer, where the input has one -sentence per line; thus only final period is tokenized. - -Tok-tok has been tested on, and gives reasonably good results for English, -Persian, Russian, Czech, French, German, Vietnamese, Tajik, and a few others. -The input should be in UTF-8 encoding. - -Reference: -Jon Dehdari. 2014. A Neurophysiologically-Inspired Statistical Language -Model (Doctoral dissertation). Columbus, OH, USA: The Ohio State University. -""" - -import re - -from nltk.tokenize.api import TokenizerI - - -class ToktokTokenizer(TokenizerI): - """ - This is a Python port of the tok-tok.pl from - https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl - - >>> toktok = ToktokTokenizer() - >>> text = u'Is 9.5 or 525,600 my favorite number?' - >>> print(toktok.tokenize(text, return_str=True)) - Is 9.5 or 525,600 my favorite number ? - >>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things' - >>> print(toktok.tokenize(text, return_str=True)) - The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things - >>> text = u'\xa1This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf' - >>> expected = u'\xa1 This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf' - >>> assert toktok.tokenize(text, return_str=True) == expected - >>> toktok.tokenize(text) == [u'\xa1', u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'\xbb', u'symbols', u'\u2026', u'appearing', u'everywhere', u'\xbf'] - True - """ - - # Replace non-breaking spaces with normal spaces. - NON_BREAKING = re.compile("\u00A0"), " " - - # Pad some funky punctuation. - FUNKY_PUNCT_1 = re.compile(r'([،;؛¿!"\])}»›”؟¡%٪°±©®।॥…])'), r" \1 " - # Pad more funky punctuation. - FUNKY_PUNCT_2 = re.compile(r"([({\[“‘„‚«‹「『])"), r" \1 " - # Pad En dash and em dash - EN_EM_DASHES = re.compile("([–—])"), r" \1 " - - # Replace problematic character with numeric character reference. - AMPERCENT = re.compile("& "), "& " - TAB = re.compile("\t"), " " - PIPE = re.compile(r"\|"), " | " - - # Pad numbers with commas to keep them from further tokenization. - COMMA_IN_NUM = re.compile(r"(? "something ..." - # "something." -> "something ." 
- FINAL_PERIOD_1 = re.compile(r"(? "... stuff ." - FINAL_PERIOD_2 = re.compile(r"""(? -# Michael Heilman (re-port from http://www.cis.upenn.edu/~treebank/tokenizer.sed) -# Tom Aarsen <> (modifications) -# -# URL: -# For license information, see LICENSE.TXT - -r""" - -Penn Treebank Tokenizer - -The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank. -This implementation is a port of the tokenizer sed script written by Robert McIntyre -and available at http://www.cis.upenn.edu/~treebank/tokenizer.sed. -""" - -import re -import warnings -from typing import Iterator, List, Tuple - -from nltk.tokenize.api import TokenizerI -from nltk.tokenize.destructive import MacIntyreContractions -from nltk.tokenize.util import align_tokens - - -class TreebankWordTokenizer(TokenizerI): - r""" - The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank. - - This tokenizer performs the following steps: - - - split standard contractions, e.g. ``don't`` -> ``do n't`` and ``they'll`` -> ``they 'll`` - - treat most punctuation characters as separate tokens - - split off commas and single quotes, when followed by whitespace - - separate periods that appear at the end of line - - >>> from nltk.tokenize import TreebankWordTokenizer - >>> s = '''Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.''' - >>> TreebankWordTokenizer().tokenize(s) - ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.'] - >>> s = "They'll save and invest more." - >>> TreebankWordTokenizer().tokenize(s) - ['They', "'ll", 'save', 'and', 'invest', 'more', '.'] - >>> s = "hi, my name can't hello," - >>> TreebankWordTokenizer().tokenize(s) - ['hi', ',', 'my', 'name', 'ca', "n't", 'hello', ','] - """ - - # starting quotes - STARTING_QUOTES = [ - (re.compile(r"^\""), r"``"), - (re.compile(r"(``)"), r" \1 "), - (re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "), - ] - - # punctuation - PUNCTUATION = [ - (re.compile(r"([:,])([^\d])"), r" \1 \2"), - (re.compile(r"([:,])$"), r" \1 "), - (re.compile(r"\.\.\."), r" ... "), - (re.compile(r"[;@#$%&]"), r" \g<0> "), - ( - re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'), - r"\1 \2\3 ", - ), # Handles the final period. - (re.compile(r"[?!]"), r" \g<0> "), - (re.compile(r"([^'])' "), r"\1 ' "), - ] - - # Pads parentheses - PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ") - - # Optionally: Convert parentheses, brackets and converts them to PTB symbols. - CONVERT_PARENTHESES = [ - (re.compile(r"\("), "-LRB-"), - (re.compile(r"\)"), "-RRB-"), - (re.compile(r"\["), "-LSB-"), - (re.compile(r"\]"), "-RSB-"), - (re.compile(r"\{"), "-LCB-"), - (re.compile(r"\}"), "-RCB-"), - ] - - DOUBLE_DASHES = (re.compile(r"--"), r" -- ") - - # ending quotes - ENDING_QUOTES = [ - (re.compile(r"''"), " '' "), - (re.compile(r'"'), " '' "), - (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "), - (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "), - ] - - # List of contractions adapted from Robert MacIntyre's tokenizer. - _contractions = MacIntyreContractions() - CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2)) - CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3)) - - def tokenize( - self, text: str, convert_parentheses: bool = False, return_str: bool = False - ) -> List[str]: - r"""Return a tokenized copy of `text`. 
- - >>> from nltk.tokenize import TreebankWordTokenizer - >>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York. Please buy me\ntwo of them.\nThanks.''' - >>> TreebankWordTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE - ['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36', - 'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', - 'of', 'them.', 'Thanks', '.'] - >>> TreebankWordTokenizer().tokenize(s, convert_parentheses=True) # doctest: +NORMALIZE_WHITESPACE - ['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36', - 'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', - 'of', 'them.', 'Thanks', '.'] - - :param text: A string with a sentence or sentences. - :type text: str - :param convert_parentheses: if True, replace parentheses to PTB symbols, - e.g. `(` to `-LRB-`. Defaults to False. - :type convert_parentheses: bool, optional - :param return_str: If True, return tokens as space-separated string, - defaults to False. - :type return_str: bool, optional - :return: List of tokens from `text`. - :rtype: List[str] - """ - if return_str is not False: - warnings.warn( - "Parameter 'return_str' has been deprecated and should no " - "longer be used.", - category=DeprecationWarning, - stacklevel=2, - ) - - for regexp, substitution in self.STARTING_QUOTES: - text = regexp.sub(substitution, text) - - for regexp, substitution in self.PUNCTUATION: - text = regexp.sub(substitution, text) - - # Handles parentheses. - regexp, substitution = self.PARENS_BRACKETS - text = regexp.sub(substitution, text) - # Optionally convert parentheses - if convert_parentheses: - for regexp, substitution in self.CONVERT_PARENTHESES: - text = regexp.sub(substitution, text) - - # Handles double dash. - regexp, substitution = self.DOUBLE_DASHES - text = regexp.sub(substitution, text) - - # add extra space to make things easier - text = " " + text + " " - - for regexp, substitution in self.ENDING_QUOTES: - text = regexp.sub(substitution, text) - - for regexp in self.CONTRACTIONS2: - text = regexp.sub(r" \1 \2 ", text) - for regexp in self.CONTRACTIONS3: - text = regexp.sub(r" \1 \2 ", text) - - # We are not using CONTRACTIONS4 since - # they are also commented out in the SED scripts - # for regexp in self._contractions.CONTRACTIONS4: - # text = regexp.sub(r' \1 \2 \3 ', text) - - return text.split() - - def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]: - r""" - Returns the spans of the tokens in ``text``. - Uses the post-hoc nltk.tokens.align_tokens to return the offset spans. - - >>> from nltk.tokenize import TreebankWordTokenizer - >>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).''' - >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23), - ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38), - ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59), - ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)] - >>> list(TreebankWordTokenizer().span_tokenize(s)) == expected - True - >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in', - ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')', - ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.'] - >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected - True - - :param text: A string with a sentence or sentences. 
- :type text: str - :yield: Tuple[int, int] - """ - raw_tokens = self.tokenize(text) - - # Convert converted quotes back to original double quotes - # Do this only if original text contains double quote(s) or double - # single-quotes (because '' might be transformed to `` if it is - # treated as starting quotes). - if ('"' in text) or ("''" in text): - # Find double quotes and converted quotes - matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)] - - # Replace converted quotes back to double quotes - tokens = [ - matched.pop(0) if tok in ['"', "``", "''"] else tok - for tok in raw_tokens - ] - else: - tokens = raw_tokens - - yield from align_tokens(tokens, text) - - -class TreebankWordDetokenizer(TokenizerI): - r""" - The Treebank detokenizer uses the reverse regex operations corresponding to - the Treebank tokenizer's regexes. - - Note: - - - There're additional assumption mades when undoing the padding of ``[;@#$%&]`` - punctuation symbols that isn't presupposed in the TreebankTokenizer. - - There're additional regexes added in reversing the parentheses tokenization, - such as the ``r'([\]\)\}\>])\s([:;,.])'``, which removes the additional right - padding added to the closing parentheses precedding ``[:;,.]``. - - It's not possible to return the original whitespaces as they were because - there wasn't explicit records of where `'\n'`, `'\t'` or `'\s'` were removed at - the text.split() operation. - - >>> from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer - >>> s = '''Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.''' - >>> d = TreebankWordDetokenizer() - >>> t = TreebankWordTokenizer() - >>> toks = t.tokenize(s) - >>> d.detokenize(toks) - 'Good muffins cost $3.88 in New York. Please buy me two of them. Thanks.' - - The MXPOST parentheses substitution can be undone using the ``convert_parentheses`` - parameter: - - >>> s = '''Good muffins cost $3.88\nin New (York). Please (buy) me\ntwo of them.\n(Thanks).''' - >>> expected_tokens = ['Good', 'muffins', 'cost', '$', '3.88', 'in', - ... 'New', '-LRB-', 'York', '-RRB-', '.', 'Please', '-LRB-', 'buy', - ... '-RRB-', 'me', 'two', 'of', 'them.', '-LRB-', 'Thanks', '-RRB-', '.'] - >>> expected_tokens == t.tokenize(s, convert_parentheses=True) - True - >>> expected_detoken = 'Good muffins cost $3.88 in New (York). Please (buy) me two of them. (Thanks).' - >>> expected_detoken == d.detokenize(t.tokenize(s, convert_parentheses=True), convert_parentheses=True) - True - - During tokenization it's safe to add more spaces but during detokenization, - simply undoing the padding doesn't really help. - - - During tokenization, left and right pad is added to ``[!?]``, when - detokenizing, only left shift the ``[!?]`` is needed. - Thus ``(re.compile(r'\s([?!])'), r'\g<1>')``. - - - During tokenization ``[:,]`` are left and right padded but when detokenizing, - only left shift is necessary and we keep right pad after comma/colon - if the string after is a non-digit. - Thus ``(re.compile(r'\s([:,])\s([^\d])'), r'\1 \2')``. - - >>> from nltk.tokenize.treebank import TreebankWordDetokenizer - >>> toks = ['hello', ',', 'i', 'ca', "n't", 'feel', 'my', 'feet', '!', 'Help', '!', '!'] - >>> twd = TreebankWordDetokenizer() - >>> twd.detokenize(toks) - "hello, i can't feel my feet! Help!!" - - >>> toks = ['hello', ',', 'i', "can't", 'feel', ';', 'my', 'feet', '!', - ... 'Help', '!', '!', 'He', 'said', ':', 'Help', ',', 'help', '?', '!'] - >>> twd.detokenize(toks) - "hello, i can't feel; my feet! 
Help!! He said: Help, help?!" - """ - - _contractions = MacIntyreContractions() - CONTRACTIONS2 = [ - re.compile(pattern.replace("(?#X)", r"\s")) - for pattern in _contractions.CONTRACTIONS2 - ] - CONTRACTIONS3 = [ - re.compile(pattern.replace("(?#X)", r"\s")) - for pattern in _contractions.CONTRACTIONS3 - ] - - # ending quotes - ENDING_QUOTES = [ - (re.compile(r"([^' ])\s('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1\2 "), - (re.compile(r"([^' ])\s('[sS]|'[mM]|'[dD]|') "), r"\1\2 "), - (re.compile(r"(\S)\s(\'\')"), r"\1\2"), - ( - re.compile(r"(\'\')\s([.,:)\]>};%])"), - r"\1\2", - ), # Quotes followed by no-left-padded punctuations. - (re.compile(r"''"), '"'), - ] - - # Handles double dashes - DOUBLE_DASHES = (re.compile(r" -- "), r"--") - - # Optionally: Convert parentheses, brackets and converts them from PTB symbols. - CONVERT_PARENTHESES = [ - (re.compile("-LRB-"), "("), - (re.compile("-RRB-"), ")"), - (re.compile("-LSB-"), "["), - (re.compile("-RSB-"), "]"), - (re.compile("-LCB-"), "{"), - (re.compile("-RCB-"), "}"), - ] - - # Undo padding on parentheses. - PARENS_BRACKETS = [ - (re.compile(r"([\[\(\{\<])\s"), r"\g<1>"), - (re.compile(r"\s([\]\)\}\>])"), r"\g<1>"), - (re.compile(r"([\]\)\}\>])\s([:;,.])"), r"\1\2"), - ] - - # punctuation - PUNCTUATION = [ - (re.compile(r"([^'])\s'\s"), r"\1' "), - (re.compile(r"\s([?!])"), r"\g<1>"), # Strip left pad for [?!] - # (re.compile(r'\s([?!])\s'), r'\g<1>'), - (re.compile(r'([^\.])\s(\.)([\]\)}>"\']*)\s*$'), r"\1\2\3"), - # When tokenizing, [;@#$%&] are padded with whitespace regardless of - # whether there are spaces before or after them. - # But during detokenization, we need to distinguish between left/right - # pad, so we split this up. - (re.compile(r"([#$])\s"), r"\g<1>"), # Left pad. - (re.compile(r"\s([;%])"), r"\g<1>"), # Right pad. - # (re.compile(r"\s([&*])\s"), r" \g<1> "), # Unknown pad. - (re.compile(r"\s\.\.\.\s"), r"..."), - # (re.compile(r"\s([:,])\s$"), r"\1"), # .strip() takes care of it. - ( - re.compile(r"\s([:,])"), - r"\1", - ), # Just remove left padding. Punctuation in numbers won't be padded. - ] - - # starting quotes - STARTING_QUOTES = [ - (re.compile(r"([ (\[{<])\s``"), r"\1``"), - (re.compile(r"(``)\s"), r"\1"), - (re.compile(r"``"), r'"'), - ] - - def tokenize(self, tokens: List[str], convert_parentheses: bool = False) -> str: - """ - Treebank detokenizer, created by undoing the regexes from - the TreebankWordTokenizer.tokenize. - - :param tokens: A list of strings, i.e. tokenized text. - :type tokens: List[str] - :param convert_parentheses: if True, replace PTB symbols with parentheses, - e.g. `-LRB-` to `(`. Defaults to False. - :type convert_parentheses: bool, optional - :return: str - """ - text = " ".join(tokens) - - # Add extra space to make things easier - text = " " + text + " " - - # Reverse the contractions regexes. - # Note: CONTRACTIONS4 are not used in tokenization. - for regexp in self.CONTRACTIONS3: - text = regexp.sub(r"\1\2", text) - for regexp in self.CONTRACTIONS2: - text = regexp.sub(r"\1\2", text) - - # Reverse the regexes applied for ending quotes. - for regexp, substitution in self.ENDING_QUOTES: - text = regexp.sub(substitution, text) - - # Undo the space padding. - text = text.strip() - - # Reverse the padding on double dashes. 
- regexp, substitution = self.DOUBLE_DASHES - text = regexp.sub(substitution, text) - - if convert_parentheses: - for regexp, substitution in self.CONVERT_PARENTHESES: - text = regexp.sub(substitution, text) - - # Reverse the padding regexes applied for parenthesis/brackets. - for regexp, substitution in self.PARENS_BRACKETS: - text = regexp.sub(substitution, text) - - # Reverse the regexes applied for punctuations. - for regexp, substitution in self.PUNCTUATION: - text = regexp.sub(substitution, text) - - # Reverse the regexes applied for starting quotes. - for regexp, substitution in self.STARTING_QUOTES: - text = regexp.sub(substitution, text) - - return text.strip() - - def detokenize(self, tokens: List[str], convert_parentheses: bool = False) -> str: - """Duck-typing the abstract *tokenize()*.""" - return self.tokenize(tokens, convert_parentheses) diff --git a/pipeline/nltk/tokenize/util.py b/pipeline/nltk/tokenize/util.py deleted file mode 100644 index e496e0169aa89569b8f0428096b972d4776a0b2e..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tokenize/util.py +++ /dev/null @@ -1,295 +0,0 @@ -# Natural Language Toolkit: Tokenizer Utilities -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# URL: -# For license information, see LICENSE.TXT - -from re import finditer -from xml.sax.saxutils import escape, unescape - - -def string_span_tokenize(s, sep): - r""" - Return the offsets of the tokens in *s*, as a sequence of ``(start, end)`` - tuples, by splitting the string at each occurrence of *sep*. - - >>> from nltk.tokenize.util import string_span_tokenize - >>> s = '''Good muffins cost $3.88\nin New York. Please buy me - ... two of them.\n\nThanks.''' - >>> list(string_span_tokenize(s, " ")) # doctest: +NORMALIZE_WHITESPACE - [(0, 4), (5, 12), (13, 17), (18, 26), (27, 30), (31, 36), (37, 37), - (38, 44), (45, 48), (49, 55), (56, 58), (59, 73)] - - :param s: the string to be tokenized - :type s: str - :param sep: the token separator - :type sep: str - :rtype: iter(tuple(int, int)) - """ - if len(sep) == 0: - raise ValueError("Token delimiter must not be empty") - left = 0 - while True: - try: - right = s.index(sep, left) - if right != 0: - yield left, right - except ValueError: - if left != len(s): - yield left, len(s) - break - - left = right + len(sep) - - -def regexp_span_tokenize(s, regexp): - r""" - Return the offsets of the tokens in *s*, as a sequence of ``(start, end)`` - tuples, by splitting the string at each successive match of *regexp*. - - >>> from nltk.tokenize.util import regexp_span_tokenize - >>> s = '''Good muffins cost $3.88\nin New York. Please buy me - ... two of them.\n\nThanks.''' - >>> list(regexp_span_tokenize(s, r'\s')) # doctest: +NORMALIZE_WHITESPACE - [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), - (38, 44), (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)] - - :param s: the string to be tokenized - :type s: str - :param regexp: regular expression that matches token separators (must not be empty) - :type regexp: str - :rtype: iter(tuple(int, int)) - """ - left = 0 - for m in finditer(regexp, s): - right, next = m.span() - if right != left: - yield left, right - left = next - yield left, len(s) - - -def spans_to_relative(spans): - r""" - Return a sequence of relative spans, given a sequence of spans. - - >>> from nltk.tokenize import WhitespaceTokenizer - >>> from nltk.tokenize.util import spans_to_relative - >>> s = '''Good muffins cost $3.88\nin New York. Please buy me - ... 
two of them.\n\nThanks.''' - >>> list(spans_to_relative(WhitespaceTokenizer().span_tokenize(s))) # doctest: +NORMALIZE_WHITESPACE - [(0, 4), (1, 7), (1, 4), (1, 5), (1, 2), (1, 3), (1, 5), (2, 6), - (1, 3), (1, 2), (1, 3), (1, 2), (1, 5), (2, 7)] - - :param spans: a sequence of (start, end) offsets of the tokens - :type spans: iter(tuple(int, int)) - :rtype: iter(tuple(int, int)) - """ - prev = 0 - for left, right in spans: - yield left - prev, right - left - prev = right - - -class CJKChars: - """ - An object that enumerates the code points of the CJK characters as listed on - https://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane - - This is a Python port of the CJK code point enumerations of Moses tokenizer: - https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/detokenizer.perl#L309 - """ - - # Hangul Jamo (1100–11FF) - Hangul_Jamo = (4352, 4607) # (ord(u"\u1100"), ord(u"\u11ff")) - - # CJK Radicals Supplement (2E80–2EFF) - # Kangxi Radicals (2F00–2FDF) - # Ideographic Description Characters (2FF0–2FFF) - # CJK Symbols and Punctuation (3000–303F) - # Hiragana (3040–309F) - # Katakana (30A0–30FF) - # Bopomofo (3100–312F) - # Hangul Compatibility Jamo (3130–318F) - # Kanbun (3190–319F) - # Bopomofo Extended (31A0–31BF) - # CJK Strokes (31C0–31EF) - # Katakana Phonetic Extensions (31F0–31FF) - # Enclosed CJK Letters and Months (3200–32FF) - # CJK Compatibility (3300–33FF) - # CJK Unified Ideographs Extension A (3400–4DBF) - # Yijing Hexagram Symbols (4DC0–4DFF) - # CJK Unified Ideographs (4E00–9FFF) - # Yi Syllables (A000–A48F) - # Yi Radicals (A490–A4CF) - CJK_Radicals = (11904, 42191) # (ord(u"\u2e80"), ord(u"\ua4cf")) - - # Phags-pa (A840–A87F) - Phags_Pa = (43072, 43135) # (ord(u"\ua840"), ord(u"\ua87f")) - - # Hangul Syllables (AC00–D7AF) - Hangul_Syllables = (44032, 55215) # (ord(u"\uAC00"), ord(u"\uD7AF")) - - # CJK Compatibility Ideographs (F900–FAFF) - CJK_Compatibility_Ideographs = (63744, 64255) # (ord(u"\uF900"), ord(u"\uFAFF")) - - # CJK Compatibility Forms (FE30–FE4F) - CJK_Compatibility_Forms = (65072, 65103) # (ord(u"\uFE30"), ord(u"\uFE4F")) - - # Range U+FF65–FFDC encodes halfwidth forms, of Katakana and Hangul characters - Katakana_Hangul_Halfwidth = (65381, 65500) # (ord(u"\uFF65"), ord(u"\uFFDC")) - - # Supplementary Ideographic Plane 20000–2FFFF - Supplementary_Ideographic_Plane = ( - 131072, - 196607, - ) # (ord(u"\U00020000"), ord(u"\U0002FFFF")) - - ranges = [ - Hangul_Jamo, - CJK_Radicals, - Phags_Pa, - Hangul_Syllables, - CJK_Compatibility_Ideographs, - CJK_Compatibility_Forms, - Katakana_Hangul_Halfwidth, - Supplementary_Ideographic_Plane, - ] - - -def is_cjk(character): - """ - Python port of Moses' code to check for CJK character. - - >>> CJKChars().ranges - [(4352, 4607), (11904, 42191), (43072, 43135), (44032, 55215), (63744, 64255), (65072, 65103), (65381, 65500), (131072, 196607)] - >>> is_cjk(u'\u33fe') - True - >>> is_cjk(u'\uFE5F') - False - - :param character: The character that needs to be checked. - :type character: char - :return: bool - """ - return any( - [ - start <= ord(character) <= end - for start, end in [ - (4352, 4607), - (11904, 42191), - (43072, 43135), - (44032, 55215), - (63744, 64255), - (65072, 65103), - (65381, 65500), - (131072, 196607), - ] - ] - ) - - -def xml_escape(text): - """ - This function transforms the input text into an "escaped" version suitable - for well-formed XML formatting. 
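A short, hedged illustration of the escaping being described; the pipe and bracket entities follow Moses' conventions, and the exact mapping below is an assumption for illustration rather than a quote from this file:

from xml.sax.saxutils import escape

# Assumed Moses-style additions beyond the default handling of &, <, >.
moses_entities = {"'": "&apos;", '"': "&quot;", "|": "&#124;", "[": "&#91;", "]": "&#93;"}
print(escape("a < b & c"))                    # a &lt; b &amp; c
print(escape('|x| [1] "q"', moses_entities))  # &#124;x&#124; &#91;1&#93; &quot;q&quot;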
- - Note that the default xml.sax.saxutils.escape() function don't escape - some characters that Moses does so we have to manually add them to the - entities dictionary. - - >>> input_str = ''')| & < > ' " ] [''' - >>> expected_output = ''')| & < > ' " ] [''' - >>> escape(input_str) == expected_output - True - >>> xml_escape(input_str) - ')| & < > ' " ] [' - - :param text: The text that needs to be escaped. - :type text: str - :rtype: str - """ - return escape( - text, - entities={ - r"'": r"'", - r'"': r""", - r"|": r"|", - r"[": r"[", - r"]": r"]", - }, - ) - - -def xml_unescape(text): - """ - This function transforms the "escaped" version suitable - for well-formed XML formatting into humanly-readable string. - - Note that the default xml.sax.saxutils.unescape() function don't unescape - some characters that Moses does so we have to manually add them to the - entities dictionary. - - >>> from xml.sax.saxutils import unescape - >>> s = ')| & < > ' " ] [' - >>> expected = ''')| & < > \' " ] [''' - >>> xml_unescape(s) == expected - True - - :param text: The text that needs to be unescaped. - :type text: str - :rtype: str - """ - return unescape( - text, - entities={ - r"'": r"'", - r""": r'"', - r"|": r"|", - r"[": r"[", - r"]": r"]", - }, - ) - - -def align_tokens(tokens, sentence): - """ - This module attempt to find the offsets of the tokens in *s*, as a sequence - of ``(start, end)`` tuples, given the tokens and also the source string. - - >>> from nltk.tokenize import TreebankWordTokenizer - >>> from nltk.tokenize.util import align_tokens - >>> s = str("The plane, bound for St Petersburg, crashed in Egypt's " - ... "Sinai desert just 23 minutes after take-off from Sharm el-Sheikh " - ... "on Saturday.") - >>> tokens = TreebankWordTokenizer().tokenize(s) - >>> expected = [(0, 3), (4, 9), (9, 10), (11, 16), (17, 20), (21, 23), - ... (24, 34), (34, 35), (36, 43), (44, 46), (47, 52), (52, 54), - ... (55, 60), (61, 67), (68, 72), (73, 75), (76, 83), (84, 89), - ... (90, 98), (99, 103), (104, 109), (110, 119), (120, 122), - ... (123, 131), (131, 132)] - >>> output = list(align_tokens(tokens, s)) - >>> len(tokens) == len(expected) == len(output) # Check that length of tokens and tuples are the same. - True - >>> expected == list(align_tokens(tokens, s)) # Check that the output is as expected. - True - >>> tokens == [s[start:end] for start, end in output] # Check that the slices of the string corresponds to the tokens. - True - - :param tokens: The list of strings that are the result of tokenization - :type tokens: list(str) - :param sentence: The original string - :type sentence: str - :rtype: list(tuple(int,int)) - """ - point = 0 - offsets = [] - for token in tokens: - try: - start = sentence.index(token, point) - except ValueError as e: - raise ValueError(f'substring "{token}" not found in "{sentence}"') from e - point = start + len(token) - offsets.append((start, point)) - return offsets diff --git a/pipeline/nltk/toolbox.py b/pipeline/nltk/toolbox.py deleted file mode 100644 index 40155cbaec4f2554a26e1762f7b86bd7eeefb5b9..0000000000000000000000000000000000000000 --- a/pipeline/nltk/toolbox.py +++ /dev/null @@ -1,524 +0,0 @@ -# Natural Language Toolkit: Toolbox Reader -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Greg Aumann -# URL: -# For license information, see LICENSE.TXT - -""" -Module for reading, writing and manipulating -Toolbox databases and settings files. 
-""" - -import codecs -import re -from io import StringIO -from xml.etree.ElementTree import Element, ElementTree, SubElement, TreeBuilder - -from nltk.data import PathPointer, find - - -class StandardFormat: - """ - Class for reading and processing standard format marker files and strings. - """ - - def __init__(self, filename=None, encoding=None): - self._encoding = encoding - if filename is not None: - self.open(filename) - - def open(self, sfm_file): - """ - Open a standard format marker file for sequential reading. - - :param sfm_file: name of the standard format marker input file - :type sfm_file: str - """ - if isinstance(sfm_file, PathPointer): - self._file = sfm_file.open(self._encoding) - else: - self._file = codecs.open(sfm_file, "r", self._encoding) - - def open_string(self, s): - """ - Open a standard format marker string for sequential reading. - - :param s: string to parse as a standard format marker input file - :type s: str - """ - self._file = StringIO(s) - - def raw_fields(self): - """ - Return an iterator that returns the next field in a (marker, value) - tuple. Linebreaks and trailing white space are preserved except - for the final newline in each field. - - :rtype: iter(tuple(str, str)) - """ - join_string = "\n" - line_regexp = r"^%s(?:\\(\S+)\s*)?(.*)$" - # discard a BOM in the first line - first_line_pat = re.compile(line_regexp % "(?:\xef\xbb\xbf)?") - line_pat = re.compile(line_regexp % "") - # need to get first line outside the loop for correct handling - # of the first marker if it spans multiple lines - file_iter = iter(self._file) - # PEP 479, prevent RuntimeError when StopIteration is raised inside generator - try: - line = next(file_iter) - except StopIteration: - # no more data is available, terminate the generator - return - mobj = re.match(first_line_pat, line) - mkr, line_value = mobj.groups() - value_lines = [line_value] - self.line_num = 0 - for line in file_iter: - self.line_num += 1 - mobj = re.match(line_pat, line) - line_mkr, line_value = mobj.groups() - if line_mkr: - yield (mkr, join_string.join(value_lines)) - mkr = line_mkr - value_lines = [line_value] - else: - value_lines.append(line_value) - self.line_num += 1 - yield (mkr, join_string.join(value_lines)) - - def fields( - self, - strip=True, - unwrap=True, - encoding=None, - errors="strict", - unicode_fields=None, - ): - """ - Return an iterator that returns the next field in a ``(marker, value)`` - tuple, where ``marker`` and ``value`` are unicode strings if an ``encoding`` - was specified in the ``fields()`` method. Otherwise they are non-unicode strings. - - :param strip: strip trailing whitespace from the last line of each field - :type strip: bool - :param unwrap: Convert newlines in a field to spaces. - :type unwrap: bool - :param encoding: Name of an encoding to use. If it is specified then - the ``fields()`` method returns unicode strings rather than non - unicode strings. - :type encoding: str or None - :param errors: Error handling scheme for codec. Same as the ``decode()`` - builtin string method. - :type errors: str - :param unicode_fields: Set of marker names whose values are UTF-8 encoded. - Ignored if encoding is None. If the whole file is UTF-8 encoded set - ``encoding='utf8'`` and leave ``unicode_fields`` with its default - value of None. 
- :type unicode_fields: sequence - :rtype: iter(tuple(str, str)) - """ - if encoding is None and unicode_fields is not None: - raise ValueError("unicode_fields is set but not encoding.") - unwrap_pat = re.compile(r"\n+") - for mkr, val in self.raw_fields(): - if unwrap: - val = unwrap_pat.sub(" ", val) - if strip: - val = val.rstrip() - yield (mkr, val) - - def close(self): - """Close a previously opened standard format marker file or string.""" - self._file.close() - try: - del self.line_num - except AttributeError: - pass - - -class ToolboxData(StandardFormat): - def parse(self, grammar=None, **kwargs): - if grammar: - return self._chunk_parse(grammar=grammar, **kwargs) - else: - return self._record_parse(**kwargs) - - def _record_parse(self, key=None, **kwargs): - r""" - Returns an element tree structure corresponding to a toolbox data file with - all markers at the same level. - - Thus the following Toolbox database:: - \_sh v3.0 400 Rotokas Dictionary - \_DateStampHasFourDigitYear - - \lx kaa - \ps V.A - \ge gag - \gp nek i pas - - \lx kaa - \ps V.B - \ge strangle - \gp pasim nek - - after parsing will end up with the same structure (ignoring the extra - whitespace) as the following XML fragment after being parsed by - ElementTree:: - -
    - <toolbox_data> - <header> - <_sh>v3.0 400 Rotokas Dictionary</_sh> - <_DateStampHasFourDigitYear/> - </header> -
    - - <record> - <lx>kaa</lx> - <ps>V.A</ps> - <ge>gag</ge> - <gp>nek i pas</gp> - </record> - - <record> - <lx>kaa</lx> - <ps>V.B</ps> - <ge>strangle</ge> - <gp>pasim nek</gp> - </record> - </toolbox_data> -
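As a hedged aside (not text from the deleted file), this is roughly how the class above is driven; upstream NLTK still ships nltk.toolbox with the same interface, and the SFM snippet mirrors the Rotokas example:

from nltk.toolbox import ToolboxData

sfm = "\\lx kaa\n\\ps V.A\n\\ge gag\n\\gp nek i pas\n"
db = ToolboxData()
db.open_string(sfm)
tree = db.parse(key="lx")          # Element with <header> and <record> children
for record in tree.findall("record"):
    print({field.tag: field.text for field in record})  # {'lx': 'kaa', 'ps': 'V.A', ...}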
    - - :param key: Name of key marker at the start of each record. If set to - None (the default value) the first marker that doesn't begin with - an underscore is assumed to be the key. - :type key: str - :param kwargs: Keyword arguments passed to ``StandardFormat.fields()`` - :type kwargs: dict - :rtype: ElementTree._ElementInterface - :return: contents of toolbox data divided into header and records - """ - builder = TreeBuilder() - builder.start("toolbox_data", {}) - builder.start("header", {}) - in_records = False - for mkr, value in self.fields(**kwargs): - if key is None and not in_records and mkr[0] != "_": - key = mkr - if mkr == key: - if in_records: - builder.end("record") - else: - builder.end("header") - in_records = True - builder.start("record", {}) - builder.start(mkr, {}) - builder.data(value) - builder.end(mkr) - if in_records: - builder.end("record") - else: - builder.end("header") - builder.end("toolbox_data") - return builder.close() - - def _tree2etree(self, parent): - from nltk.tree import Tree - - root = Element(parent.label()) - for child in parent: - if isinstance(child, Tree): - root.append(self._tree2etree(child)) - else: - text, tag = child - e = SubElement(root, tag) - e.text = text - return root - - def _chunk_parse(self, grammar=None, root_label="record", trace=0, **kwargs): - """ - Returns an element tree structure corresponding to a toolbox data file - parsed according to the chunk grammar. - - :type grammar: str - :param grammar: Contains the chunking rules used to parse the - database. See ``chunk.RegExp`` for documentation. - :type root_label: str - :param root_label: The node value that should be used for the - top node of the chunk structure. - :type trace: int - :param trace: The level of tracing that should be used when - parsing a text. ``0`` will generate no tracing output; - ``1`` will generate normal tracing output; and ``2`` or - higher will generate verbose tracing output. - :type kwargs: dict - :param kwargs: Keyword arguments passed to ``toolbox.StandardFormat.fields()`` - :rtype: ElementTree._ElementInterface - """ - from nltk import chunk - from nltk.tree import Tree - - cp = chunk.RegexpParser(grammar, root_label=root_label, trace=trace) - db = self.parse(**kwargs) - tb_etree = Element("toolbox_data") - header = db.find("header") - tb_etree.append(header) - for record in db.findall("record"): - parsed = cp.parse([(elem.text, elem.tag) for elem in record]) - tb_etree.append(self._tree2etree(parsed)) - return tb_etree - - -_is_value = re.compile(r"\S") - - -def to_sfm_string(tree, encoding=None, errors="strict", unicode_fields=None): - """ - Return a string with a standard format representation of the toolbox - data in tree (tree can be a toolbox database or a single record). - - :param tree: flat representation of toolbox data (whole database or single record) - :type tree: ElementTree._ElementInterface - :param encoding: Name of an encoding to use. - :type encoding: str - :param errors: Error handling scheme for codec. Same as the ``encode()`` - builtin string method. 
- :type errors: str - :param unicode_fields: - :type unicode_fields: dict(str) or set(str) - :rtype: str - """ - if tree.tag == "record": - root = Element("toolbox_data") - root.append(tree) - tree = root - - if tree.tag != "toolbox_data": - raise ValueError("not a toolbox_data element structure") - if encoding is None and unicode_fields is not None: - raise ValueError( - "if encoding is not specified then neither should unicode_fields" - ) - l = [] - for rec in tree: - l.append("\n") - for field in rec: - mkr = field.tag - value = field.text - if encoding is not None: - if unicode_fields is not None and mkr in unicode_fields: - cur_encoding = "utf8" - else: - cur_encoding = encoding - if re.search(_is_value, value): - l.append((f"\\{mkr} {value}\n").encode(cur_encoding, errors)) - else: - l.append((f"\\{mkr}{value}\n").encode(cur_encoding, errors)) - else: - if re.search(_is_value, value): - l.append(f"\\{mkr} {value}\n") - else: - l.append(f"\\{mkr}{value}\n") - return "".join(l[1:]) - - -class ToolboxSettings(StandardFormat): - """This class is the base class for settings files.""" - - def __init__(self): - super().__init__() - - def parse(self, encoding=None, errors="strict", **kwargs): - """ - Return the contents of toolbox settings file with a nested structure. - - :param encoding: encoding used by settings file - :type encoding: str - :param errors: Error handling scheme for codec. Same as ``decode()`` builtin method. - :type errors: str - :param kwargs: Keyword arguments passed to ``StandardFormat.fields()`` - :type kwargs: dict - :rtype: ElementTree._ElementInterface - """ - builder = TreeBuilder() - for mkr, value in self.fields(encoding=encoding, errors=errors, **kwargs): - # Check whether the first char of the field marker - # indicates a block start (+) or end (-) - block = mkr[0] - if block in ("+", "-"): - mkr = mkr[1:] - else: - block = None - # Build tree on the basis of block char - if block == "+": - builder.start(mkr, {}) - builder.data(value) - elif block == "-": - builder.end(mkr) - else: - builder.start(mkr, {}) - builder.data(value) - builder.end(mkr) - return builder.close() - - -def to_settings_string(tree, encoding=None, errors="strict", unicode_fields=None): - # write XML to file - l = list() - _to_settings_string( - tree.getroot(), - l, - encoding=encoding, - errors=errors, - unicode_fields=unicode_fields, - ) - return "".join(l) - - -def _to_settings_string(node, l, **kwargs): - # write XML to file - tag = node.tag - text = node.text - if len(node) == 0: - if text: - l.append(f"\\{tag} {text}\n") - else: - l.append("\\%s\n" % tag) - else: - if text: - l.append(f"\\+{tag} {text}\n") - else: - l.append("\\+%s\n" % tag) - for n in node: - _to_settings_string(n, l, **kwargs) - l.append("\\-%s\n" % tag) - return - - -def remove_blanks(elem): - """ - Remove all elements and subelements with no text and no child elements. - - :param elem: toolbox data in an elementtree structure - :type elem: ElementTree._ElementInterface - """ - out = list() - for child in elem: - remove_blanks(child) - if child.text or len(child) > 0: - out.append(child) - elem[:] = out - - -def add_default_fields(elem, default_fields): - """ - Add blank elements and subelements specified in default_fields. 
- - :param elem: toolbox data in an elementtree structure - :type elem: ElementTree._ElementInterface - :param default_fields: fields to add to each type of element and subelement - :type default_fields: dict(tuple) - """ - for field in default_fields.get(elem.tag, []): - if elem.find(field) is None: - SubElement(elem, field) - for child in elem: - add_default_fields(child, default_fields) - - -def sort_fields(elem, field_orders): - """ - Sort the elements and subelements in order specified in field_orders. - - :param elem: toolbox data in an elementtree structure - :type elem: ElementTree._ElementInterface - :param field_orders: order of fields for each type of element and subelement - :type field_orders: dict(tuple) - """ - order_dicts = dict() - for field, order in field_orders.items(): - order_dicts[field] = order_key = dict() - for i, subfield in enumerate(order): - order_key[subfield] = i - _sort_fields(elem, order_dicts) - - -def _sort_fields(elem, orders_dicts): - """sort the children of elem""" - try: - order = orders_dicts[elem.tag] - except KeyError: - pass - else: - tmp = sorted( - ((order.get(child.tag, 1e9), i), child) for i, child in enumerate(elem) - ) - elem[:] = [child for key, child in tmp] - for child in elem: - if len(child): - _sort_fields(child, orders_dicts) - - -def add_blank_lines(tree, blanks_before, blanks_between): - """ - Add blank lines before all elements and subelements specified in blank_before. - - :param elem: toolbox data in an elementtree structure - :type elem: ElementTree._ElementInterface - :param blank_before: elements and subelements to add blank lines before - :type blank_before: dict(tuple) - """ - try: - before = blanks_before[tree.tag] - between = blanks_between[tree.tag] - except KeyError: - for elem in tree: - if len(elem): - add_blank_lines(elem, blanks_before, blanks_between) - else: - last_elem = None - for elem in tree: - tag = elem.tag - if last_elem is not None and last_elem.tag != tag: - if tag in before and last_elem is not None: - e = last_elem.getiterator()[-1] - e.text = (e.text or "") + "\n" - else: - if tag in between: - e = last_elem.getiterator()[-1] - e.text = (e.text or "") + "\n" - if len(elem): - add_blank_lines(elem, blanks_before, blanks_between) - last_elem = elem - - -def demo(): - from itertools import islice - - # zip_path = find('corpora/toolbox.zip') - # lexicon = ToolboxData(ZipFilePathPointer(zip_path, 'toolbox/rotokas.dic')).parse() - file_path = find("corpora/toolbox/rotokas.dic") - lexicon = ToolboxData(file_path).parse() - print("first field in fourth record:") - print(lexicon[3][0].tag) - print(lexicon[3][0].text) - - print("\nfields in sequential order:") - for field in islice(lexicon.find("record"), 10): - print(field.tag, field.text) - - print("\nlx fields:") - for field in islice(lexicon.findall("record/lx"), 10): - print(field.text) - - settings = ToolboxSettings() - file_path = find("corpora/toolbox/MDF/MDF_AltH.typ") - settings.open(file_path) - # settings.open(ZipFilePathPointer(zip_path, entry='toolbox/MDF/MDF_AltH.typ')) - tree = settings.parse(unwrap=False, encoding="cp1252") - print(tree.find("expset/expMDF/rtfPageSetup/paperSize").text) - settings_tree = ElementTree(tree) - print(to_settings_string(settings_tree).encode("utf8")) - - -if __name__ == "__main__": - demo() diff --git a/pipeline/nltk/translate/__init__.py b/pipeline/nltk/translate/__init__.py deleted file mode 100644 index 0059c1e19003bc946f699ca5895f9932ed4ec341..0000000000000000000000000000000000000000 --- 
a/pipeline/nltk/translate/__init__.py +++ /dev/null @@ -1,32 +0,0 @@ -# Natural Language Toolkit: Machine Translation -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird , Tah Wei Hoon -# URL: -# For license information, see LICENSE.TXT - -""" -Experimental features for machine translation. -These interfaces are prone to change. - -isort:skip_file -""" - -from nltk.translate.api import AlignedSent, Alignment, PhraseTable -from nltk.translate.ibm_model import IBMModel -from nltk.translate.ibm1 import IBMModel1 -from nltk.translate.ibm2 import IBMModel2 -from nltk.translate.ibm3 import IBMModel3 -from nltk.translate.ibm4 import IBMModel4 -from nltk.translate.ibm5 import IBMModel5 -from nltk.translate.bleu_score import sentence_bleu as bleu -from nltk.translate.ribes_score import sentence_ribes as ribes -from nltk.translate.meteor_score import meteor_score as meteor -from nltk.translate.metrics import alignment_error_rate -from nltk.translate.stack_decoder import StackDecoder -from nltk.translate.nist_score import sentence_nist as nist -from nltk.translate.chrf_score import sentence_chrf as chrf -from nltk.translate.gale_church import trace -from nltk.translate.gdfa import grow_diag_final_and -from nltk.translate.gleu_score import sentence_gleu as gleu -from nltk.translate.phrase_based import extract diff --git a/pipeline/nltk/translate/__pycache__/__init__.cpython-39.pyc b/pipeline/nltk/translate/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index fea6eb30496c576d6af48eb87a67f9c6ab0f0a42..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/translate/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/translate/__pycache__/api.cpython-39.pyc b/pipeline/nltk/translate/__pycache__/api.cpython-39.pyc deleted file mode 100644 index 5efdbae4e28c5293d5766ab37a461e3a69f3bd32..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/translate/__pycache__/api.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/translate/__pycache__/bleu_score.cpython-39.pyc b/pipeline/nltk/translate/__pycache__/bleu_score.cpython-39.pyc deleted file mode 100644 index 2dff634eaeb5112ce44ccbab40eff42a5d88e987..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/translate/__pycache__/bleu_score.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/translate/__pycache__/chrf_score.cpython-39.pyc b/pipeline/nltk/translate/__pycache__/chrf_score.cpython-39.pyc deleted file mode 100644 index 19a65b4c655697cd624a4528e77bfdaae727cc92..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/translate/__pycache__/chrf_score.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/translate/__pycache__/gale_church.cpython-39.pyc b/pipeline/nltk/translate/__pycache__/gale_church.cpython-39.pyc deleted file mode 100644 index eb8e4d468bbe04985a308df97b46402c66eb250f..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/translate/__pycache__/gale_church.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/translate/__pycache__/gdfa.cpython-39.pyc b/pipeline/nltk/translate/__pycache__/gdfa.cpython-39.pyc deleted file mode 100644 index 3e5072b9358a0c0d0f7cb3bdd40d7258322cafc9..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/translate/__pycache__/gdfa.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/translate/__pycache__/gleu_score.cpython-39.pyc b/pipeline/nltk/translate/__pycache__/gleu_score.cpython-39.pyc deleted file mode 
100644 index dc41a7187d500b1d132381e8cbdd8acbc4d95105..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/translate/__pycache__/gleu_score.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/translate/__pycache__/ibm1.cpython-39.pyc b/pipeline/nltk/translate/__pycache__/ibm1.cpython-39.pyc deleted file mode 100644 index 1ab05535cdb18d3fa2ecd97ece3ed6db25cc2933..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/translate/__pycache__/ibm1.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/translate/__pycache__/ibm2.cpython-39.pyc b/pipeline/nltk/translate/__pycache__/ibm2.cpython-39.pyc deleted file mode 100644 index 1ff8740d9afa341ff3f392adbb049758a6b9380e..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/translate/__pycache__/ibm2.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/translate/__pycache__/ibm3.cpython-39.pyc b/pipeline/nltk/translate/__pycache__/ibm3.cpython-39.pyc deleted file mode 100644 index 7a81c1dc6101eeccfbb11a787f3acbb39629ead1..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/translate/__pycache__/ibm3.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/translate/__pycache__/ibm4.cpython-39.pyc b/pipeline/nltk/translate/__pycache__/ibm4.cpython-39.pyc deleted file mode 100644 index 8c3135dc791b8e41859612862333e34e4bafbb36..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/translate/__pycache__/ibm4.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/translate/__pycache__/ibm5.cpython-39.pyc b/pipeline/nltk/translate/__pycache__/ibm5.cpython-39.pyc deleted file mode 100644 index 7e28366d7bb52d8cba6081c78efc966ecf5b3120..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/translate/__pycache__/ibm5.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/translate/__pycache__/ibm_model.cpython-39.pyc b/pipeline/nltk/translate/__pycache__/ibm_model.cpython-39.pyc deleted file mode 100644 index 7f59678c12de795bf8f469451c5bcb93db1d91aa..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/translate/__pycache__/ibm_model.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/translate/__pycache__/meteor_score.cpython-39.pyc b/pipeline/nltk/translate/__pycache__/meteor_score.cpython-39.pyc deleted file mode 100644 index 69b35bd314b514d6d6e3492478827959a47ea997..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/translate/__pycache__/meteor_score.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/translate/__pycache__/metrics.cpython-39.pyc b/pipeline/nltk/translate/__pycache__/metrics.cpython-39.pyc deleted file mode 100644 index 4bd8a3a4be7b1ee4ed5280696a53c737c99149f6..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/translate/__pycache__/metrics.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/translate/__pycache__/nist_score.cpython-39.pyc b/pipeline/nltk/translate/__pycache__/nist_score.cpython-39.pyc deleted file mode 100644 index 44d1162e7f11c98ccf72d8d321f127e7c268c5d1..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/translate/__pycache__/nist_score.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/translate/__pycache__/phrase_based.cpython-39.pyc b/pipeline/nltk/translate/__pycache__/phrase_based.cpython-39.pyc deleted file mode 100644 index 72dd282415c039c8c3e14a36c38cfef1b26537f3..0000000000000000000000000000000000000000 Binary files 
a/pipeline/nltk/translate/__pycache__/phrase_based.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/translate/__pycache__/ribes_score.cpython-39.pyc b/pipeline/nltk/translate/__pycache__/ribes_score.cpython-39.pyc deleted file mode 100644 index 7339833327d2b513071b4b2def4068bf3a2d2dc9..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/translate/__pycache__/ribes_score.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/translate/__pycache__/stack_decoder.cpython-39.pyc b/pipeline/nltk/translate/__pycache__/stack_decoder.cpython-39.pyc deleted file mode 100644 index ce2aaba24a56113dc024267f41e900996fd5db46..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/translate/__pycache__/stack_decoder.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/translate/api.py b/pipeline/nltk/translate/api.py deleted file mode 100644 index cf00f2b52f00cd7bf6df82d9b8d4557bb0592079..0000000000000000000000000000000000000000 --- a/pipeline/nltk/translate/api.py +++ /dev/null @@ -1,334 +0,0 @@ -# Natural Language Toolkit: API for alignment and translation objects -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Will Zhang -# Guan Gui -# Steven Bird -# Tah Wei Hoon -# URL: -# For license information, see LICENSE.TXT - -import subprocess -from collections import namedtuple - - -class AlignedSent: - """ - Return an aligned sentence object, which encapsulates two sentences - along with an ``Alignment`` between them. - - Typically used in machine translation to represent a sentence and - its translation. - - >>> from nltk.translate import AlignedSent, Alignment - >>> algnsent = AlignedSent(['klein', 'ist', 'das', 'Haus'], - ... ['the', 'house', 'is', 'small'], Alignment.fromstring('0-3 1-2 2-0 3-1')) - >>> algnsent.words - ['klein', 'ist', 'das', 'Haus'] - >>> algnsent.mots - ['the', 'house', 'is', 'small'] - >>> algnsent.alignment - Alignment([(0, 3), (1, 2), (2, 0), (3, 1)]) - >>> from nltk.corpus import comtrans - >>> print(comtrans.aligned_sents()[54]) - 'So why should EU arm...'> - >>> print(comtrans.aligned_sents()[54].alignment) - 0-0 0-1 1-0 2-2 3-4 3-5 4-7 5-8 6-3 7-9 8-9 9-10 9-11 10-12 11-6 12-6 13-13 - - :param words: Words in the target language sentence - :type words: list(str) - :param mots: Words in the source language sentence - :type mots: list(str) - :param alignment: Word-level alignments between ``words`` and ``mots``. - Each alignment is represented as a 2-tuple (words_index, mots_index). - :type alignment: Alignment - """ - - def __init__(self, words, mots, alignment=None): - self._words = words - self._mots = mots - if alignment is None: - self.alignment = Alignment([]) - else: - assert type(alignment) is Alignment - self.alignment = alignment - - @property - def words(self): - return self._words - - @property - def mots(self): - return self._mots - - def _get_alignment(self): - return self._alignment - - def _set_alignment(self, alignment): - _check_alignment(len(self.words), len(self.mots), alignment) - self._alignment = alignment - - alignment = property(_get_alignment, _set_alignment) - - def __repr__(self): - """ - Return a string representation for this ``AlignedSent``. 
- - :rtype: str - """ - words = "[%s]" % (", ".join("'%s'" % w for w in self._words)) - mots = "[%s]" % (", ".join("'%s'" % w for w in self._mots)) - - return f"AlignedSent({words}, {mots}, {self._alignment!r})" - - def _to_dot(self): - """ - Dot representation of the aligned sentence - """ - s = "graph align {\n" - s += "node[shape=plaintext]\n" - - # Declare node - for w in self._words: - s += f'"{w}_source" [label="{w}"] \n' - - for w in self._mots: - s += f'"{w}_target" [label="{w}"] \n' - - # Alignment - for u, v in self._alignment: - s += f'"{self._words[u]}_source" -- "{self._mots[v]}_target" \n' - - # Connect the source words - for i in range(len(self._words) - 1): - s += '"{}_source" -- "{}_source" [style=invis]\n'.format( - self._words[i], - self._words[i + 1], - ) - - # Connect the target words - for i in range(len(self._mots) - 1): - s += '"{}_target" -- "{}_target" [style=invis]\n'.format( - self._mots[i], - self._mots[i + 1], - ) - - # Put it in the same rank - s += "{rank = same; %s}\n" % (" ".join('"%s_source"' % w for w in self._words)) - s += "{rank = same; %s}\n" % (" ".join('"%s_target"' % w for w in self._mots)) - - s += "}" - - return s - - def _repr_svg_(self): - """ - Ipython magic : show SVG representation of this ``AlignedSent``. - """ - dot_string = self._to_dot().encode("utf8") - output_format = "svg" - try: - process = subprocess.Popen( - ["dot", "-T%s" % output_format], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - except OSError as e: - raise Exception("Cannot find the dot binary from Graphviz package") from e - out, err = process.communicate(dot_string) - - return out.decode("utf8") - - def __str__(self): - """ - Return a human-readable string representation for this ``AlignedSent``. - - :rtype: str - """ - source = " ".join(self._words)[:20] + "..." - target = " ".join(self._mots)[:20] + "..." - return f" '{target}'>" - - def invert(self): - """ - Return the aligned sentence pair, reversing the directionality - - :rtype: AlignedSent - """ - return AlignedSent(self._mots, self._words, self._alignment.invert()) - - -class Alignment(frozenset): - """ - A storage class for representing alignment between two sequences, s1, s2. - In general, an alignment is a set of tuples of the form (i, j, ...) - representing an alignment between the i-th element of s1 and the - j-th element of s2. Tuples are extensible (they might contain - additional data, such as a boolean to indicate sure vs possible alignments). - - >>> from nltk.translate import Alignment - >>> a = Alignment([(0, 0), (0, 1), (1, 2), (2, 2)]) - >>> a.invert() - Alignment([(0, 0), (1, 0), (2, 1), (2, 2)]) - >>> print(a.invert()) - 0-0 1-0 2-1 2-2 - >>> a[0] - [(0, 1), (0, 0)] - >>> a.invert()[2] - [(2, 1), (2, 2)] - >>> b = Alignment([(0, 0), (0, 1)]) - >>> b.issubset(a) - True - >>> c = Alignment.fromstring('0-0 0-1') - >>> b == c - True - """ - - def __new__(cls, pairs): - self = frozenset.__new__(cls, pairs) - self._len = max(p[0] for p in self) if self != frozenset([]) else 0 - self._index = None - return self - - @classmethod - def fromstring(cls, s): - """ - Read a giza-formatted string and return an Alignment object. - - >>> Alignment.fromstring('0-0 2-1 9-2 21-3 10-4 7-5') - Alignment([(0, 0), (2, 1), (7, 5), (9, 2), (10, 4), (21, 3)]) - - :type s: str - :param s: the positional alignments in giza format - :rtype: Alignment - :return: An Alignment object corresponding to the string representation ``s``. 
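# Usage sketch for the AlignedSent / Alignment API shown above, assuming nltk is
# installed; the sentence pair mirrors the doctest values.
from nltk.translate import AlignedSent, Alignment

algnsent = AlignedSent(
    ["klein", "ist", "das", "Haus"],
    ["the", "house", "is", "small"],
    Alignment.fromstring("0-3 1-2 2-0 3-1"),
)
print(algnsent.words)           # ['klein', 'ist', 'das', 'Haus']
print(algnsent.alignment)       # 0-3 1-2 2-0 3-1  (giza-style string form)
print(algnsent.invert().words)  # ['the', 'house', 'is', 'small']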
- """ - - return Alignment([_giza2pair(a) for a in s.split()]) - - def __getitem__(self, key): - """ - Look up the alignments that map from a given index or slice. - """ - if not self._index: - self._build_index() - return self._index.__getitem__(key) - - def invert(self): - """ - Return an Alignment object, being the inverted mapping. - """ - return Alignment(((p[1], p[0]) + p[2:]) for p in self) - - def range(self, positions=None): - """ - Work out the range of the mapping from the given positions. - If no positions are specified, compute the range of the entire mapping. - """ - image = set() - if not self._index: - self._build_index() - if not positions: - positions = list(range(len(self._index))) - for p in positions: - image.update(f for _, f in self._index[p]) - return sorted(image) - - def __repr__(self): - """ - Produce a Giza-formatted string representing the alignment. - """ - return "Alignment(%r)" % sorted(self) - - def __str__(self): - """ - Produce a Giza-formatted string representing the alignment. - """ - return " ".join("%d-%d" % p[:2] for p in sorted(self)) - - def _build_index(self): - """ - Build a list self._index such that self._index[i] is a list - of the alignments originating from word i. - """ - self._index = [[] for _ in range(self._len + 1)] - for p in self: - self._index[p[0]].append(p) - - -def _giza2pair(pair_string): - i, j = pair_string.split("-") - return int(i), int(j) - - -def _naacl2pair(pair_string): - i, j, p = pair_string.split("-") - return int(i), int(j) - - -def _check_alignment(num_words, num_mots, alignment): - """ - Check whether the alignments are legal. - - :param num_words: the number of source language words - :type num_words: int - :param num_mots: the number of target language words - :type num_mots: int - :param alignment: alignment to be checked - :type alignment: Alignment - :raise IndexError: if alignment falls outside the sentence - """ - - assert type(alignment) is Alignment - - if not all(0 <= pair[0] < num_words for pair in alignment): - raise IndexError("Alignment is outside boundary of words") - if not all(pair[1] is None or 0 <= pair[1] < num_mots for pair in alignment): - raise IndexError("Alignment is outside boundary of mots") - - -PhraseTableEntry = namedtuple("PhraseTableEntry", ["trg_phrase", "log_prob"]) - - -class PhraseTable: - """ - In-memory store of translations for a given phrase, and the log - probability of the those translations - """ - - def __init__(self): - self.src_phrases = dict() - - def translations_for(self, src_phrase): - """ - Get the translations for a source language phrase - - :param src_phrase: Source language phrase of interest - :type src_phrase: tuple(str) - - :return: A list of target language phrases that are translations - of ``src_phrase``, ordered in decreasing order of - likelihood. Each list element is a tuple of the target - phrase and its log probability. 
- :rtype: list(PhraseTableEntry) - """ - return self.src_phrases[src_phrase] - - def add(self, src_phrase, trg_phrase, log_prob): - """ - :type src_phrase: tuple(str) - :type trg_phrase: tuple(str) - - :param log_prob: Log probability that given ``src_phrase``, - ``trg_phrase`` is its translation - :type log_prob: float - """ - entry = PhraseTableEntry(trg_phrase=trg_phrase, log_prob=log_prob) - if src_phrase not in self.src_phrases: - self.src_phrases[src_phrase] = [] - self.src_phrases[src_phrase].append(entry) - self.src_phrases[src_phrase].sort(key=lambda e: e.log_prob, reverse=True) - - def __contains__(self, src_phrase): - return src_phrase in self.src_phrases diff --git a/pipeline/nltk/translate/bleu_score.py b/pipeline/nltk/translate/bleu_score.py deleted file mode 100644 index 1b2cc949db964b029f4e7324cbbc7236d3ff9248..0000000000000000000000000000000000000000 --- a/pipeline/nltk/translate/bleu_score.py +++ /dev/null @@ -1,685 +0,0 @@ -# Natural Language Toolkit: BLEU Score -# -# Copyright (C) 2001-2023 NLTK Project -# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim -# Contributors: Björn Mattsson, Dmitrijs Milajevs, Liling Tan -# URL: -# For license information, see LICENSE.TXT - -"""BLEU score implementation.""" - -import math -import sys -import warnings -from collections import Counter -from fractions import Fraction - -from nltk.util import ngrams - - -def sentence_bleu( - references, - hypothesis, - weights=(0.25, 0.25, 0.25, 0.25), - smoothing_function=None, - auto_reweigh=False, -): - """ - Calculate BLEU score (Bilingual Evaluation Understudy) from - Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. - "BLEU: a method for automatic evaluation of machine translation." - In Proceedings of ACL. https://www.aclweb.org/anthology/P02-1040.pdf - - >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', - ... 'ensures', 'that', 'the', 'military', 'always', - ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] - - >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', - ... 'forever', 'hearing', 'the', 'activity', 'guidebook', - ... 'that', 'party', 'direct'] - - >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', - ... 'ensures', 'that', 'the', 'military', 'will', 'forever', - ... 'heed', 'Party', 'commands'] - - >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', - ... 'guarantees', 'the', 'military', 'forces', 'always', - ... 'being', 'under', 'the', 'command', 'of', 'the', - ... 'Party'] - - >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', - ... 'army', 'always', 'to', 'heed', 'the', 'directions', - ... 'of', 'the', 'party'] - - >>> sentence_bleu([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS - 0.5045... - - If there is no ngrams overlap for any order of n-grams, BLEU returns the - value 0. This is because the precision for the order of n-grams without - overlap is 0, and the geometric mean in the final BLEU score computation - multiplies the 0 with the precision of other n-grams. This results in 0 - (independently of the precision of the other n-gram orders). The following - example has zero 3-gram and 4-gram overlaps: - - >>> round(sentence_bleu([reference1, reference2, reference3], hypothesis2),4) # doctest: +ELLIPSIS - 0.0 - - To avoid this harsh behaviour when no ngram overlaps are found a smoothing - function can be used. 
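# Sketch of the behaviour described above, assuming nltk is installed; the short
# token lists are made-up stand-ins. With no 3-/4-gram overlap, unsmoothed
# sentence_bleu collapses to (essentially) zero, while a SmoothingFunction
# method keeps a small non-zero score.
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

reference = "the cat sat on the mat".split()
hypothesis = "the cat lay under a rug".split()

chencherry = SmoothingFunction()
print(round(sentence_bleu([reference], hypothesis), 4))          # 0.0
print(sentence_bleu([reference], hypothesis,
                    smoothing_function=chencherry.method1))      # small but non-zero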
- - >>> chencherry = SmoothingFunction() - >>> sentence_bleu([reference1, reference2, reference3], hypothesis2, - ... smoothing_function=chencherry.method1) # doctest: +ELLIPSIS - 0.0370... - - The default BLEU calculates a score for up to 4-grams using uniform - weights (this is called BLEU-4). To evaluate your translations with - higher/lower order ngrams, use customized weights. E.g. when accounting - for up to 5-grams with uniform weights (this is called BLEU-5) use: - - >>> weights = (1./5., 1./5., 1./5., 1./5., 1./5.) - >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS - 0.3920... - - Multiple BLEU scores can be computed at once, by supplying a list of weights. - E.g. for computing BLEU-2, BLEU-3 *and* BLEU-4 in one computation, use: - >>> weights = [ - ... (1./2., 1./2.), - ... (1./3., 1./3., 1./3.), - ... (1./4., 1./4., 1./4., 1./4.) - ... ] - >>> sentence_bleu([reference1, reference2, reference3], hypothesis1, weights) # doctest: +ELLIPSIS - [0.7453..., 0.6240..., 0.5045...] - - :param references: reference sentences - :type references: list(list(str)) - :param hypothesis: a hypothesis sentence - :type hypothesis: list(str) - :param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights) - :type weights: tuple(float) / list(tuple(float)) - :param smoothing_function: - :type smoothing_function: SmoothingFunction - :param auto_reweigh: Option to re-normalize the weights uniformly. - :type auto_reweigh: bool - :return: The sentence-level BLEU score. Returns a list if multiple weights were supplied. - :rtype: float / list(float) - """ - return corpus_bleu( - [references], [hypothesis], weights, smoothing_function, auto_reweigh - ) - - -def corpus_bleu( - list_of_references, - hypotheses, - weights=(0.25, 0.25, 0.25, 0.25), - smoothing_function=None, - auto_reweigh=False, -): - """ - Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all - the hypotheses and their respective references. - - Instead of averaging the sentence level BLEU scores (i.e. macro-average - precision), the original BLEU metric (Papineni et al. 2002) accounts for - the micro-average precision (i.e. summing the numerators and denominators - for each hypothesis-reference(s) pairs before the division). - - >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', - ... 'ensures', 'that', 'the', 'military', 'always', - ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] - >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', - ... 'ensures', 'that', 'the', 'military', 'will', 'forever', - ... 'heed', 'Party', 'commands'] - >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which', - ... 'guarantees', 'the', 'military', 'forces', 'always', - ... 'being', 'under', 'the', 'command', 'of', 'the', 'Party'] - >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', - ... 'army', 'always', 'to', 'heed', 'the', 'directions', - ... 'of', 'the', 'party'] - - >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was', - ... 'interested', 'in', 'world', 'history'] - >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history', - ... 'because', 'he', 'read', 'the', 'book'] - - >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]] - >>> hypotheses = [hyp1, hyp2] - >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS - 0.5920... 
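# Sketch of the micro- vs macro-average distinction explained above, assuming
# nltk is installed; each segment uses a single reference taken from the doctest
# sentences. corpus_bleu pools n-gram counts across segments before dividing,
# which is generally not the same as averaging per-sentence BLEU scores.
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu

ref1 = ("It is a guide to action that ensures that the military "
        "will forever heed Party commands").split()
hyp1 = ("It is a guide to action which ensures that the military "
        "always obeys the commands of the party").split()
ref2 = "he was interested in world history because he read the book".split()
hyp2 = "he read the book because he was interested in world history".split()

pooled = corpus_bleu([[ref1], [ref2]], [hyp1, hyp2])
averaged = (sentence_bleu([ref1], hyp1) + sentence_bleu([ref2], hyp2)) / 2
print(round(pooled, 4), round(averaged, 4))  # the two numbers differ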
- - The example below show that corpus_bleu() is different from averaging - sentence_bleu() for hypotheses - - >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1) - >>> score2 = sentence_bleu([ref2a], hyp2) - >>> (score1 + score2) / 2 # doctest: +ELLIPSIS - 0.6223... - - Custom weights may be supplied to fine-tune the BLEU score further. - A tuple of float weights for unigrams, bigrams, trigrams and so on can be given. - >>> weights = (0.1, 0.3, 0.5, 0.1) - >>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS - 0.5818... - - This particular weight gave extra value to trigrams. - Furthermore, multiple weights can be given, resulting in multiple BLEU scores. - >>> weights = [ - ... (0.5, 0.5), - ... (0.333, 0.333, 0.334), - ... (0.25, 0.25, 0.25, 0.25), - ... (0.2, 0.2, 0.2, 0.2, 0.2) - ... ] - >>> corpus_bleu(list_of_references, hypotheses, weights=weights) # doctest: +ELLIPSIS - [0.8242..., 0.7067..., 0.5920..., 0.4719...] - - :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses - :type list_of_references: list(list(list(str))) - :param hypotheses: a list of hypothesis sentences - :type hypotheses: list(list(str)) - :param weights: weights for unigrams, bigrams, trigrams and so on (one or a list of weights) - :type weights: tuple(float) / list(tuple(float)) - :param smoothing_function: - :type smoothing_function: SmoothingFunction - :param auto_reweigh: Option to re-normalize the weights uniformly. - :type auto_reweigh: bool - :return: The corpus-level BLEU score. - :rtype: float - """ - # Before proceeding to compute BLEU, perform sanity checks. - - p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches. - p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref. - hyp_lengths, ref_lengths = 0, 0 - - assert len(list_of_references) == len(hypotheses), ( - "The number of hypotheses and their reference(s) should be the " "same " - ) - - try: - weights[0][0] - except TypeError: - weights = [weights] - max_weight_length = max(len(weight) for weight in weights) - - # Iterate through each hypothesis and their corresponding references. - for references, hypothesis in zip(list_of_references, hypotheses): - # For each order of ngram, calculate the numerator and - # denominator for the corpus-level modified precision. - for i in range(1, max_weight_length + 1): - p_i = modified_precision(references, hypothesis, i) - p_numerators[i] += p_i.numerator - p_denominators[i] += p_i.denominator - - # Calculate the hypothesis length and the closest reference length. - # Adds them to the corpus-level hypothesis and reference counts. - hyp_len = len(hypothesis) - hyp_lengths += hyp_len - ref_lengths += closest_ref_length(references, hyp_len) - - # Calculate corpus-level brevity penalty. - bp = brevity_penalty(ref_lengths, hyp_lengths) - - # Collects the various precision values for the different ngram orders. - p_n = [ - Fraction(p_numerators[i], p_denominators[i], _normalize=False) - for i in range(1, max_weight_length + 1) - ] - - # Returns 0 if there's no matching n-grams - # We only need to check for p_numerators[1] == 0, since if there's - # no unigrams, there won't be any higher order ngrams. - if p_numerators[1] == 0: - return 0 if len(weights) == 1 else [0] * len(weights) - - # If there's no smoothing, set use method0 from SmoothinFunction class. - if not smoothing_function: - smoothing_function = SmoothingFunction().method0 - # Smoothen the modified precision. 
- # Note: smoothing_function() may convert values into floats; - # it tries to retain the Fraction object as much as the - # smoothing method allows. - p_n = smoothing_function( - p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths - ) - - bleu_scores = [] - for weight in weights: - # Uniformly re-weighting based on maximum hypothesis lengths if largest - # order of n-grams < 4 and weights is set at default. - if auto_reweigh: - if hyp_lengths < 4 and weight == (0.25, 0.25, 0.25, 0.25): - weight = (1 / hyp_lengths,) * hyp_lengths - - s = (w_i * math.log(p_i) for w_i, p_i in zip(weight, p_n) if p_i > 0) - s = bp * math.exp(math.fsum(s)) - bleu_scores.append(s) - return bleu_scores[0] if len(weights) == 1 else bleu_scores - - -def modified_precision(references, hypothesis, n): - """ - Calculate modified ngram precision. - - The normal precision method may lead to some wrong translations with - high-precision, e.g., the translation, in which a word of reference - repeats several times, has very high precision. - - This function only returns the Fraction object that contains the numerator - and denominator necessary to calculate the corpus-level precision. - To calculate the modified precision for a single pair of hypothesis and - references, cast the Fraction object into a float. - - The famous "the the the ... " example shows that you can get BLEU precision - by duplicating high frequency words. - - >>> reference1 = 'the cat is on the mat'.split() - >>> reference2 = 'there is a cat on the mat'.split() - >>> hypothesis1 = 'the the the the the the the'.split() - >>> references = [reference1, reference2] - >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS - 0.2857... - - In the modified n-gram precision, a reference word will be considered - exhausted after a matching hypothesis word is identified, e.g. - - >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', - ... 'ensures', 'that', 'the', 'military', 'will', - ... 'forever', 'heed', 'Party', 'commands'] - >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', - ... 'guarantees', 'the', 'military', 'forces', 'always', - ... 'being', 'under', 'the', 'command', 'of', 'the', - ... 'Party'] - >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', - ... 'army', 'always', 'to', 'heed', 'the', 'directions', - ... 'of', 'the', 'party'] - >>> hypothesis = 'of the'.split() - >>> references = [reference1, reference2, reference3] - >>> float(modified_precision(references, hypothesis, n=1)) - 1.0 - >>> float(modified_precision(references, hypothesis, n=2)) - 1.0 - - An example of a normal machine translation hypothesis: - - >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', - ... 'ensures', 'that', 'the', 'military', 'always', - ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] - - >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', - ... 'forever', 'hearing', 'the', 'activity', 'guidebook', - ... 'that', 'party', 'direct'] - - >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', - ... 'ensures', 'that', 'the', 'military', 'will', - ... 'forever', 'heed', 'Party', 'commands'] - - >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', - ... 'guarantees', 'the', 'military', 'forces', 'always', - ... 'being', 'under', 'the', 'command', 'of', 'the', - ... 'Party'] - - >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', - ... 
'army', 'always', 'to', 'heed', 'the', 'directions', - ... 'of', 'the', 'party'] - >>> references = [reference1, reference2, reference3] - >>> float(modified_precision(references, hypothesis1, n=1)) # doctest: +ELLIPSIS - 0.9444... - >>> float(modified_precision(references, hypothesis2, n=1)) # doctest: +ELLIPSIS - 0.5714... - >>> float(modified_precision(references, hypothesis1, n=2)) # doctest: +ELLIPSIS - 0.5882352941176471 - >>> float(modified_precision(references, hypothesis2, n=2)) # doctest: +ELLIPSIS - 0.07692... - - - :param references: A list of reference translations. - :type references: list(list(str)) - :param hypothesis: A hypothesis translation. - :type hypothesis: list(str) - :param n: The ngram order. - :type n: int - :return: BLEU's modified precision for the nth order ngram. - :rtype: Fraction - """ - # Extracts all ngrams in hypothesis - # Set an empty Counter if hypothesis is empty. - counts = Counter(ngrams(hypothesis, n)) if len(hypothesis) >= n else Counter() - # Extract a union of references' counts. - # max_counts = reduce(or_, [Counter(ngrams(ref, n)) for ref in references]) - max_counts = {} - for reference in references: - reference_counts = ( - Counter(ngrams(reference, n)) if len(reference) >= n else Counter() - ) - for ngram in counts: - max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram]) - - # Assigns the intersection between hypothesis and references' counts. - clipped_counts = { - ngram: min(count, max_counts[ngram]) for ngram, count in counts.items() - } - - numerator = sum(clipped_counts.values()) - # Ensures that denominator is minimum 1 to avoid ZeroDivisionError. - # Usually this happens when the ngram order is > len(reference). - denominator = max(1, sum(counts.values())) - - return Fraction(numerator, denominator, _normalize=False) - - -def closest_ref_length(references, hyp_len): - """ - This function finds the reference that is the closest length to the - hypothesis. The closest reference length is referred to as *r* variable - from the brevity penalty formula in Papineni et. al. (2002) - - :param references: A list of reference translations. - :type references: list(list(str)) - :param hyp_len: The length of the hypothesis. - :type hyp_len: int - :return: The length of the reference that's closest to the hypothesis. - :rtype: int - """ - ref_lens = (len(reference) for reference in references) - closest_ref_len = min( - ref_lens, key=lambda ref_len: (abs(ref_len - hyp_len), ref_len) - ) - return closest_ref_len - - -def brevity_penalty(closest_ref_len, hyp_len): - """ - Calculate brevity penalty. - - As the modified n-gram precision still has the problem from the short - length sentence, brevity penalty is used to modify the overall BLEU - score according to length. - - An example from the paper. There are three references with length 12, 15 - and 17. And a concise hypothesis of the length 12. The brevity penalty is 1. - - >>> reference1 = list('aaaaaaaaaaaa') # i.e. ['a'] * 12 - >>> reference2 = list('aaaaaaaaaaaaaaa') # i.e. ['a'] * 15 - >>> reference3 = list('aaaaaaaaaaaaaaaaa') # i.e. ['a'] * 17 - >>> hypothesis = list('aaaaaaaaaaaa') # i.e. ['a'] * 12 - >>> references = [reference1, reference2, reference3] - >>> hyp_len = len(hypothesis) - >>> closest_ref_len = closest_ref_length(references, hyp_len) - >>> brevity_penalty(closest_ref_len, hyp_len) - 1.0 - - In case a hypothesis translation is shorter than the references, penalty is - applied. 
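# Sketch of the brevity-penalty step described above, assuming nltk is
# installed; lengths mirror the doctest.
from nltk.translate.bleu_score import brevity_penalty, closest_ref_length

hypothesis = ["a"] * 12
references = [["a"] * 13, ["a"] * 2]

hyp_len = len(hypothesis)
closest = closest_ref_length(references, hyp_len)
print(closest)                                      # 13 -- nearer to 12 than the 2-token reference
print(round(brevity_penalty(closest, hyp_len), 4))  # 0.92, i.e. exp(1 - 13/12)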
- - >>> references = [['a'] * 28, ['a'] * 28] - >>> hypothesis = ['a'] * 12 - >>> hyp_len = len(hypothesis) - >>> closest_ref_len = closest_ref_length(references, hyp_len) - >>> brevity_penalty(closest_ref_len, hyp_len) - 0.2635971381157267 - - The length of the closest reference is used to compute the penalty. If the - length of a hypothesis is 12, and the reference lengths are 13 and 2, the - penalty is applied because the hypothesis length (12) is less then the - closest reference length (13). - - >>> references = [['a'] * 13, ['a'] * 2] - >>> hypothesis = ['a'] * 12 - >>> hyp_len = len(hypothesis) - >>> closest_ref_len = closest_ref_length(references, hyp_len) - >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS - 0.9200... - - The brevity penalty doesn't depend on reference order. More importantly, - when two reference sentences are at the same distance, the shortest - reference sentence length is used. - - >>> references = [['a'] * 13, ['a'] * 11] - >>> hypothesis = ['a'] * 12 - >>> hyp_len = len(hypothesis) - >>> closest_ref_len = closest_ref_length(references, hyp_len) - >>> bp1 = brevity_penalty(closest_ref_len, hyp_len) - >>> hyp_len = len(hypothesis) - >>> closest_ref_len = closest_ref_length(reversed(references), hyp_len) - >>> bp2 = brevity_penalty(closest_ref_len, hyp_len) - >>> bp1 == bp2 == 1 - True - - A test example from mteval-v13a.pl (starting from the line 705): - - >>> references = [['a'] * 11, ['a'] * 8] - >>> hypothesis = ['a'] * 7 - >>> hyp_len = len(hypothesis) - >>> closest_ref_len = closest_ref_length(references, hyp_len) - >>> brevity_penalty(closest_ref_len, hyp_len) # doctest: +ELLIPSIS - 0.8668... - - >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7] - >>> hypothesis = ['a'] * 7 - >>> hyp_len = len(hypothesis) - >>> closest_ref_len = closest_ref_length(references, hyp_len) - >>> brevity_penalty(closest_ref_len, hyp_len) - 1.0 - - :param hyp_len: The length of the hypothesis for a single sentence OR the - sum of all the hypotheses' lengths for a corpus - :type hyp_len: int - :param closest_ref_len: The length of the closest reference for a single - hypothesis OR the sum of all the closest references for every hypotheses. - :type closest_ref_len: int - :return: BLEU's brevity penalty. - :rtype: float - """ - if hyp_len > closest_ref_len: - return 1 - # If hypothesis is empty, brevity penalty = 0 should result in BLEU = 0.0 - elif hyp_len == 0: - return 0 - else: - return math.exp(1 - closest_ref_len / hyp_len) - - -class SmoothingFunction: - """ - This is an implementation of the smoothing techniques - for segment-level BLEU scores that was presented in - Boxing Chen and Collin Cherry (2014) A Systematic Comparison of - Smoothing Techniques for Sentence-Level BLEU. In WMT14. - http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf - """ - - def __init__(self, epsilon=0.1, alpha=5, k=5): - """ - This will initialize the parameters required for the various smoothing - techniques, the default values are set to the numbers used in the - experiments from Chen and Cherry (2014). - - >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures', - ... 'that', 'the', 'military', 'always', 'obeys', 'the', - ... 'commands', 'of', 'the', 'party'] - >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures', - ... 'that', 'the', 'military', 'will', 'forever', 'heed', - ... 
'Party', 'commands'] - - >>> chencherry = SmoothingFunction() - >>> print(sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS - 0.4118... - >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS - 0.4118... - >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS - 0.4118... - >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS - 0.4452... - >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS - 0.4118... - >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS - 0.4118... - >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS - 0.4905... - >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS - 0.4135... - >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS - 0.4905... - - :param epsilon: the epsilon value use in method 1 - :type epsilon: float - :param alpha: the alpha value use in method 6 - :type alpha: int - :param k: the k value use in method 4 - :type k: int - """ - self.epsilon = epsilon - self.alpha = alpha - self.k = k - - def method0(self, p_n, *args, **kwargs): - """ - No smoothing. - """ - p_n_new = [] - for i, p_i in enumerate(p_n): - if p_i.numerator != 0: - p_n_new.append(p_i) - else: - _msg = str( - "\nThe hypothesis contains 0 counts of {}-gram overlaps.\n" - "Therefore the BLEU score evaluates to 0, independently of\n" - "how many N-gram overlaps of lower order it contains.\n" - "Consider using lower n-gram order or use " - "SmoothingFunction()" - ).format(i + 1) - warnings.warn(_msg) - # When numerator==0 where denonminator==0 or !=0, the result - # for the precision score should be equal to 0 or undefined. - # Due to BLEU geometric mean computation in logarithm space, - # we we need to take the return sys.float_info.min such that - # math.log(sys.float_info.min) returns a 0 precision score. - p_n_new.append(sys.float_info.min) - return p_n_new - - def method1(self, p_n, *args, **kwargs): - """ - Smoothing method 1: Add *epsilon* counts to precision with 0 counts. - """ - return [ - (p_i.numerator + self.epsilon) / p_i.denominator - if p_i.numerator == 0 - else p_i - for p_i in p_n - ] - - def method2(self, p_n, *args, **kwargs): - """ - Smoothing method 2: Add 1 to both numerator and denominator from - Chin-Yew Lin and Franz Josef Och (2004) ORANGE: a Method for - Evaluating Automatic Evaluation Metrics for Machine Translation. - In COLING 2004. - """ - return [ - Fraction(p_n[i].numerator + 1, p_n[i].denominator + 1, _normalize=False) - if i != 0 - else p_n[0] - for i in range(len(p_n)) - ] - - def method3(self, p_n, *args, **kwargs): - """ - Smoothing method 3: NIST geometric sequence smoothing - The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each - precision score whose matching n-gram count is null. 
- k is 1 for the first 'n' value for which the n-gram match count is null/ - - For example, if the text contains: - - - one 2-gram match - - and (consequently) two 1-gram matches - - the n-gram count for each individual precision score would be: - - - n=1 => prec_count = 2 (two unigrams) - - n=2 => prec_count = 1 (one bigram) - - n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1) - - n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2) - """ - incvnt = 1 # From the mteval-v13a.pl, it's referred to as k. - for i, p_i in enumerate(p_n): - if p_i.numerator == 0: - p_n[i] = 1 / (2**incvnt * p_i.denominator) - incvnt += 1 - return p_n - - def method4(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): - """ - Smoothing method 4: - Shorter translations may have inflated precision values due to having - smaller denominators; therefore, we give them proportionally - smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry - suggests dividing by 1/ln(len(T)), where T is the length of the translation. - """ - incvnt = 1 - hyp_len = hyp_len if hyp_len else len(hypothesis) - for i, p_i in enumerate(p_n): - if p_i.numerator == 0 and hyp_len > 1: - # incvnt = i + 1 * self.k / math.log( - # hyp_len - # ) # Note that this K is different from the K from NIST. - # p_n[i] = incvnt / p_i.denominator\ - numerator = 1 / (2**incvnt * self.k / math.log(hyp_len)) - p_n[i] = numerator / p_i.denominator - incvnt += 1 - return p_n - - def method5(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): - """ - Smoothing method 5: - The matched counts for similar values of n should be similar. To a - calculate the n-gram matched count, it averages the n−1, n and n+1 gram - matched counts. - """ - hyp_len = hyp_len if hyp_len else len(hypothesis) - m = {} - # Requires an precision value for an addition ngram order. - p_n_plus1 = p_n + [modified_precision(references, hypothesis, 5)] - m[-1] = p_n[0] + 1 - for i, p_i in enumerate(p_n): - p_n[i] = (m[i - 1] + p_i + p_n_plus1[i + 1]) / 3 - m[i] = p_n[i] - return p_n - - def method6(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): - """ - Smoothing method 6: - Interpolates the maximum likelihood estimate of the precision *p_n* with - a prior estimate *pi0*. The prior is estimated by assuming that the ratio - between pn and pn−1 will be the same as that between pn−1 and pn−2; from - Gao and He (2013) Training MRF-Based Phrase Translation Models using - Gradient Ascent. In NAACL. - """ - hyp_len = hyp_len if hyp_len else len(hypothesis) - # This smoothing only works when p_1 and p_2 is non-zero. - # Raise an error with an appropriate message when the input is too short - # to use this smoothing technique. - assert p_n[2], "This smoothing method requires non-zero precision for bigrams." - for i, p_i in enumerate(p_n): - if i in [0, 1]: # Skips the first 2 orders of ngrams. - continue - else: - pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2] - # No. of ngrams in translation that matches the reference. - m = p_i.numerator - # No. of ngrams in translation. - l = sum(1 for _ in ngrams(hypothesis, i + 1)) - # Calculates the interpolated precision. - p_n[i] = (m + self.alpha * pi0) / (l + self.alpha) - return p_n - - def method7(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs): - """ - Smoothing method 7: - Interpolates methods 4 and 5. 
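# Sketch comparing several of the smoothing methods documented above on the same
# sentence pair, assuming nltk is installed; per the doctests above the scores
# land roughly between 0.41 and 0.49.
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

reference = ("It is a guide to action that ensures that the military "
             "will forever heed Party commands").split()
hypothesis = ("It is a guide to action which ensures that the military "
              "always obeys the commands of the party").split()

chencherry = SmoothingFunction()
for method in (chencherry.method1, chencherry.method2, chencherry.method4, chencherry.method7):
    score = sentence_bleu([reference], hypothesis, smoothing_function=method)
    print(method.__name__, round(score, 4))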
- """ - hyp_len = hyp_len if hyp_len else len(hypothesis) - p_n = self.method4(p_n, references, hypothesis, hyp_len) - p_n = self.method5(p_n, references, hypothesis, hyp_len) - return p_n diff --git a/pipeline/nltk/translate/chrf_score.py b/pipeline/nltk/translate/chrf_score.py deleted file mode 100644 index d4b54f3a07166ba5179b2850cca82b21fe7c39f1..0000000000000000000000000000000000000000 --- a/pipeline/nltk/translate/chrf_score.py +++ /dev/null @@ -1,222 +0,0 @@ -# Natural Language Toolkit: ChrF score -# -# Copyright (C) 2001-2023 NLTK Project -# Authors: Maja Popovic -# Contributors: Liling Tan, Aleš Tamchyna (Memsource) -# URL: -# For license information, see LICENSE.TXT - -""" ChrF score implementation """ -import re -from collections import Counter, defaultdict - -from nltk.util import ngrams - - -def sentence_chrf( - reference, hypothesis, min_len=1, max_len=6, beta=3.0, ignore_whitespace=True -): - """ - Calculates the sentence level CHRF (Character n-gram F-score) described in - - Maja Popovic. 2015. CHRF: Character n-gram F-score for Automatic MT Evaluation. - In Proceedings of the 10th Workshop on Machine Translation. - https://www.statmt.org/wmt15/pdf/WMT49.pdf - - Maja Popovic. 2016. CHRF Deconstructed: β Parameters and n-gram Weights. - In Proceedings of the 1st Conference on Machine Translation. - https://www.statmt.org/wmt16/pdf/W16-2341.pdf - - This implementation of CHRF only supports a single reference at the moment. - - For details not reported in the paper, consult Maja Popovic's original - implementation: https://github.com/m-popovic/chrF - - The code should output results equivalent to running CHRF++ with the - following options: -nw 0 -b 3 - - An example from the original BLEU paper - https://www.aclweb.org/anthology/P02-1040.pdf - - >>> ref1 = str('It is a guide to action that ensures that the military ' - ... 'will forever heed Party commands').split() - >>> hyp1 = str('It is a guide to action which ensures that the military ' - ... 'always obeys the commands of the party').split() - >>> hyp2 = str('It is to insure the troops forever hearing the activity ' - ... 'guidebook that party direct').split() - >>> sentence_chrf(ref1, hyp1) # doctest: +ELLIPSIS - 0.6349... - >>> sentence_chrf(ref1, hyp2) # doctest: +ELLIPSIS - 0.3330... - - The infamous "the the the ... " example - - >>> ref = 'the cat is on the mat'.split() - >>> hyp = 'the the the the the the the'.split() - >>> sentence_chrf(ref, hyp) # doctest: +ELLIPSIS - 0.1468... - - An example to show that this function allows users to use strings instead of - tokens, i.e. list(str) as inputs. - - >>> ref1 = str('It is a guide to action that ensures that the military ' - ... 'will forever heed Party commands') - >>> hyp1 = str('It is a guide to action which ensures that the military ' - ... 'always obeys the commands of the party') - >>> sentence_chrf(ref1, hyp1) # doctest: +ELLIPSIS - 0.6349... - >>> type(ref1) == type(hyp1) == str - True - >>> sentence_chrf(ref1.split(), hyp1.split()) # doctest: +ELLIPSIS - 0.6349... - - To skip the unigrams and only use 2- to 3-grams: - - >>> sentence_chrf(ref1, hyp1, min_len=2, max_len=3) # doctest: +ELLIPSIS - 0.6617... - - :param references: reference sentence - :type references: list(str) / str - :param hypothesis: a hypothesis sentence - :type hypothesis: list(str) / str - :param min_len: The minimum order of n-gram this function should extract. - :type min_len: int - :param max_len: The maximum order of n-gram this function should extract. 
- :type max_len: int - :param beta: the parameter to assign more importance to recall over precision - :type beta: float - :param ignore_whitespace: ignore whitespace characters in scoring - :type ignore_whitespace: bool - :return: the sentence level CHRF score. - :rtype: float - """ - return corpus_chrf( - [reference], - [hypothesis], - min_len, - max_len, - beta=beta, - ignore_whitespace=ignore_whitespace, - ) - - -def _preprocess(sent, ignore_whitespace): - if type(sent) != str: - # turn list of tokens into a string - sent = " ".join(sent) - - if ignore_whitespace: - sent = re.sub(r"\s+", "", sent) - return sent - - -def chrf_precision_recall_fscore_support( - reference, hypothesis, n, beta=3.0, epsilon=1e-16 -): - """ - This function computes the precision, recall and fscore from the ngram - overlaps. It returns the `support` which is the true positive score. - - By underspecifying the input type, the function will be agnostic as to how - it computes the ngrams and simply take the whichever element in the list; - it could be either token or character. - - :param reference: The reference sentence. - :type reference: list - :param hypothesis: The hypothesis sentence. - :type hypothesis: list - :param n: Extract up to the n-th order ngrams - :type n: int - :param beta: The parameter to assign more importance to recall over precision. - :type beta: float - :param epsilon: The fallback value if the hypothesis or reference is empty. - :type epsilon: float - :return: Returns the precision, recall and f-score and support (true positive). - :rtype: tuple(float) - """ - ref_ngrams = Counter(ngrams(reference, n)) - hyp_ngrams = Counter(ngrams(hypothesis, n)) - - # calculate the number of ngram matches - overlap_ngrams = ref_ngrams & hyp_ngrams - tp = sum(overlap_ngrams.values()) # True positives. - tpfp = sum(hyp_ngrams.values()) # True positives + False positives. - tpfn = sum(ref_ngrams.values()) # True positives + False negatives. - - try: - prec = tp / tpfp # precision - rec = tp / tpfn # recall - factor = beta**2 - fscore = (1 + factor) * (prec * rec) / (factor * prec + rec) - except ZeroDivisionError: - prec = rec = fscore = epsilon - return prec, rec, fscore, tp - - -def corpus_chrf( - references, hypotheses, min_len=1, max_len=6, beta=3.0, ignore_whitespace=True -): - """ - Calculates the corpus level CHRF (Character n-gram F-score), it is the - macro-averaged value of the sentence/segment level CHRF score. - - This implementation of CHRF only supports a single reference at the moment. - - >>> ref1 = str('It is a guide to action that ensures that the military ' - ... 'will forever heed Party commands').split() - >>> ref2 = str('It is the guiding principle which guarantees the military ' - ... 'forces always being under the command of the Party').split() - >>> - >>> hyp1 = str('It is a guide to action which ensures that the military ' - ... 'always obeys the commands of the party').split() - >>> hyp2 = str('It is to insure the troops forever hearing the activity ' - ... 'guidebook that party direct') - >>> corpus_chrf([ref1, ref2, ref1, ref2], [hyp1, hyp2, hyp2, hyp1]) # doctest: +ELLIPSIS - 0.3910... - - :param references: a corpus of list of reference sentences, w.r.t. hypotheses - :type references: list(list(str)) - :param hypotheses: a list of hypothesis sentences - :type hypotheses: list(list(str)) - :param min_len: The minimum order of n-gram this function should extract. - :type min_len: int - :param max_len: The maximum order of n-gram this function should extract. 
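# Sketch of sentence- and corpus-level chrF as documented above, assuming nltk
# is installed; the sentence pair reuses the doctest example, so the segment
# score should come out near 0.6349.
from nltk.translate.chrf_score import corpus_chrf, sentence_chrf

ref = ("It is a guide to action that ensures that the military "
       "will forever heed Party commands").split()
hyp = ("It is a guide to action which ensures that the military "
       "always obeys the commands of the party").split()

print(round(sentence_chrf(ref, hyp), 4))              # segment-level score
print(round(corpus_chrf([ref, ref], [hyp, hyp]), 4))  # macro-average over segments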
- :type max_len: int - :param beta: the parameter to assign more importance to recall over precision - :type beta: float - :param ignore_whitespace: ignore whitespace characters in scoring - :type ignore_whitespace: bool - :return: the sentence level CHRF score. - :rtype: float - """ - - assert len(references) == len( - hypotheses - ), "The number of hypotheses and their references should be the same" - num_sents = len(hypotheses) - - # Keep f-scores for each n-gram order separate - ngram_fscores = defaultdict(lambda: list()) - - # Iterate through each hypothesis and their corresponding references. - for reference, hypothesis in zip(references, hypotheses): - - # preprocess both reference and hypothesis - reference = _preprocess(reference, ignore_whitespace) - hypothesis = _preprocess(hypothesis, ignore_whitespace) - - # Calculate f-scores for each sentence and for each n-gram order - # separately. - for n in range(min_len, max_len + 1): - # Compute the precision, recall, fscore and support. - prec, rec, fscore, tp = chrf_precision_recall_fscore_support( - reference, hypothesis, n, beta=beta - ) - ngram_fscores[n].append(fscore) - - # how many n-gram sizes - num_ngram_sizes = len(ngram_fscores) - - # sum of f-scores over all sentences for each n-gram order - total_scores = [sum(fscores) for n, fscores in ngram_fscores.items()] - - # macro-average over n-gram orders and over all sentences - return (sum(total_scores) / num_ngram_sizes) / num_sents diff --git a/pipeline/nltk/translate/gale_church.py b/pipeline/nltk/translate/gale_church.py deleted file mode 100644 index d7c81940d9ac27c159b680d688343e67e9ef9c58..0000000000000000000000000000000000000000 --- a/pipeline/nltk/translate/gale_church.py +++ /dev/null @@ -1,263 +0,0 @@ -# Natural Language Toolkit: Gale-Church Aligner -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Torsten Marek -# Contributor: Cassidy Laidlaw, Liling Tan -# URL: -# For license information, see LICENSE.TXT - -""" - -A port of the Gale-Church Aligner. - -Gale & Church (1993), A Program for Aligning Sentences in Bilingual Corpora. -https://aclweb.org/anthology/J93-1004.pdf - -""" - -import math - -try: - from norm import logsf as norm_logsf - from scipy.stats import norm -except ImportError: - - def erfcc(x): - """Complementary error function.""" - z = abs(x) - t = 1 / (1 + 0.5 * z) - r = t * math.exp( - -z * z - - 1.26551223 - + t - * ( - 1.00002368 - + t - * ( - 0.37409196 - + t - * ( - 0.09678418 - + t - * ( - -0.18628806 - + t - * ( - 0.27886807 - + t - * ( - -1.13520398 - + t - * (1.48851587 + t * (-0.82215223 + t * 0.17087277)) - ) - ) - ) - ) - ) - ) - ) - if x >= 0.0: - return r - else: - return 2.0 - r - - def norm_cdf(x): - """Return the area under the normal distribution from M{-∞..x}.""" - return 1 - 0.5 * erfcc(x / math.sqrt(2)) - - def norm_logsf(x): - try: - return math.log(1 - norm_cdf(x)) - except ValueError: - return float("-inf") - - -LOG2 = math.log(2) - - -class LanguageIndependent: - # These are the language-independent probabilities and parameters - # given in Gale & Church - - # for the computation, l_1 is always the language with less characters - PRIORS = { - (1, 0): 0.0099, - (0, 1): 0.0099, - (1, 1): 0.89, - (2, 1): 0.089, - (1, 2): 0.089, - (2, 2): 0.011, - } - - AVERAGE_CHARACTERS = 1 - VARIANCE_CHARACTERS = 6.8 - - -def trace(backlinks, source_sents_lens, target_sents_lens): - """ - Traverse the alignment cost from the tracebacks and retrieves - appropriate sentence pairs. 
- - :param backlinks: A dictionary where the key is the alignment points and value is the cost (referencing the LanguageIndependent.PRIORS) - :type backlinks: dict - :param source_sents_lens: A list of target sentences' lengths - :type source_sents_lens: list(int) - :param target_sents_lens: A list of target sentences' lengths - :type target_sents_lens: list(int) - """ - links = [] - position = (len(source_sents_lens), len(target_sents_lens)) - while position != (0, 0) and all(p >= 0 for p in position): - try: - s, t = backlinks[position] - except TypeError: - position = (position[0] - 1, position[1] - 1) - continue - for i in range(s): - for j in range(t): - links.append((position[0] - i - 1, position[1] - j - 1)) - position = (position[0] - s, position[1] - t) - - return links[::-1] - - -def align_log_prob(i, j, source_sents, target_sents, alignment, params): - """Returns the log probability of the two sentences C{source_sents[i]}, C{target_sents[j]} - being aligned with a specific C{alignment}. - - @param i: The offset of the source sentence. - @param j: The offset of the target sentence. - @param source_sents: The list of source sentence lengths. - @param target_sents: The list of target sentence lengths. - @param alignment: The alignment type, a tuple of two integers. - @param params: The sentence alignment parameters. - - @returns: The log probability of a specific alignment between the two sentences, given the parameters. - """ - l_s = sum(source_sents[i - offset - 1] for offset in range(alignment[0])) - l_t = sum(target_sents[j - offset - 1] for offset in range(alignment[1])) - try: - # actually, the paper says l_s * params.VARIANCE_CHARACTERS, this is based on the C - # reference implementation. With l_s in the denominator, insertions are impossible. - m = (l_s + l_t / params.AVERAGE_CHARACTERS) / 2 - delta = (l_s * params.AVERAGE_CHARACTERS - l_t) / math.sqrt( - m * params.VARIANCE_CHARACTERS - ) - except ZeroDivisionError: - return float("-inf") - - return -(LOG2 + norm_logsf(abs(delta)) + math.log(params.PRIORS[alignment])) - - -def align_blocks(source_sents_lens, target_sents_lens, params=LanguageIndependent): - """Return the sentence alignment of two text blocks (usually paragraphs). - - >>> align_blocks([5,5,5], [7,7,7]) - [(0, 0), (1, 1), (2, 2)] - >>> align_blocks([10,5,5], [12,20]) - [(0, 0), (1, 1), (2, 1)] - >>> align_blocks([12,20], [10,5,5]) - [(0, 0), (1, 1), (1, 2)] - >>> align_blocks([10,2,10,10,2,10], [12,3,20,3,12]) - [(0, 0), (1, 1), (2, 2), (3, 2), (4, 3), (5, 4)] - - @param source_sents_lens: The list of source sentence lengths. - @param target_sents_lens: The list of target sentence lengths. - @param params: the sentence alignment parameters. - @return: The sentence alignments, a list of index pairs. 
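# Sketch of the Gale-Church entry point defined above, assuming nltk is
# installed; inputs are per-block sentence lengths in characters and mirror the
# doctest.
from nltk.translate.gale_church import align_blocks

source_sentence_lengths = [10, 5, 5]
target_sentence_lengths = [12, 20]
print(align_blocks(source_sentence_lengths, target_sentence_lengths))
# [(0, 0), (1, 1), (2, 1)]: the two short trailing source sentences both align
# to the long second target sentence.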
- """ - - alignment_types = list(params.PRIORS.keys()) - - # there are always three rows in the history (with the last of them being filled) - D = [[]] - - backlinks = {} - - for i in range(len(source_sents_lens) + 1): - for j in range(len(target_sents_lens) + 1): - min_dist = float("inf") - min_align = None - for a in alignment_types: - prev_i = -1 - a[0] - prev_j = j - a[1] - if prev_i < -len(D) or prev_j < 0: - continue - p = D[prev_i][prev_j] + align_log_prob( - i, j, source_sents_lens, target_sents_lens, a, params - ) - if p < min_dist: - min_dist = p - min_align = a - - if min_dist == float("inf"): - min_dist = 0 - - backlinks[(i, j)] = min_align - D[-1].append(min_dist) - - if len(D) > 2: - D.pop(0) - D.append([]) - - return trace(backlinks, source_sents_lens, target_sents_lens) - - -def align_texts(source_blocks, target_blocks, params=LanguageIndependent): - """Creates the sentence alignment of two texts. - - Texts can consist of several blocks. Block boundaries cannot be crossed by sentence - alignment links. - - Each block consists of a list that contains the lengths (in characters) of the sentences - in this block. - - @param source_blocks: The list of blocks in the source text. - @param target_blocks: The list of blocks in the target text. - @param params: the sentence alignment parameters. - - @returns: A list of sentence alignment lists - """ - if len(source_blocks) != len(target_blocks): - raise ValueError( - "Source and target texts do not have the same number of blocks." - ) - - return [ - align_blocks(source_block, target_block, params) - for source_block, target_block in zip(source_blocks, target_blocks) - ] - - -# File I/O functions; may belong in a corpus reader - - -def split_at(it, split_value): - """Splits an iterator C{it} at values of C{split_value}. - - Each instance of C{split_value} is swallowed. The iterator produces - subiterators which need to be consumed fully before the next subiterator - can be used. - """ - - def _chunk_iterator(first): - v = first - while v != split_value: - yield v - v = it.next() - - while True: - yield _chunk_iterator(it.next()) - - -def parse_token_stream(stream, soft_delimiter, hard_delimiter): - """Parses a stream of tokens and splits it into sentences (using C{soft_delimiter} tokens) - and blocks (using C{hard_delimiter} tokens) for use with the L{align_texts} function. - """ - return [ - [ - sum(len(token) for token in sentence_it) - for sentence_it in split_at(block_it, soft_delimiter) - ] - for block_it in split_at(stream, hard_delimiter) - ] diff --git a/pipeline/nltk/translate/gdfa.py b/pipeline/nltk/translate/gdfa.py deleted file mode 100644 index 57df0cea63b35bfbf83f9d330bf137563b332a33..0000000000000000000000000000000000000000 --- a/pipeline/nltk/translate/gdfa.py +++ /dev/null @@ -1,138 +0,0 @@ -# Natural Language Toolkit: GDFA word alignment symmetrization -# -# Copyright (C) 2001-2023 NLTK Project -# Authors: Liling Tan -# URL: -# For license information, see LICENSE.TXT - -from collections import defaultdict - - -def grow_diag_final_and(srclen, trglen, e2f, f2e): - """ - This module symmetrisatizes the source-to-target and target-to-source - word alignment output and produces, aka. GDFA algorithm (Koehn, 2005). - - Step 1: Find the intersection of the bidirectional alignment. - - Step 2: Search for additional neighbor alignment points to be added, given - these criteria: (i) neighbor alignments points are not in the - intersection and (ii) neighbor alignments are in the union. 
- - Step 3: Add all other alignment points that are not in the intersection, not in - the neighboring alignments that met the criteria but in the original - forward/backward alignment outputs. - - >>> forw = ('0-0 2-1 9-2 21-3 10-4 7-5 11-6 9-7 12-8 1-9 3-10 ' - ... '4-11 17-12 17-13 25-14 13-15 24-16 11-17 28-18') - >>> back = ('0-0 1-9 2-9 3-10 4-11 5-12 6-6 7-5 8-6 9-7 10-4 ' - ... '11-6 12-8 13-12 15-12 17-13 18-13 19-12 20-13 ' - ... '21-3 22-12 23-14 24-17 25-15 26-17 27-18 28-18') - >>> srctext = ("この よう な ハロー 白色 わい 星 の L 関数 " - ... "は L と 共 に 不連続 に 増加 する こと が " - ... "期待 さ れる こと を 示し た 。") - >>> trgtext = ("Therefore , we expect that the luminosity function " - ... "of such halo white dwarfs increases discontinuously " - ... "with the luminosity .") - >>> srclen = len(srctext.split()) - >>> trglen = len(trgtext.split()) - >>> - >>> gdfa = grow_diag_final_and(srclen, trglen, forw, back) - >>> gdfa == sorted(set([(28, 18), (6, 6), (24, 17), (2, 1), (15, 12), (13, 12), - ... (2, 9), (3, 10), (26, 17), (25, 15), (8, 6), (9, 7), (20, - ... 13), (18, 13), (0, 0), (10, 4), (13, 15), (23, 14), (7, 5), - ... (25, 14), (1, 9), (17, 13), (4, 11), (11, 17), (9, 2), (22, - ... 12), (27, 18), (24, 16), (21, 3), (19, 12), (17, 12), (5, - ... 12), (11, 6), (12, 8)])) - True - - References: - Koehn, P., A. Axelrod, A. Birch, C. Callison, M. Osborne, and D. Talbot. - 2005. Edinburgh System Description for the 2005 IWSLT Speech - Translation Evaluation. In MT Eval Workshop. - - :type srclen: int - :param srclen: the number of tokens in the source language - :type trglen: int - :param trglen: the number of tokens in the target language - :type e2f: str - :param e2f: the forward word alignment outputs from source-to-target - language (in pharaoh output format) - :type f2e: str - :param f2e: the backward word alignment outputs from target-to-source - language (in pharaoh output format) - :rtype: set(tuple(int)) - :return: the symmetrized alignment points from the GDFA algorithm - """ - - # Converts pharaoh text format into list of tuples. - e2f = [tuple(map(int, a.split("-"))) for a in e2f.split()] - f2e = [tuple(map(int, a.split("-"))) for a in f2e.split()] - - neighbors = [(-1, 0), (0, -1), (1, 0), (0, 1), (-1, -1), (-1, 1), (1, -1), (1, 1)] - alignment = set(e2f).intersection(set(f2e)) # Find the intersection. - union = set(e2f).union(set(f2e)) - - # *aligned* is used to check if neighbors are aligned in grow_diag() - aligned = defaultdict(set) - for i, j in alignment: - aligned["e"].add(i) - aligned["f"].add(j) - - def grow_diag(): - """ - Search for the neighbor points and them to the intersected alignment - points if criteria are met. - """ - prev_len = len(alignment) - 1 - # iterate until no new points added - while prev_len < len(alignment): - no_new_points = True - # for english word e = 0 ... en - for e in range(srclen): - # for foreign word f = 0 ... 
fn - for f in range(trglen): - # if ( e aligned with f) - if (e, f) in alignment: - # for each neighboring point (e-new, f-new) - for neighbor in neighbors: - neighbor = tuple(i + j for i, j in zip((e, f), neighbor)) - e_new, f_new = neighbor - # if ( ( e-new not aligned and f-new not aligned) - # and (e-new, f-new in union(e2f, f2e) ) - if ( - e_new not in aligned and f_new not in aligned - ) and neighbor in union: - alignment.add(neighbor) - aligned["e"].add(e_new) - aligned["f"].add(f_new) - prev_len += 1 - no_new_points = False - # iterate until no new points added - if no_new_points: - break - - def final_and(a): - """ - Adds remaining points that are not in the intersection, not in the - neighboring alignments but in the original *e2f* and *f2e* alignments - """ - # for english word e = 0 ... en - for e_new in range(srclen): - # for foreign word f = 0 ... fn - for f_new in range(trglen): - # if ( ( e-new not aligned and f-new not aligned) - # and (e-new, f-new in union(e2f, f2e) ) - if ( - e_new not in aligned - and f_new not in aligned - and (e_new, f_new) in union - ): - alignment.add((e_new, f_new)) - aligned["e"].add(e_new) - aligned["f"].add(f_new) - - grow_diag() - final_and(e2f) - final_and(f2e) - return sorted(alignment) diff --git a/pipeline/nltk/translate/gleu_score.py b/pipeline/nltk/translate/gleu_score.py deleted file mode 100644 index 81932a73fb5bdd34e539dfd9d1b46f179fc26558..0000000000000000000000000000000000000000 --- a/pipeline/nltk/translate/gleu_score.py +++ /dev/null @@ -1,190 +0,0 @@ -# Natural Language Toolkit: GLEU Score -# -# Copyright (C) 2001-2023 NLTK Project -# Authors: -# Contributors: Mike Schuster, Michael Wayne Goodman, Liling Tan -# URL: -# For license information, see LICENSE.TXT - -""" GLEU score implementation. """ - -from collections import Counter - -from nltk.util import everygrams, ngrams - - -def sentence_gleu(references, hypothesis, min_len=1, max_len=4): - """ - Calculates the sentence level GLEU (Google-BLEU) score described in - - Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V. Le, Mohammad Norouzi, - Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, - Jeff Klingner, Apurva Shah, Melvin Johnson, Xiaobing Liu, Lukasz Kaiser, - Stephan Gouws, Yoshikiyo Kato, Taku Kudo, Hideto Kazawa, Keith Stevens, - George Kurian, Nishant Patil, Wei Wang, Cliff Young, Jason Smith, - Jason Riesa, Alex Rudnick, Oriol Vinyals, Greg Corrado, Macduff Hughes, - Jeffrey Dean. (2016) Google’s Neural Machine Translation System: - Bridging the Gap between Human and Machine Translation. - eprint arXiv:1609.08144. https://arxiv.org/pdf/1609.08144v2.pdf - Retrieved on 27 Oct 2016. - - From Wu et al. (2016): - "The BLEU score has some undesirable properties when used for single - sentences, as it was designed to be a corpus measure. We therefore - use a slightly different score for our RL experiments which we call - the 'GLEU score'. For the GLEU score, we record all sub-sequences of - 1, 2, 3 or 4 tokens in output and target sequence (n-grams). We then - compute a recall, which is the ratio of the number of matching n-grams - to the number of total n-grams in the target (ground truth) sequence, - and a precision, which is the ratio of the number of matching n-grams - to the number of total n-grams in the generated output sequence. Then - GLEU score is simply the minimum of recall and precision. This GLEU - score's range is always between 0 (no matches) and 1 (all match) and - it is symmetrical when switching output and target. 
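# A minimal sketch of Steps 1-2 of grow_diag_final_and above: parse the
# Pharaoh-format strings, then take the intersection (the high-precision seed)
# and the union (the candidate pool that grow_diag searches). The toy
# alignment strings below are invented for illustration.
def parse_pharaoh(s):
    # "0-0 1-2" -> {(0, 0), (1, 2)}
    return {tuple(map(int, pair.split("-"))) for pair in s.split()}

e2f = parse_pharaoh("0-0 1-1 2-3")
f2e = parse_pharaoh("0-0 1-1 1-2")

seed = e2f & f2e          # Step 1: the intersection, here {(0, 0), (1, 1)}
candidates = e2f | f2e    # Step 2 only adds neighbours that lie in this union
print(sorted(seed), sorted(candidates))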
According to - our experiments, GLEU score correlates quite well with the BLEU - metric on a corpus level but does not have its drawbacks for our per - sentence reward objective." - - Note: The initial implementation only allowed a single reference, but now - a list of references is required (which is consistent with - bleu_score.sentence_bleu()). - - The infamous "the the the ... " example - - >>> ref = 'the cat is on the mat'.split() - >>> hyp = 'the the the the the the the'.split() - >>> sentence_gleu([ref], hyp) # doctest: +ELLIPSIS - 0.0909... - - An example to evaluate normal machine translation outputs - - >>> ref1 = str('It is a guide to action that ensures that the military ' - ... 'will forever heed Party commands').split() - >>> hyp1 = str('It is a guide to action which ensures that the military ' - ... 'always obeys the commands of the party').split() - >>> hyp2 = str('It is to insure the troops forever hearing the activity ' - ... 'guidebook that party direct').split() - >>> sentence_gleu([ref1], hyp1) # doctest: +ELLIPSIS - 0.4393... - >>> sentence_gleu([ref1], hyp2) # doctest: +ELLIPSIS - 0.1206... - - :param references: a list of reference sentences - :type references: list(list(str)) - :param hypothesis: a hypothesis sentence - :type hypothesis: list(str) - :param min_len: The minimum order of n-gram this function should extract. - :type min_len: int - :param max_len: The maximum order of n-gram this function should extract. - :type max_len: int - :return: the sentence level GLEU score. - :rtype: float - """ - return corpus_gleu([references], [hypothesis], min_len=min_len, max_len=max_len) - - -def corpus_gleu(list_of_references, hypotheses, min_len=1, max_len=4): - """ - Calculate a single corpus-level GLEU score (aka. system-level GLEU) for all - the hypotheses and their respective references. - - Instead of averaging the sentence level GLEU scores (i.e. macro-average - precision), Wu et al. (2016) sum up the matching tokens and the max of - hypothesis and reference tokens for each sentence, then compute using the - aggregate values. - - From Mike Schuster (via email): - "For the corpus, we just add up the two statistics n_match and - n_all = max(n_all_output, n_all_target) for all sentences, then - calculate gleu_score = n_match / n_all, so it is not just a mean of - the sentence gleu scores (in our case, longer sentences count more, - which I think makes sense as they are more difficult to translate)." - - >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', - ... 'ensures', 'that', 'the', 'military', 'always', - ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] - >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', - ... 'ensures', 'that', 'the', 'military', 'will', 'forever', - ... 'heed', 'Party', 'commands'] - >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which', - ... 'guarantees', 'the', 'military', 'forces', 'always', - ... 'being', 'under', 'the', 'command', 'of', 'the', 'Party'] - >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', - ... 'army', 'always', 'to', 'heed', 'the', 'directions', - ... 'of', 'the', 'party'] - - >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was', - ... 'interested', 'in', 'world', 'history'] - >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history', - ... 
'because', 'he', 'read', 'the', 'book'] - - >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]] - >>> hypotheses = [hyp1, hyp2] - >>> corpus_gleu(list_of_references, hypotheses) # doctest: +ELLIPSIS - 0.5673... - - The example below show that corpus_gleu() is different from averaging - sentence_gleu() for hypotheses - - >>> score1 = sentence_gleu([ref1a], hyp1) - >>> score2 = sentence_gleu([ref2a], hyp2) - >>> (score1 + score2) / 2 # doctest: +ELLIPSIS - 0.6144... - - :param list_of_references: a list of reference sentences, w.r.t. hypotheses - :type list_of_references: list(list(list(str))) - :param hypotheses: a list of hypothesis sentences - :type hypotheses: list(list(str)) - :param min_len: The minimum order of n-gram this function should extract. - :type min_len: int - :param max_len: The maximum order of n-gram this function should extract. - :type max_len: int - :return: The corpus-level GLEU score. - :rtype: float - """ - # sanity check - assert len(list_of_references) == len( - hypotheses - ), "The number of hypotheses and their reference(s) should be the same" - - # sum matches and max-token-lengths over all sentences - corpus_n_match = 0 - corpus_n_all = 0 - - for references, hypothesis in zip(list_of_references, hypotheses): - hyp_ngrams = Counter(everygrams(hypothesis, min_len, max_len)) - tpfp = sum(hyp_ngrams.values()) # True positives + False positives. - - hyp_counts = [] - for reference in references: - ref_ngrams = Counter(everygrams(reference, min_len, max_len)) - tpfn = sum(ref_ngrams.values()) # True positives + False negatives. - - overlap_ngrams = ref_ngrams & hyp_ngrams - tp = sum(overlap_ngrams.values()) # True positives. - - # While GLEU is defined as the minimum of precision and - # recall, we can reduce the number of division operations by one by - # instead finding the maximum of the denominators for the precision - # and recall formulae, since the numerators are the same: - # precision = tp / tpfp - # recall = tp / tpfn - # gleu_score = min(precision, recall) == tp / max(tpfp, tpfn) - n_all = max(tpfp, tpfn) - - if n_all > 0: - hyp_counts.append((tp, n_all)) - - # use the reference yielding the highest score - if hyp_counts: - n_match, n_all = max(hyp_counts, key=lambda hc: hc[0] / hc[1]) - corpus_n_match += n_match - corpus_n_all += n_all - - # corner case: empty corpus or empty references---don't divide by zero! - if corpus_n_all == 0: - gleu_score = 0.0 - else: - gleu_score = corpus_n_match / corpus_n_all - - return gleu_score diff --git a/pipeline/nltk/translate/ibm1.py b/pipeline/nltk/translate/ibm1.py deleted file mode 100644 index badb896968633d0db99f9b8fb2a7679b65d9a534..0000000000000000000000000000000000000000 --- a/pipeline/nltk/translate/ibm1.py +++ /dev/null @@ -1,251 +0,0 @@ -# Natural Language Toolkit: IBM Model 1 -# -# Copyright (C) 2001-2013 NLTK Project -# Author: Chin Yee Lee -# Hengfeng Li -# Ruxin Hou -# Calvin Tanujaya Lim -# Based on earlier version by: -# Will Zhang -# Guan Gui -# URL: -# For license information, see LICENSE.TXT - -""" -Lexical translation model that ignores word order. - -In IBM Model 1, word order is ignored for simplicity. As long as the -word alignments are equivalent, it doesn't matter where the word occurs -in the source or target sentence. 
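# A minimal sketch of the GLEU statistic used by corpus_gleu above: count all
# 1..4-grams in hypothesis and reference, then divide the matched n-grams by
# the larger of the two totals, which equals min(precision, recall). The small
# helper below is a stand-in for nltk.util.everygrams.
from collections import Counter

def all_ngrams(tokens, min_len=1, max_len=4):
    return Counter(
        tuple(tokens[i:i + n])
        for n in range(min_len, max_len + 1)
        for i in range(len(tokens) - n + 1)
    )

hyp = "the the the the the the the".split()
ref = "the cat is on the mat".split()
hyp_ngrams, ref_ngrams = all_ngrams(hyp), all_ngrams(ref)
tp = sum((hyp_ngrams & ref_ngrams).values())                 # matched n-grams
n_all = max(sum(hyp_ngrams.values()), sum(ref_ngrams.values()))
print(tp / n_all)   # 2 / 22 = 0.0909..., matching the sentence_gleu doctest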
Thus, the following three alignments -are equally likely:: - - Source: je mange du jambon - Target: i eat some ham - Alignment: (0,0) (1,1) (2,2) (3,3) - - Source: je mange du jambon - Target: some ham eat i - Alignment: (0,2) (1,3) (2,1) (3,1) - - Source: du jambon je mange - Target: eat i some ham - Alignment: (0,3) (1,2) (2,0) (3,1) - -Note that an alignment is represented here as -(word_index_in_target, word_index_in_source). - -The EM algorithm used in Model 1 is: - -:E step: In the training data, count how many times a source language - word is translated into a target language word, weighted by - the prior probability of the translation. - -:M step: Estimate the new probability of translation based on the - counts from the Expectation step. - -Notations ---------- - -:i: Position in the source sentence - Valid values are 0 (for NULL), 1, 2, ..., length of source sentence -:j: Position in the target sentence - Valid values are 1, 2, ..., length of target sentence -:s: A word in the source language -:t: A word in the target language - -References ----------- - -Philipp Koehn. 2010. Statistical Machine Translation. -Cambridge University Press, New York. - -Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and -Robert L. Mercer. 1993. The Mathematics of Statistical Machine -Translation: Parameter Estimation. Computational Linguistics, 19 (2), -263-311. -""" - -import warnings -from collections import defaultdict - -from nltk.translate import AlignedSent, Alignment, IBMModel -from nltk.translate.ibm_model import Counts - - -class IBMModel1(IBMModel): - """ - Lexical translation model that ignores word order - - >>> bitext = [] - >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small'])) - >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big'])) - >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small'])) - >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house'])) - >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book'])) - >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book'])) - - >>> ibm1 = IBMModel1(bitext, 5) - - >>> print(round(ibm1.translation_table['buch']['book'], 3)) - 0.889 - >>> print(round(ibm1.translation_table['das']['book'], 3)) - 0.062 - >>> print(round(ibm1.translation_table['buch'][None], 3)) - 0.113 - >>> print(round(ibm1.translation_table['ja'][None], 3)) - 0.073 - - >>> test_sentence = bitext[2] - >>> test_sentence.words - ['das', 'buch', 'ist', 'ja', 'klein'] - >>> test_sentence.mots - ['the', 'book', 'is', 'small'] - >>> test_sentence.alignment - Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)]) - - """ - - def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None): - """ - Train on ``sentence_aligned_corpus`` and create a lexical - translation model. - - Translation direction is from ``AlignedSent.mots`` to - ``AlignedSent.words``. - - :param sentence_aligned_corpus: Sentence-aligned parallel corpus - :type sentence_aligned_corpus: list(AlignedSent) - - :param iterations: Number of iterations to run training algorithm - :type iterations: int - - :param probability_tables: Optional. Use this to pass in custom - probability values. If not specified, probabilities will be - set to a uniform distribution, or some other sensible value. - If specified, the following entry must be present: - ``translation_table``. - See ``IBMModel`` for the type and purpose of this table. 
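# A minimal sketch of one Model 1 E step for a single sentence pair, with
# made-up t(t | s) values. Each target word spreads one unit of expected count
# over the source words (including NULL) in proportion to the current
# probabilities, mirroring the E step in train() further down; the 0.01
# fallback stands in for the uniform initial probability of unseen pairs.
from collections import defaultdict

t_table = {                      # toy t(t | s), not learned values
    ("house", "haus"): 0.5, ("house", None): 0.1,
    ("the", "das"): 0.6, ("the", None): 0.2,
}
src = [None, "das", "haus"]      # NULL is prepended, as in the real model
trg = ["the", "house"]

counts = defaultdict(float)
for t in trg:
    total = sum(t_table.get((t, s), 0.01) for s in src)   # normalisation for t
    for s in src:
        counts[(t, s)] += t_table.get((t, s), 0.01) / total
print(dict(counts))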
- :type probability_tables: dict[str]: object - """ - super().__init__(sentence_aligned_corpus) - - if probability_tables is None: - self.set_uniform_probabilities(sentence_aligned_corpus) - else: - # Set user-defined probabilities - self.translation_table = probability_tables["translation_table"] - - for n in range(0, iterations): - self.train(sentence_aligned_corpus) - - self.align_all(sentence_aligned_corpus) - - def set_uniform_probabilities(self, sentence_aligned_corpus): - initial_prob = 1 / len(self.trg_vocab) - if initial_prob < IBMModel.MIN_PROB: - warnings.warn( - "Target language vocabulary is too large (" - + str(len(self.trg_vocab)) - + " words). " - "Results may be less accurate." - ) - - for t in self.trg_vocab: - self.translation_table[t] = defaultdict(lambda: initial_prob) - - def train(self, parallel_corpus): - counts = Counts() - for aligned_sentence in parallel_corpus: - trg_sentence = aligned_sentence.words - src_sentence = [None] + aligned_sentence.mots - - # E step (a): Compute normalization factors to weigh counts - total_count = self.prob_all_alignments(src_sentence, trg_sentence) - - # E step (b): Collect counts - for t in trg_sentence: - for s in src_sentence: - count = self.prob_alignment_point(s, t) - normalized_count = count / total_count[t] - counts.t_given_s[t][s] += normalized_count - counts.any_t_given_s[s] += normalized_count - - # M step: Update probabilities with maximum likelihood estimate - self.maximize_lexical_translation_probabilities(counts) - - def prob_all_alignments(self, src_sentence, trg_sentence): - """ - Computes the probability of all possible word alignments, - expressed as a marginal distribution over target words t - - Each entry in the return value represents the contribution to - the total alignment probability by the target word t. - - To obtain probability(alignment | src_sentence, trg_sentence), - simply sum the entries in the return value. - - :return: Probability of t for all s in ``src_sentence`` - :rtype: dict(str): float - """ - alignment_prob_for_t = defaultdict(lambda: 0.0) - for t in trg_sentence: - for s in src_sentence: - alignment_prob_for_t[t] += self.prob_alignment_point(s, t) - return alignment_prob_for_t - - def prob_alignment_point(self, s, t): - """ - Probability that word ``t`` in the target sentence is aligned to - word ``s`` in the source sentence - """ - return self.translation_table[t][s] - - def prob_t_a_given_s(self, alignment_info): - """ - Probability of target sentence and an alignment given the - source sentence - """ - prob = 1.0 - - for j, i in enumerate(alignment_info.alignment): - if j == 0: - continue # skip the dummy zeroeth element - trg_word = alignment_info.trg_sentence[j] - src_word = alignment_info.src_sentence[i] - prob *= self.translation_table[trg_word][src_word] - - return max(prob, IBMModel.MIN_PROB) - - def align_all(self, parallel_corpus): - for sentence_pair in parallel_corpus: - self.align(sentence_pair) - - def align(self, sentence_pair): - """ - Determines the best word alignment for one sentence pair from - the corpus that the model was trained on. - - The best alignment will be set in ``sentence_pair`` when the - method returns. In contrast with the internal implementation of - IBM models, the word indices in the ``Alignment`` are zero- - indexed, not one-indexed. 
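# A minimal sketch of the matching Model 1 M step: renormalise the expected
# counts from the E step into new t(t | s) values, as done by the
# maximize_lexical_translation_probabilities() call in train() above. The
# expected counts below are invented for illustration.
from collections import defaultdict

pair_counts = {("the", "das"): 0.9, ("the", None): 0.1,
               ("house", "haus"): 0.8, ("house", None): 0.2}
source_totals = defaultdict(float)
for (t, s), c in pair_counts.items():
    source_totals[s] += c

new_t_table = {(t, s): c / source_totals[s] for (t, s), c in pair_counts.items()}
print(new_t_table[("the", "das")])   # 0.9 / 0.9 = 1.0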
- - :param sentence_pair: A sentence in the source language and its - counterpart sentence in the target language - :type sentence_pair: AlignedSent - """ - best_alignment = [] - - for j, trg_word in enumerate(sentence_pair.words): - # Initialize trg_word to align with the NULL token - best_prob = max(self.translation_table[trg_word][None], IBMModel.MIN_PROB) - best_alignment_point = None - for i, src_word in enumerate(sentence_pair.mots): - align_prob = self.translation_table[trg_word][src_word] - if align_prob >= best_prob: # prefer newer word in case of tie - best_prob = align_prob - best_alignment_point = i - - best_alignment.append((j, best_alignment_point)) - - sentence_pair.alignment = Alignment(best_alignment) diff --git a/pipeline/nltk/translate/ibm2.py b/pipeline/nltk/translate/ibm2.py deleted file mode 100644 index 0b3ff375f045f4a809778ea8d3221e6b62e5e2ad..0000000000000000000000000000000000000000 --- a/pipeline/nltk/translate/ibm2.py +++ /dev/null @@ -1,319 +0,0 @@ -# Natural Language Toolkit: IBM Model 2 -# -# Copyright (C) 2001-2013 NLTK Project -# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim -# URL: -# For license information, see LICENSE.TXT - -""" -Lexical translation model that considers word order. - -IBM Model 2 improves on Model 1 by accounting for word order. -An alignment probability is introduced, a(i | j,l,m), which predicts -a source word position, given its aligned target word's position. - -The EM algorithm used in Model 2 is: - -:E step: In the training data, collect counts, weighted by prior - probabilities. - - - (a) count how many times a source language word is translated - into a target language word - - (b) count how many times a particular position in the source - sentence is aligned to a particular position in the target - sentence - -:M step: Estimate new probabilities based on the counts from the E step - -Notations ---------- - -:i: Position in the source sentence - Valid values are 0 (for NULL), 1, 2, ..., length of source sentence -:j: Position in the target sentence - Valid values are 1, 2, ..., length of target sentence -:l: Number of words in the source sentence, excluding NULL -:m: Number of words in the target sentence -:s: A word in the source language -:t: A word in the target language - -References ----------- - -Philipp Koehn. 2010. Statistical Machine Translation. -Cambridge University Press, New York. - -Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and -Robert L. Mercer. 1993. The Mathematics of Statistical Machine -Translation: Parameter Estimation. Computational Linguistics, 19 (2), -263-311. 
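# A minimal sketch of the quantity Model 2 adds on top of Model 1: the score
# of a single alignment point combines the lexical probability t(t | s) with
# the positional probability a(i | j, l, m). Normalising the per-position
# scores for one target position j gives the posterior over source positions,
# mirroring prob_all_alignments() in the class below. All numbers are invented.
def alignment_point_prob(t_prob, a_prob):
    # t_prob: t(t | s_i), a_prob: a(i | j, l, m)
    return t_prob * a_prob

scores = [alignment_point_prob(0.05, 0.2),   # i = 0 (NULL)
          alignment_point_prob(0.60, 0.5),   # i = 1
          alignment_point_prob(0.10, 0.3)]   # i = 2
total = sum(scores)
print([round(s / total, 3) for s in scores])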
-""" - -import warnings -from collections import defaultdict - -from nltk.translate import AlignedSent, Alignment, IBMModel, IBMModel1 -from nltk.translate.ibm_model import Counts - - -class IBMModel2(IBMModel): - """ - Lexical translation model that considers word order - - >>> bitext = [] - >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small'])) - >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big'])) - >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small'])) - >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house'])) - >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book'])) - >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book'])) - - >>> ibm2 = IBMModel2(bitext, 5) - - >>> print(round(ibm2.translation_table['buch']['book'], 3)) - 1.0 - >>> print(round(ibm2.translation_table['das']['book'], 3)) - 0.0 - >>> print(round(ibm2.translation_table['buch'][None], 3)) - 0.0 - >>> print(round(ibm2.translation_table['ja'][None], 3)) - 0.0 - - >>> print(round(ibm2.alignment_table[1][1][2][2], 3)) - 0.939 - >>> print(round(ibm2.alignment_table[1][2][2][2], 3)) - 0.0 - >>> print(round(ibm2.alignment_table[2][2][4][5], 3)) - 1.0 - - >>> test_sentence = bitext[2] - >>> test_sentence.words - ['das', 'buch', 'ist', 'ja', 'klein'] - >>> test_sentence.mots - ['the', 'book', 'is', 'small'] - >>> test_sentence.alignment - Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)]) - - """ - - def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None): - """ - Train on ``sentence_aligned_corpus`` and create a lexical - translation model and an alignment model. - - Translation direction is from ``AlignedSent.mots`` to - ``AlignedSent.words``. - - :param sentence_aligned_corpus: Sentence-aligned parallel corpus - :type sentence_aligned_corpus: list(AlignedSent) - - :param iterations: Number of iterations to run training algorithm - :type iterations: int - - :param probability_tables: Optional. Use this to pass in custom - probability values. If not specified, probabilities will be - set to a uniform distribution, or some other sensible value. - If specified, all the following entries must be present: - ``translation_table``, ``alignment_table``. - See ``IBMModel`` for the type and purpose of these tables. 
- :type probability_tables: dict[str]: object - """ - super().__init__(sentence_aligned_corpus) - - if probability_tables is None: - # Get translation probabilities from IBM Model 1 - # Run more iterations of training for Model 1, since it is - # faster than Model 2 - ibm1 = IBMModel1(sentence_aligned_corpus, 2 * iterations) - self.translation_table = ibm1.translation_table - self.set_uniform_probabilities(sentence_aligned_corpus) - else: - # Set user-defined probabilities - self.translation_table = probability_tables["translation_table"] - self.alignment_table = probability_tables["alignment_table"] - - for n in range(0, iterations): - self.train(sentence_aligned_corpus) - - self.align_all(sentence_aligned_corpus) - - def set_uniform_probabilities(self, sentence_aligned_corpus): - # a(i | j,l,m) = 1 / (l+1) for all i, j, l, m - l_m_combinations = set() - for aligned_sentence in sentence_aligned_corpus: - l = len(aligned_sentence.mots) - m = len(aligned_sentence.words) - if (l, m) not in l_m_combinations: - l_m_combinations.add((l, m)) - initial_prob = 1 / (l + 1) - if initial_prob < IBMModel.MIN_PROB: - warnings.warn( - "A source sentence is too long (" - + str(l) - + " words). Results may be less accurate." - ) - - for i in range(0, l + 1): - for j in range(1, m + 1): - self.alignment_table[i][j][l][m] = initial_prob - - def train(self, parallel_corpus): - counts = Model2Counts() - for aligned_sentence in parallel_corpus: - src_sentence = [None] + aligned_sentence.mots - trg_sentence = ["UNUSED"] + aligned_sentence.words # 1-indexed - l = len(aligned_sentence.mots) - m = len(aligned_sentence.words) - - # E step (a): Compute normalization factors to weigh counts - total_count = self.prob_all_alignments(src_sentence, trg_sentence) - - # E step (b): Collect counts - for j in range(1, m + 1): - t = trg_sentence[j] - for i in range(0, l + 1): - s = src_sentence[i] - count = self.prob_alignment_point(i, j, src_sentence, trg_sentence) - normalized_count = count / total_count[t] - - counts.update_lexical_translation(normalized_count, s, t) - counts.update_alignment(normalized_count, i, j, l, m) - - # M step: Update probabilities with maximum likelihood estimates - self.maximize_lexical_translation_probabilities(counts) - self.maximize_alignment_probabilities(counts) - - def maximize_alignment_probabilities(self, counts): - MIN_PROB = IBMModel.MIN_PROB - for i, j_s in counts.alignment.items(): - for j, src_sentence_lengths in j_s.items(): - for l, trg_sentence_lengths in src_sentence_lengths.items(): - for m in trg_sentence_lengths: - estimate = ( - counts.alignment[i][j][l][m] - / counts.alignment_for_any_i[j][l][m] - ) - self.alignment_table[i][j][l][m] = max(estimate, MIN_PROB) - - def prob_all_alignments(self, src_sentence, trg_sentence): - """ - Computes the probability of all possible word alignments, - expressed as a marginal distribution over target words t - - Each entry in the return value represents the contribution to - the total alignment probability by the target word t. - - To obtain probability(alignment | src_sentence, trg_sentence), - simply sum the entries in the return value. 
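# A minimal sketch of the alignment M step performed by
# maximize_alignment_probabilities() above, with invented expected counts:
# a(i | j, l, m) is the count for source position i divided by the total count
# over all source positions for the same (j, l, m), clamped to a minimum.
MIN_PROB = 1e-12                            # stand-in for IBMModel.MIN_PROB
counts_for_i = {0: 1.0, 1: 16.0, 2: 3.0}    # expected counts for one (j, l, m)
total = sum(counts_for_i.values())
a_table = {i: max(c / total, MIN_PROB) for i, c in counts_for_i.items()}
print(a_table)   # {0: 0.05, 1: 0.8, 2: 0.15}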
- - :return: Probability of t for all s in ``src_sentence`` - :rtype: dict(str): float - """ - alignment_prob_for_t = defaultdict(lambda: 0.0) - for j in range(1, len(trg_sentence)): - t = trg_sentence[j] - for i in range(0, len(src_sentence)): - alignment_prob_for_t[t] += self.prob_alignment_point( - i, j, src_sentence, trg_sentence - ) - return alignment_prob_for_t - - def prob_alignment_point(self, i, j, src_sentence, trg_sentence): - """ - Probability that position j in ``trg_sentence`` is aligned to - position i in the ``src_sentence`` - """ - l = len(src_sentence) - 1 - m = len(trg_sentence) - 1 - s = src_sentence[i] - t = trg_sentence[j] - return self.translation_table[t][s] * self.alignment_table[i][j][l][m] - - def prob_t_a_given_s(self, alignment_info): - """ - Probability of target sentence and an alignment given the - source sentence - """ - prob = 1.0 - l = len(alignment_info.src_sentence) - 1 - m = len(alignment_info.trg_sentence) - 1 - - for j, i in enumerate(alignment_info.alignment): - if j == 0: - continue # skip the dummy zeroeth element - trg_word = alignment_info.trg_sentence[j] - src_word = alignment_info.src_sentence[i] - prob *= ( - self.translation_table[trg_word][src_word] - * self.alignment_table[i][j][l][m] - ) - - return max(prob, IBMModel.MIN_PROB) - - def align_all(self, parallel_corpus): - for sentence_pair in parallel_corpus: - self.align(sentence_pair) - - def align(self, sentence_pair): - """ - Determines the best word alignment for one sentence pair from - the corpus that the model was trained on. - - The best alignment will be set in ``sentence_pair`` when the - method returns. In contrast with the internal implementation of - IBM models, the word indices in the ``Alignment`` are zero- - indexed, not one-indexed. - - :param sentence_pair: A sentence in the source language and its - counterpart sentence in the target language - :type sentence_pair: AlignedSent - """ - best_alignment = [] - - l = len(sentence_pair.mots) - m = len(sentence_pair.words) - - for j, trg_word in enumerate(sentence_pair.words): - # Initialize trg_word to align with the NULL token - best_prob = ( - self.translation_table[trg_word][None] - * self.alignment_table[0][j + 1][l][m] - ) - best_prob = max(best_prob, IBMModel.MIN_PROB) - best_alignment_point = None - for i, src_word in enumerate(sentence_pair.mots): - align_prob = ( - self.translation_table[trg_word][src_word] - * self.alignment_table[i + 1][j + 1][l][m] - ) - if align_prob >= best_prob: - best_prob = align_prob - best_alignment_point = i - - best_alignment.append((j, best_alignment_point)) - - sentence_pair.alignment = Alignment(best_alignment) - - -class Model2Counts(Counts): - """ - Data object to store counts of various parameters during training. - Includes counts for alignment. 
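# A minimal sketch of the per-word decoding rule used by align() above: for
# the target word at position j, pick the source position i that maximises
# t(t | s_i) * a(i | j, l, m), with None standing for alignment to NULL.
# The candidate scores below are invented for illustration.
candidates = {None: 0.05 * 0.10, 0: 0.60 * 0.50, 1: 0.20 * 0.30}
best_i = max(candidates, key=lambda i: candidates[i])
print(best_i, candidates[best_i])   # 0 0.3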
- """ - - def __init__(self): - super().__init__() - self.alignment = defaultdict( - lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0))) - ) - self.alignment_for_any_i = defaultdict( - lambda: defaultdict(lambda: defaultdict(lambda: 0.0)) - ) - - def update_lexical_translation(self, count, s, t): - self.t_given_s[t][s] += count - self.any_t_given_s[s] += count - - def update_alignment(self, count, i, j, l, m): - self.alignment[i][j][l][m] += count - self.alignment_for_any_i[j][l][m] += count diff --git a/pipeline/nltk/translate/ibm3.py b/pipeline/nltk/translate/ibm3.py deleted file mode 100644 index f295dee0b563bbcb9a5b9557c8d1602942a75bc3..0000000000000000000000000000000000000000 --- a/pipeline/nltk/translate/ibm3.py +++ /dev/null @@ -1,346 +0,0 @@ -# Natural Language Toolkit: IBM Model 3 -# -# Copyright (C) 2001-2013 NLTK Project -# Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim -# URL: -# For license information, see LICENSE.TXT - -""" -Translation model that considers how a word can be aligned to -multiple words in another language. - -IBM Model 3 improves on Model 2 by directly modeling the phenomenon -where a word in one language may be translated into zero or more words -in another. This is expressed by the fertility probability, -n(phi | source word). - -If a source word translates into more than one word, it is possible to -generate sentences that have the same alignment in multiple ways. This -is modeled by a distortion step. The distortion probability, d(j|i,l,m), -predicts a target word position, given its aligned source word's -position. The distortion probability replaces the alignment probability -of Model 2. - -The fertility probability is not applicable for NULL. Target words that -align to NULL are assumed to be distributed uniformly in the target -sentence. The existence of these words is modeled by p1, the probability -that a target word produced by a real source word requires another -target word that is produced by NULL. - -The EM algorithm used in Model 3 is: - -:E step: In the training data, collect counts, weighted by prior - probabilities. - - - (a) count how many times a source language word is translated - into a target language word - - (b) count how many times a particular position in the target - sentence is aligned to a particular position in the source - sentence - - (c) count how many times a source word is aligned to phi number - of target words - - (d) count how many times NULL is aligned to a target word - -:M step: Estimate new probabilities based on the counts from the E step - -Because there are too many possible alignments, only the most probable -ones are considered. First, the best alignment is determined using prior -probabilities. Then, a hill climbing approach is used to find other good -candidates. - -Notations ---------- - -:i: Position in the source sentence - Valid values are 0 (for NULL), 1, 2, ..., length of source sentence -:j: Position in the target sentence - Valid values are 1, 2, ..., length of target sentence -:l: Number of words in the source sentence, excluding NULL -:m: Number of words in the target sentence -:s: A word in the source language -:t: A word in the target language -:phi: Fertility, the number of target words produced by a source word -:p1: Probability that a target word produced by a source word is - accompanied by another target word that is aligned to NULL -:p0: 1 - p1 - -References ----------- - -Philipp Koehn. 2010. Statistical Machine Translation. 
-Cambridge University Press, New York. - -Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and -Robert L. Mercer. 1993. The Mathematics of Statistical Machine -Translation: Parameter Estimation. Computational Linguistics, 19 (2), -263-311. -""" - -import warnings -from collections import defaultdict -from math import factorial - -from nltk.translate import AlignedSent, Alignment, IBMModel, IBMModel2 -from nltk.translate.ibm_model import Counts - - -class IBMModel3(IBMModel): - """ - Translation model that considers how a word can be aligned to - multiple words in another language - - >>> bitext = [] - >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small'])) - >>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big'])) - >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small'])) - >>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small'])) - >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house'])) - >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book'])) - >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book'])) - >>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book'])) - >>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize'])) - - >>> ibm3 = IBMModel3(bitext, 5) - - >>> print(round(ibm3.translation_table['buch']['book'], 3)) - 1.0 - >>> print(round(ibm3.translation_table['das']['book'], 3)) - 0.0 - >>> print(round(ibm3.translation_table['ja'][None], 3)) - 1.0 - - >>> print(round(ibm3.distortion_table[1][1][2][2], 3)) - 1.0 - >>> print(round(ibm3.distortion_table[1][2][2][2], 3)) - 0.0 - >>> print(round(ibm3.distortion_table[2][2][4][5], 3)) - 0.75 - - >>> print(round(ibm3.fertility_table[2]['summarize'], 3)) - 1.0 - >>> print(round(ibm3.fertility_table[1]['book'], 3)) - 1.0 - - >>> print(round(ibm3.p1, 3)) - 0.054 - - >>> test_sentence = bitext[2] - >>> test_sentence.words - ['das', 'buch', 'ist', 'ja', 'klein'] - >>> test_sentence.mots - ['the', 'book', 'is', 'small'] - >>> test_sentence.alignment - Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)]) - - """ - - def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None): - """ - Train on ``sentence_aligned_corpus`` and create a lexical - translation model, a distortion model, a fertility model, and a - model for generating NULL-aligned words. - - Translation direction is from ``AlignedSent.mots`` to - ``AlignedSent.words``. - - :param sentence_aligned_corpus: Sentence-aligned parallel corpus - :type sentence_aligned_corpus: list(AlignedSent) - - :param iterations: Number of iterations to run training algorithm - :type iterations: int - - :param probability_tables: Optional. Use this to pass in custom - probability values. If not specified, probabilities will be - set to a uniform distribution, or some other sensible value. - If specified, all the following entries must be present: - ``translation_table``, ``alignment_table``, - ``fertility_table``, ``p1``, ``distortion_table``. - See ``IBMModel`` for the type and purpose of these tables. 
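# A minimal sketch of the fertility contribution that Model 3 adds: a source
# word with fertility phi contributes phi! * n(phi | s) to the sentence
# probability (the factorial accounts for the possible orderings of its
# tablet), as used further down in prob_t_a_given_s. The n(phi | s) value
# below is invented.
from math import factorial

def fertility_term(phi, n_phi_given_s):
    return factorial(phi) * n_phi_given_s

print(fertility_term(2, 0.1))   # 2! * 0.1 = 0.2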
- :type probability_tables: dict[str]: object - """ - super().__init__(sentence_aligned_corpus) - self.reset_probabilities() - - if probability_tables is None: - # Get translation and alignment probabilities from IBM Model 2 - ibm2 = IBMModel2(sentence_aligned_corpus, iterations) - self.translation_table = ibm2.translation_table - self.alignment_table = ibm2.alignment_table - self.set_uniform_probabilities(sentence_aligned_corpus) - else: - # Set user-defined probabilities - self.translation_table = probability_tables["translation_table"] - self.alignment_table = probability_tables["alignment_table"] - self.fertility_table = probability_tables["fertility_table"] - self.p1 = probability_tables["p1"] - self.distortion_table = probability_tables["distortion_table"] - - for n in range(0, iterations): - self.train(sentence_aligned_corpus) - - def reset_probabilities(self): - super().reset_probabilities() - self.distortion_table = defaultdict( - lambda: defaultdict( - lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB)) - ) - ) - """ - dict[int][int][int][int]: float. Probability(j | i,l,m). - Values accessed as ``distortion_table[j][i][l][m]``. - """ - - def set_uniform_probabilities(self, sentence_aligned_corpus): - # d(j | i,l,m) = 1 / m for all i, j, l, m - l_m_combinations = set() - for aligned_sentence in sentence_aligned_corpus: - l = len(aligned_sentence.mots) - m = len(aligned_sentence.words) - if (l, m) not in l_m_combinations: - l_m_combinations.add((l, m)) - initial_prob = 1 / m - if initial_prob < IBMModel.MIN_PROB: - warnings.warn( - "A target sentence is too long (" - + str(m) - + " words). Results may be less accurate." - ) - for j in range(1, m + 1): - for i in range(0, l + 1): - self.distortion_table[j][i][l][m] = initial_prob - - # simple initialization, taken from GIZA++ - self.fertility_table[0] = defaultdict(lambda: 0.2) - self.fertility_table[1] = defaultdict(lambda: 0.65) - self.fertility_table[2] = defaultdict(lambda: 0.1) - self.fertility_table[3] = defaultdict(lambda: 0.04) - MAX_FERTILITY = 10 - initial_fert_prob = 0.01 / (MAX_FERTILITY - 4) - for phi in range(4, MAX_FERTILITY): - self.fertility_table[phi] = defaultdict(lambda: initial_fert_prob) - - self.p1 = 0.5 - - def train(self, parallel_corpus): - counts = Model3Counts() - for aligned_sentence in parallel_corpus: - l = len(aligned_sentence.mots) - m = len(aligned_sentence.words) - - # Sample the alignment space - sampled_alignments, best_alignment = self.sample(aligned_sentence) - # Record the most probable alignment - aligned_sentence.alignment = Alignment( - best_alignment.zero_indexed_alignment() - ) - - # E step (a): Compute normalization factors to weigh counts - total_count = self.prob_of_alignments(sampled_alignments) - - # E step (b): Collect counts - for alignment_info in sampled_alignments: - count = self.prob_t_a_given_s(alignment_info) - normalized_count = count / total_count - - for j in range(1, m + 1): - counts.update_lexical_translation( - normalized_count, alignment_info, j - ) - counts.update_distortion(normalized_count, alignment_info, j, l, m) - - counts.update_null_generation(normalized_count, alignment_info) - counts.update_fertility(normalized_count, alignment_info) - - # M step: Update probabilities with maximum likelihood estimates - # If any probability is less than MIN_PROB, clamp it to MIN_PROB - existing_alignment_table = self.alignment_table - self.reset_probabilities() - self.alignment_table = existing_alignment_table # don't retrain - - 
self.maximize_lexical_translation_probabilities(counts) - self.maximize_distortion_probabilities(counts) - self.maximize_fertility_probabilities(counts) - self.maximize_null_generation_probabilities(counts) - - def maximize_distortion_probabilities(self, counts): - MIN_PROB = IBMModel.MIN_PROB - for j, i_s in counts.distortion.items(): - for i, src_sentence_lengths in i_s.items(): - for l, trg_sentence_lengths in src_sentence_lengths.items(): - for m in trg_sentence_lengths: - estimate = ( - counts.distortion[j][i][l][m] - / counts.distortion_for_any_j[i][l][m] - ) - self.distortion_table[j][i][l][m] = max(estimate, MIN_PROB) - - def prob_t_a_given_s(self, alignment_info): - """ - Probability of target sentence and an alignment given the - source sentence - """ - src_sentence = alignment_info.src_sentence - trg_sentence = alignment_info.trg_sentence - l = len(src_sentence) - 1 # exclude NULL - m = len(trg_sentence) - 1 - p1 = self.p1 - p0 = 1 - p1 - - probability = 1.0 - MIN_PROB = IBMModel.MIN_PROB - - # Combine NULL insertion probability - null_fertility = alignment_info.fertility_of_i(0) - probability *= pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility) - if probability < MIN_PROB: - return MIN_PROB - - # Compute combination (m - null_fertility) choose null_fertility - for i in range(1, null_fertility + 1): - probability *= (m - null_fertility - i + 1) / i - if probability < MIN_PROB: - return MIN_PROB - - # Combine fertility probabilities - for i in range(1, l + 1): - fertility = alignment_info.fertility_of_i(i) - probability *= ( - factorial(fertility) * self.fertility_table[fertility][src_sentence[i]] - ) - if probability < MIN_PROB: - return MIN_PROB - - # Combine lexical and distortion probabilities - for j in range(1, m + 1): - t = trg_sentence[j] - i = alignment_info.alignment[j] - s = src_sentence[i] - - probability *= ( - self.translation_table[t][s] * self.distortion_table[j][i][l][m] - ) - if probability < MIN_PROB: - return MIN_PROB - - return probability - - -class Model3Counts(Counts): - """ - Data object to store counts of various parameters during training. - Includes counts for distortion. - """ - - def __init__(self): - super().__init__() - self.distortion = defaultdict( - lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 0.0))) - ) - self.distortion_for_any_j = defaultdict( - lambda: defaultdict(lambda: defaultdict(lambda: 0.0)) - ) - - def update_distortion(self, count, alignment_info, j, l, m): - i = alignment_info.alignment[j] - self.distortion[j][i][l][m] += count - self.distortion_for_any_j[i][l][m] += count diff --git a/pipeline/nltk/translate/ibm4.py b/pipeline/nltk/translate/ibm4.py deleted file mode 100644 index c7686939ac5027d6e16147cc82611cd4519ea51e..0000000000000000000000000000000000000000 --- a/pipeline/nltk/translate/ibm4.py +++ /dev/null @@ -1,490 +0,0 @@ -# Natural Language Toolkit: IBM Model 4 -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Tah Wei Hoon -# URL: -# For license information, see LICENSE.TXT - -""" -Translation model that reorders output words based on their type and -distance from other related words in the output sentence. - -IBM Model 4 improves the distortion model of Model 3, motivated by the -observation that certain words tend to be re-ordered in a predictable -way relative to one another. For example, in English -usually has its order flipped as in French. - -Model 4 requires words in the source and target vocabularies to be -categorized into classes. 
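# A minimal sketch of the NULL-insertion term used in Model 3's
# prob_t_a_given_s above: with phi0 target words generated by NULL out of m
# target words, the term is C(m - phi0, phi0) * p1**phi0 * p0**(m - 2*phi0),
# which the code above builds up with a running product. Numbers are invented.
from math import comb

def null_insertion_term(m, phi0, p1):
    p0 = 1 - p1
    return comb(m - phi0, phi0) * (p1 ** phi0) * (p0 ** (m - 2 * phi0))

print(null_insertion_term(m=6, phi0=1, p1=0.1))   # 5 * 0.1 * 0.9**4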
This can be linguistically driven, like parts -of speech (adjective, nouns, prepositions, etc). Word classes can also -be obtained by statistical methods. The original IBM Model 4 uses an -information theoretic approach to group words into 50 classes for each -vocabulary. - -Terminology ------------ - -:Cept: - A source word with non-zero fertility i.e. aligned to one or more - target words. -:Tablet: - The set of target word(s) aligned to a cept. -:Head of cept: - The first word of the tablet of that cept. -:Center of cept: - The average position of the words in that cept's tablet. If the - value is not an integer, the ceiling is taken. - For example, for a tablet with words in positions 2, 5, 6 in the - target sentence, the center of the corresponding cept is - ceil((2 + 5 + 6) / 3) = 5 -:Displacement: - For a head word, defined as (position of head word - position of - previous cept's center). Can be positive or negative. - For a non-head word, defined as (position of non-head word - - position of previous word in the same tablet). Always positive, - because successive words in a tablet are assumed to appear to the - right of the previous word. - -In contrast to Model 3 which reorders words in a tablet independently of -other words, Model 4 distinguishes between three cases. - -1. Words generated by NULL are distributed uniformly. -2. For a head word t, its position is modeled by the probability - d_head(displacement | word_class_s(s),word_class_t(t)), - where s is the previous cept, and word_class_s and word_class_t maps - s and t to a source and target language word class respectively. -3. For a non-head word t, its position is modeled by the probability - d_non_head(displacement | word_class_t(t)) - -The EM algorithm used in Model 4 is: - -:E step: In the training data, collect counts, weighted by prior - probabilities. - - - (a) count how many times a source language word is translated - into a target language word - - (b) for a particular word class, count how many times a head - word is located at a particular displacement from the - previous cept's center - - (c) for a particular word class, count how many times a - non-head word is located at a particular displacement from - the previous target word - - (d) count how many times a source word is aligned to phi number - of target words - - (e) count how many times NULL is aligned to a target word - -:M step: Estimate new probabilities based on the counts from the E step - -Like Model 3, there are too many possible alignments to consider. Thus, -a hill climbing approach is used to sample good candidates. - -Notations ---------- - -:i: Position in the source sentence - Valid values are 0 (for NULL), 1, 2, ..., length of source sentence -:j: Position in the target sentence - Valid values are 1, 2, ..., length of target sentence -:l: Number of words in the source sentence, excluding NULL -:m: Number of words in the target sentence -:s: A word in the source language -:t: A word in the target language -:phi: Fertility, the number of target words produced by a source word -:p1: Probability that a target word produced by a source word is - accompanied by another target word that is aligned to NULL -:p0: 1 - p1 -:dj: Displacement, Δj - -References ----------- - -Philipp Koehn. 2010. Statistical Machine Translation. -Cambridge University Press, New York. - -Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and -Robert L. Mercer. 1993. The Mathematics of Statistical Machine -Translation: Parameter Estimation. 
Computational Linguistics, 19 (2), -263-311. -""" - -import warnings -from collections import defaultdict -from math import factorial - -from nltk.translate import AlignedSent, Alignment, IBMModel, IBMModel3 -from nltk.translate.ibm_model import Counts, longest_target_sentence_length - - -class IBMModel4(IBMModel): - """ - Translation model that reorders output words based on their type and - their distance from other related words in the output sentence - - >>> bitext = [] - >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small'])) - >>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big'])) - >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small'])) - >>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small'])) - >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house'])) - >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book'])) - >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book'])) - >>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book'])) - >>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize'])) - >>> src_classes = {'the': 0, 'a': 0, 'small': 1, 'big': 1, 'house': 2, 'book': 2, 'is': 3, 'was': 3, 'i': 4, 'summarize': 5 } - >>> trg_classes = {'das': 0, 'ein': 0, 'haus': 1, 'buch': 1, 'klein': 2, 'groß': 2, 'ist': 3, 'war': 3, 'ja': 4, 'ich': 5, 'fasse': 6, 'zusammen': 6 } - - >>> ibm4 = IBMModel4(bitext, 5, src_classes, trg_classes) - - >>> print(round(ibm4.translation_table['buch']['book'], 3)) - 1.0 - >>> print(round(ibm4.translation_table['das']['book'], 3)) - 0.0 - >>> print(round(ibm4.translation_table['ja'][None], 3)) - 1.0 - - >>> print(round(ibm4.head_distortion_table[1][0][1], 3)) - 1.0 - >>> print(round(ibm4.head_distortion_table[2][0][1], 3)) - 0.0 - >>> print(round(ibm4.non_head_distortion_table[3][6], 3)) - 0.5 - - >>> print(round(ibm4.fertility_table[2]['summarize'], 3)) - 1.0 - >>> print(round(ibm4.fertility_table[1]['book'], 3)) - 1.0 - - >>> print(round(ibm4.p1, 3)) - 0.033 - - >>> test_sentence = bitext[2] - >>> test_sentence.words - ['das', 'buch', 'ist', 'ja', 'klein'] - >>> test_sentence.mots - ['the', 'book', 'is', 'small'] - >>> test_sentence.alignment - Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)]) - - """ - - def __init__( - self, - sentence_aligned_corpus, - iterations, - source_word_classes, - target_word_classes, - probability_tables=None, - ): - """ - Train on ``sentence_aligned_corpus`` and create a lexical - translation model, distortion models, a fertility model, and a - model for generating NULL-aligned words. - - Translation direction is from ``AlignedSent.mots`` to - ``AlignedSent.words``. - - :param sentence_aligned_corpus: Sentence-aligned parallel corpus - :type sentence_aligned_corpus: list(AlignedSent) - - :param iterations: Number of iterations to run training algorithm - :type iterations: int - - :param source_word_classes: Lookup table that maps a source word - to its word class, the latter represented by an integer id - :type source_word_classes: dict[str]: int - - :param target_word_classes: Lookup table that maps a target word - to its word class, the latter represented by an integer id - :type target_word_classes: dict[str]: int - - :param probability_tables: Optional. Use this to pass in custom - probability values. 
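# A minimal sketch of the Model 4 displacement bookkeeping described in the
# Terminology section above, using an invented tablet layout.
from math import ceil

tablet_of_previous_cept = [2, 5, 6]   # target positions aligned to the previous cept
center = ceil(sum(tablet_of_previous_cept) / len(tablet_of_previous_cept))
print(center)                         # ceil(13 / 3) = 5, as in the example above

head_position = 7
print(head_position - center)         # head-word displacement dj = +2

non_head_position, previous_word_in_tablet = 9, 7
print(non_head_position - previous_word_in_tablet)   # non-head displacement = +2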
If not specified, probabilities will be - set to a uniform distribution, or some other sensible value. - If specified, all the following entries must be present: - ``translation_table``, ``alignment_table``, - ``fertility_table``, ``p1``, ``head_distortion_table``, - ``non_head_distortion_table``. See ``IBMModel`` and - ``IBMModel4`` for the type and purpose of these tables. - :type probability_tables: dict[str]: object - """ - super().__init__(sentence_aligned_corpus) - self.reset_probabilities() - self.src_classes = source_word_classes - self.trg_classes = target_word_classes - - if probability_tables is None: - # Get probabilities from IBM model 3 - ibm3 = IBMModel3(sentence_aligned_corpus, iterations) - self.translation_table = ibm3.translation_table - self.alignment_table = ibm3.alignment_table - self.fertility_table = ibm3.fertility_table - self.p1 = ibm3.p1 - self.set_uniform_probabilities(sentence_aligned_corpus) - else: - # Set user-defined probabilities - self.translation_table = probability_tables["translation_table"] - self.alignment_table = probability_tables["alignment_table"] - self.fertility_table = probability_tables["fertility_table"] - self.p1 = probability_tables["p1"] - self.head_distortion_table = probability_tables["head_distortion_table"] - self.non_head_distortion_table = probability_tables[ - "non_head_distortion_table" - ] - - for n in range(0, iterations): - self.train(sentence_aligned_corpus) - - def reset_probabilities(self): - super().reset_probabilities() - self.head_distortion_table = defaultdict( - lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB)) - ) - """ - dict[int][int][int]: float. Probability(displacement of head - word | word class of previous cept,target word class). - Values accessed as ``distortion_table[dj][src_class][trg_class]``. - """ - - self.non_head_distortion_table = defaultdict( - lambda: defaultdict(lambda: self.MIN_PROB) - ) - """ - dict[int][int]: float. Probability(displacement of non-head - word | target word class). - Values accessed as ``distortion_table[dj][trg_class]``. - """ - - def set_uniform_probabilities(self, sentence_aligned_corpus): - """ - Set distortion probabilities uniformly to - 1 / cardinality of displacement values - """ - max_m = longest_target_sentence_length(sentence_aligned_corpus) - - # The maximum displacement is m-1, when a word is in the last - # position m of the target sentence and the previously placed - # word is in the first position. - # Conversely, the minimum displacement is -(m-1). - # Thus, the displacement range is (m-1) - (-(m-1)). Note that - # displacement cannot be zero and is not included in the range. - if max_m <= 1: - initial_prob = IBMModel.MIN_PROB - else: - initial_prob = 1 / (2 * (max_m - 1)) - if initial_prob < IBMModel.MIN_PROB: - warnings.warn( - "A target sentence is too long (" - + str(max_m) - + " words). Results may be less accurate." 
- ) - - for dj in range(1, max_m): - self.head_distortion_table[dj] = defaultdict( - lambda: defaultdict(lambda: initial_prob) - ) - self.head_distortion_table[-dj] = defaultdict( - lambda: defaultdict(lambda: initial_prob) - ) - self.non_head_distortion_table[dj] = defaultdict(lambda: initial_prob) - self.non_head_distortion_table[-dj] = defaultdict(lambda: initial_prob) - - def train(self, parallel_corpus): - counts = Model4Counts() - for aligned_sentence in parallel_corpus: - m = len(aligned_sentence.words) - - # Sample the alignment space - sampled_alignments, best_alignment = self.sample(aligned_sentence) - # Record the most probable alignment - aligned_sentence.alignment = Alignment( - best_alignment.zero_indexed_alignment() - ) - - # E step (a): Compute normalization factors to weigh counts - total_count = self.prob_of_alignments(sampled_alignments) - - # E step (b): Collect counts - for alignment_info in sampled_alignments: - count = self.prob_t_a_given_s(alignment_info) - normalized_count = count / total_count - - for j in range(1, m + 1): - counts.update_lexical_translation( - normalized_count, alignment_info, j - ) - counts.update_distortion( - normalized_count, - alignment_info, - j, - self.src_classes, - self.trg_classes, - ) - - counts.update_null_generation(normalized_count, alignment_info) - counts.update_fertility(normalized_count, alignment_info) - - # M step: Update probabilities with maximum likelihood estimates - # If any probability is less than MIN_PROB, clamp it to MIN_PROB - existing_alignment_table = self.alignment_table - self.reset_probabilities() - self.alignment_table = existing_alignment_table # don't retrain - - self.maximize_lexical_translation_probabilities(counts) - self.maximize_distortion_probabilities(counts) - self.maximize_fertility_probabilities(counts) - self.maximize_null_generation_probabilities(counts) - - def maximize_distortion_probabilities(self, counts): - head_d_table = self.head_distortion_table - for dj, src_classes in counts.head_distortion.items(): - for s_cls, trg_classes in src_classes.items(): - for t_cls in trg_classes: - estimate = ( - counts.head_distortion[dj][s_cls][t_cls] - / counts.head_distortion_for_any_dj[s_cls][t_cls] - ) - head_d_table[dj][s_cls][t_cls] = max(estimate, IBMModel.MIN_PROB) - - non_head_d_table = self.non_head_distortion_table - for dj, trg_classes in counts.non_head_distortion.items(): - for t_cls in trg_classes: - estimate = ( - counts.non_head_distortion[dj][t_cls] - / counts.non_head_distortion_for_any_dj[t_cls] - ) - non_head_d_table[dj][t_cls] = max(estimate, IBMModel.MIN_PROB) - - def prob_t_a_given_s(self, alignment_info): - """ - Probability of target sentence and an alignment given the - source sentence - """ - return IBMModel4.model4_prob_t_a_given_s(alignment_info, self) - - @staticmethod # exposed for Model 5 to use - def model4_prob_t_a_given_s(alignment_info, ibm_model): - probability = 1.0 - MIN_PROB = IBMModel.MIN_PROB - - def null_generation_term(): - # Binomial distribution: B(m - null_fertility, p1) - value = 1.0 - p1 = ibm_model.p1 - p0 = 1 - p1 - null_fertility = alignment_info.fertility_of_i(0) - m = len(alignment_info.trg_sentence) - 1 - value *= pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility) - if value < MIN_PROB: - return MIN_PROB - - # Combination: (m - null_fertility) choose null_fertility - for i in range(1, null_fertility + 1): - value *= (m - null_fertility - i + 1) / i - return value - - def fertility_term(): - value = 1.0 - src_sentence = 
alignment_info.src_sentence - for i in range(1, len(src_sentence)): - fertility = alignment_info.fertility_of_i(i) - value *= ( - factorial(fertility) - * ibm_model.fertility_table[fertility][src_sentence[i]] - ) - if value < MIN_PROB: - return MIN_PROB - return value - - def lexical_translation_term(j): - t = alignment_info.trg_sentence[j] - i = alignment_info.alignment[j] - s = alignment_info.src_sentence[i] - return ibm_model.translation_table[t][s] - - def distortion_term(j): - t = alignment_info.trg_sentence[j] - i = alignment_info.alignment[j] - if i == 0: - # case 1: t is aligned to NULL - return 1.0 - if alignment_info.is_head_word(j): - # case 2: t is the first word of a tablet - previous_cept = alignment_info.previous_cept(j) - src_class = None - if previous_cept is not None: - previous_s = alignment_info.src_sentence[previous_cept] - src_class = ibm_model.src_classes[previous_s] - trg_class = ibm_model.trg_classes[t] - dj = j - alignment_info.center_of_cept(previous_cept) - return ibm_model.head_distortion_table[dj][src_class][trg_class] - - # case 3: t is a subsequent word of a tablet - previous_position = alignment_info.previous_in_tablet(j) - trg_class = ibm_model.trg_classes[t] - dj = j - previous_position - return ibm_model.non_head_distortion_table[dj][trg_class] - - # end nested functions - - # Abort computation whenever probability falls below MIN_PROB at - # any point, since MIN_PROB can be considered as zero - probability *= null_generation_term() - if probability < MIN_PROB: - return MIN_PROB - - probability *= fertility_term() - if probability < MIN_PROB: - return MIN_PROB - - for j in range(1, len(alignment_info.trg_sentence)): - probability *= lexical_translation_term(j) - if probability < MIN_PROB: - return MIN_PROB - - probability *= distortion_term(j) - if probability < MIN_PROB: - return MIN_PROB - - return probability - - -class Model4Counts(Counts): - """ - Data object to store counts of various parameters during training. - Includes counts for distortion. 
- """ - - def __init__(self): - super().__init__() - self.head_distortion = defaultdict( - lambda: defaultdict(lambda: defaultdict(lambda: 0.0)) - ) - self.head_distortion_for_any_dj = defaultdict(lambda: defaultdict(lambda: 0.0)) - self.non_head_distortion = defaultdict(lambda: defaultdict(lambda: 0.0)) - self.non_head_distortion_for_any_dj = defaultdict(lambda: 0.0) - - def update_distortion(self, count, alignment_info, j, src_classes, trg_classes): - i = alignment_info.alignment[j] - t = alignment_info.trg_sentence[j] - if i == 0: - # case 1: t is aligned to NULL - pass - elif alignment_info.is_head_word(j): - # case 2: t is the first word of a tablet - previous_cept = alignment_info.previous_cept(j) - if previous_cept is not None: - previous_src_word = alignment_info.src_sentence[previous_cept] - src_class = src_classes[previous_src_word] - else: - src_class = None - trg_class = trg_classes[t] - dj = j - alignment_info.center_of_cept(previous_cept) - self.head_distortion[dj][src_class][trg_class] += count - self.head_distortion_for_any_dj[src_class][trg_class] += count - else: - # case 3: t is a subsequent word of a tablet - previous_j = alignment_info.previous_in_tablet(j) - trg_class = trg_classes[t] - dj = j - previous_j - self.non_head_distortion[dj][trg_class] += count - self.non_head_distortion_for_any_dj[trg_class] += count diff --git a/pipeline/nltk/translate/ibm5.py b/pipeline/nltk/translate/ibm5.py deleted file mode 100644 index 98ed2ec0aec4535fd6b4e18abbf8ecd8f696a9e6..0000000000000000000000000000000000000000 --- a/pipeline/nltk/translate/ibm5.py +++ /dev/null @@ -1,663 +0,0 @@ -# Natural Language Toolkit: IBM Model 5 -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Tah Wei Hoon -# URL: -# For license information, see LICENSE.TXT - -""" -Translation model that keeps track of vacant positions in the target -sentence to decide where to place translated words. - -Translation can be viewed as a process where each word in the source -sentence is stepped through sequentially, generating translated words -for each source word. The target sentence can be viewed as being made -up of ``m`` empty slots initially, which gradually fill up as generated -words are placed in them. - -Models 3 and 4 use distortion probabilities to decide how to place -translated words. For simplicity, these models ignore the history of -which slots have already been occupied with translated words. -Consider the placement of the last translated word: there is only one -empty slot left in the target sentence, so the distortion probability -should be 1.0 for that position and 0.0 everywhere else. However, the -distortion probabilities for Models 3 and 4 are set up such that all -positions are under consideration. - -IBM Model 5 fixes this deficiency by accounting for occupied slots -during translation. It introduces the vacancy function v(j), the number -of vacancies up to, and including, position j in the target sentence. - -Terminology ------------ - -:Maximum vacancy: - The number of valid slots that a word can be placed in. - This is not necessarily the same as the number of vacant slots. - For example, if a tablet contains more than one word, the head word - cannot be placed at the last vacant slot because there will be no - space for the other words in the tablet. The number of valid slots - has to take into account the length of the tablet. - Non-head words cannot be placed before the head word, so vacancies - to the left of the head word are ignored. 
-:Vacancy difference: - For a head word: (v(j) - v(center of previous cept)) - Can be positive or negative. - For a non-head word: (v(j) - v(position of previously placed word)) - Always positive, because successive words in a tablet are assumed to - appear to the right of the previous word. - -Positioning of target words fall under three cases: - -1. Words generated by NULL are distributed uniformly -2. For a head word t, its position is modeled by the probability - v_head(dv | max_v,word_class_t(t)) -3. For a non-head word t, its position is modeled by the probability - v_non_head(dv | max_v,word_class_t(t)) - -dv and max_v are defined differently for head and non-head words. - -The EM algorithm used in Model 5 is: - -:E step: In the training data, collect counts, weighted by prior - probabilities. - - - (a) count how many times a source language word is translated - into a target language word - - (b) for a particular word class and maximum vacancy, count how - many times a head word and the previous cept's center have - a particular difference in number of vacancies - - (b) for a particular word class and maximum vacancy, count how - many times a non-head word and the previous target word - have a particular difference in number of vacancies - - (d) count how many times a source word is aligned to phi number - of target words - - (e) count how many times NULL is aligned to a target word - -:M step: Estimate new probabilities based on the counts from the E step - -Like Model 4, there are too many possible alignments to consider. Thus, -a hill climbing approach is used to sample good candidates. In addition, -pruning is used to weed out unlikely alignments based on Model 4 scores. - -Notations ---------- - -:i: Position in the source sentence - Valid values are 0 (for NULL), 1, 2, ..., length of source sentence -:j: Position in the target sentence - Valid values are 1, 2, ..., length of target sentence -:l: Number of words in the source sentence, excluding NULL -:m: Number of words in the target sentence -:s: A word in the source language -:t: A word in the target language -:phi: Fertility, the number of target words produced by a source word -:p1: Probability that a target word produced by a source word is - accompanied by another target word that is aligned to NULL -:p0: 1 - p1 -:max_v: Maximum vacancy -:dv: Vacancy difference, Δv - -The definition of v_head here differs from GIZA++, section 4.7 of -[Brown et al., 1993], and [Koehn, 2010]. In the latter cases, v_head is -v_head(v(j) | v(center of previous cept),max_v,word_class(t)). - -Here, we follow appendix B of [Brown et al., 1993] and combine v(j) with -v(center of previous cept) to obtain dv: -v_head(v(j) - v(center of previous cept) | max_v,word_class(t)). - -References ----------- - -Philipp Koehn. 2010. Statistical Machine Translation. -Cambridge University Press, New York. - -Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and -Robert L. Mercer. 1993. The Mathematics of Statistical Machine -Translation: Parameter Estimation. Computational Linguistics, 19 (2), -263-311. 
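# [Editor's illustrative sketch -- not part of the deleted NLTK file.] The vacancy
# function v(j) described above simply counts how many target positions up to and
# including j are still unoccupied. A minimal stand-alone version over a hypothetical
# 5-word target sentence (index 0 is a dummy so positions are 1-indexed, as in the models):
def vacancies_at(occupied, j):
    return sum(1 for k in range(1, j + 1) if not occupied[k])

occupied = [False, True, False, False, True, False]  # positions 1 and 4 already filled
assert vacancies_at(occupied, 5) == 3                # positions 2, 3 and 5 remain vacant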
-""" - -import warnings -from collections import defaultdict -from math import factorial - -from nltk.translate import AlignedSent, Alignment, IBMModel, IBMModel4 -from nltk.translate.ibm_model import Counts, longest_target_sentence_length - - -class IBMModel5(IBMModel): - """ - Translation model that keeps track of vacant positions in the target - sentence to decide where to place translated words - - >>> bitext = [] - >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small'])) - >>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big'])) - >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small'])) - >>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small'])) - >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house'])) - >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book'])) - >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book'])) - >>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book'])) - >>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize'])) - >>> src_classes = {'the': 0, 'a': 0, 'small': 1, 'big': 1, 'house': 2, 'book': 2, 'is': 3, 'was': 3, 'i': 4, 'summarize': 5 } - >>> trg_classes = {'das': 0, 'ein': 0, 'haus': 1, 'buch': 1, 'klein': 2, 'groß': 2, 'ist': 3, 'war': 3, 'ja': 4, 'ich': 5, 'fasse': 6, 'zusammen': 6 } - - >>> ibm5 = IBMModel5(bitext, 5, src_classes, trg_classes) - - >>> print(round(ibm5.head_vacancy_table[1][1][1], 3)) - 1.0 - >>> print(round(ibm5.head_vacancy_table[2][1][1], 3)) - 0.0 - >>> print(round(ibm5.non_head_vacancy_table[3][3][6], 3)) - 1.0 - - >>> print(round(ibm5.fertility_table[2]['summarize'], 3)) - 1.0 - >>> print(round(ibm5.fertility_table[1]['book'], 3)) - 1.0 - - >>> print(round(ibm5.p1, 3)) - 0.033 - - >>> test_sentence = bitext[2] - >>> test_sentence.words - ['das', 'buch', 'ist', 'ja', 'klein'] - >>> test_sentence.mots - ['the', 'book', 'is', 'small'] - >>> test_sentence.alignment - Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)]) - - """ - - MIN_SCORE_FACTOR = 0.2 - """ - Alignments with scores below this factor are pruned during sampling - """ - - def __init__( - self, - sentence_aligned_corpus, - iterations, - source_word_classes, - target_word_classes, - probability_tables=None, - ): - """ - Train on ``sentence_aligned_corpus`` and create a lexical - translation model, vacancy models, a fertility model, and a - model for generating NULL-aligned words. - - Translation direction is from ``AlignedSent.mots`` to - ``AlignedSent.words``. - - :param sentence_aligned_corpus: Sentence-aligned parallel corpus - :type sentence_aligned_corpus: list(AlignedSent) - - :param iterations: Number of iterations to run training algorithm - :type iterations: int - - :param source_word_classes: Lookup table that maps a source word - to its word class, the latter represented by an integer id - :type source_word_classes: dict[str]: int - - :param target_word_classes: Lookup table that maps a target word - to its word class, the latter represented by an integer id - :type target_word_classes: dict[str]: int - - :param probability_tables: Optional. Use this to pass in custom - probability values. If not specified, probabilities will be - set to a uniform distribution, or some other sensible value. 
- If specified, all the following entries must be present: - ``translation_table``, ``alignment_table``, - ``fertility_table``, ``p1``, ``head_distortion_table``, - ``non_head_distortion_table``, ``head_vacancy_table``, - ``non_head_vacancy_table``. See ``IBMModel``, ``IBMModel4``, - and ``IBMModel5`` for the type and purpose of these tables. - :type probability_tables: dict[str]: object - """ - super().__init__(sentence_aligned_corpus) - self.reset_probabilities() - self.src_classes = source_word_classes - self.trg_classes = target_word_classes - - if probability_tables is None: - # Get probabilities from IBM model 4 - ibm4 = IBMModel4( - sentence_aligned_corpus, - iterations, - source_word_classes, - target_word_classes, - ) - self.translation_table = ibm4.translation_table - self.alignment_table = ibm4.alignment_table - self.fertility_table = ibm4.fertility_table - self.p1 = ibm4.p1 - self.head_distortion_table = ibm4.head_distortion_table - self.non_head_distortion_table = ibm4.non_head_distortion_table - self.set_uniform_probabilities(sentence_aligned_corpus) - else: - # Set user-defined probabilities - self.translation_table = probability_tables["translation_table"] - self.alignment_table = probability_tables["alignment_table"] - self.fertility_table = probability_tables["fertility_table"] - self.p1 = probability_tables["p1"] - self.head_distortion_table = probability_tables["head_distortion_table"] - self.non_head_distortion_table = probability_tables[ - "non_head_distortion_table" - ] - self.head_vacancy_table = probability_tables["head_vacancy_table"] - self.non_head_vacancy_table = probability_tables["non_head_vacancy_table"] - - for n in range(0, iterations): - self.train(sentence_aligned_corpus) - - def reset_probabilities(self): - super().reset_probabilities() - self.head_vacancy_table = defaultdict( - lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB)) - ) - """ - dict[int][int][int]: float. Probability(vacancy difference | - number of remaining valid positions,target word class). - Values accessed as ``head_vacancy_table[dv][v_max][trg_class]``. - """ - - self.non_head_vacancy_table = defaultdict( - lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB)) - ) - """ - dict[int][int][int]: float. Probability(vacancy difference | - number of remaining valid positions,target word class). - Values accessed as ``non_head_vacancy_table[dv][v_max][trg_class]``. - """ - - def set_uniform_probabilities(self, sentence_aligned_corpus): - """ - Set vacancy probabilities uniformly to - 1 / cardinality of vacancy difference values - """ - max_m = longest_target_sentence_length(sentence_aligned_corpus) - - # The maximum vacancy difference occurs when a word is placed in - # the last available position m of the target sentence and the - # previous word position has no vacancies. - # The minimum is 1-max_v, when a word is placed in the first - # available position and the previous word is placed beyond the - # last available position. - # Thus, the number of possible vacancy difference values is - # (max_v) - (1-max_v) + 1 = 2 * max_v. - if max_m > 0 and (1 / (2 * max_m)) < IBMModel.MIN_PROB: - warnings.warn( - "A target sentence is too long (" - + str(max_m) - + " words). Results may be less accurate." 
- ) - - for max_v in range(1, max_m + 1): - for dv in range(1, max_m + 1): - initial_prob = 1 / (2 * max_v) - self.head_vacancy_table[dv][max_v] = defaultdict(lambda: initial_prob) - self.head_vacancy_table[-(dv - 1)][max_v] = defaultdict( - lambda: initial_prob - ) - self.non_head_vacancy_table[dv][max_v] = defaultdict( - lambda: initial_prob - ) - self.non_head_vacancy_table[-(dv - 1)][max_v] = defaultdict( - lambda: initial_prob - ) - - def train(self, parallel_corpus): - counts = Model5Counts() - for aligned_sentence in parallel_corpus: - l = len(aligned_sentence.mots) - m = len(aligned_sentence.words) - - # Sample the alignment space - sampled_alignments, best_alignment = self.sample(aligned_sentence) - # Record the most probable alignment - aligned_sentence.alignment = Alignment( - best_alignment.zero_indexed_alignment() - ) - - # E step (a): Compute normalization factors to weigh counts - total_count = self.prob_of_alignments(sampled_alignments) - - # E step (b): Collect counts - for alignment_info in sampled_alignments: - count = self.prob_t_a_given_s(alignment_info) - normalized_count = count / total_count - - for j in range(1, m + 1): - counts.update_lexical_translation( - normalized_count, alignment_info, j - ) - - slots = Slots(m) - for i in range(1, l + 1): - counts.update_vacancy( - normalized_count, alignment_info, i, self.trg_classes, slots - ) - - counts.update_null_generation(normalized_count, alignment_info) - counts.update_fertility(normalized_count, alignment_info) - - # M step: Update probabilities with maximum likelihood estimates - # If any probability is less than MIN_PROB, clamp it to MIN_PROB - existing_alignment_table = self.alignment_table - self.reset_probabilities() - self.alignment_table = existing_alignment_table # don't retrain - - self.maximize_lexical_translation_probabilities(counts) - self.maximize_vacancy_probabilities(counts) - self.maximize_fertility_probabilities(counts) - self.maximize_null_generation_probabilities(counts) - - def sample(self, sentence_pair): - """ - Sample the most probable alignments from the entire alignment - space according to Model 4 - - Note that Model 4 scoring is used instead of Model 5 because the - latter is too expensive to compute. - - First, determine the best alignment according to IBM Model 2. - With this initial alignment, use hill climbing to determine the - best alignment according to a IBM Model 4. Add this - alignment and its neighbors to the sample set. Repeat this - process with other initial alignments obtained by pegging an - alignment point. Finally, prune alignments that have - substantially lower Model 4 scores than the best alignment. 
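# [Editor's sketch with hypothetical scores -- not part of the deleted file.] The pruning
# step described above keeps only alignments whose Model 4 score exceeds
# MIN_SCORE_FACTOR (0.2) times the best score in the sample:
scores = {"a1": 0.10, "a2": 0.04, "a3": 0.015}
threshold = 0.2 * max(scores.values())                # 0.02
kept = {name for name, score in scores.items() if score > threshold}
assert kept == {"a1", "a2"}                           # "a3" falls below the threshold and is pruned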
- - :param sentence_pair: Source and target language sentence pair - to generate a sample of alignments from - :type sentence_pair: AlignedSent - - :return: A set of best alignments represented by their ``AlignmentInfo`` - and the best alignment of the set for convenience - :rtype: set(AlignmentInfo), AlignmentInfo - """ - sampled_alignments, best_alignment = super().sample(sentence_pair) - return self.prune(sampled_alignments), best_alignment - - def prune(self, alignment_infos): - """ - Removes alignments from ``alignment_infos`` that have - substantially lower Model 4 scores than the best alignment - - :return: Pruned alignments - :rtype: set(AlignmentInfo) - """ - alignments = [] - best_score = 0 - - for alignment_info in alignment_infos: - score = IBMModel4.model4_prob_t_a_given_s(alignment_info, self) - best_score = max(score, best_score) - alignments.append((alignment_info, score)) - - threshold = IBMModel5.MIN_SCORE_FACTOR * best_score - alignments = [a[0] for a in alignments if a[1] > threshold] - return set(alignments) - - def hillclimb(self, alignment_info, j_pegged=None): - """ - Starting from the alignment in ``alignment_info``, look at - neighboring alignments iteratively for the best one, according - to Model 4 - - Note that Model 4 scoring is used instead of Model 5 because the - latter is too expensive to compute. - - There is no guarantee that the best alignment in the alignment - space will be found, because the algorithm might be stuck in a - local maximum. - - :param j_pegged: If specified, the search will be constrained to - alignments where ``j_pegged`` remains unchanged - :type j_pegged: int - - :return: The best alignment found from hill climbing - :rtype: AlignmentInfo - """ - alignment = alignment_info # alias with shorter name - max_probability = IBMModel4.model4_prob_t_a_given_s(alignment, self) - - while True: - old_alignment = alignment - for neighbor_alignment in self.neighboring(alignment, j_pegged): - neighbor_probability = IBMModel4.model4_prob_t_a_given_s( - neighbor_alignment, self - ) - - if neighbor_probability > max_probability: - alignment = neighbor_alignment - max_probability = neighbor_probability - - if alignment == old_alignment: - # Until there are no better alignments - break - - alignment.score = max_probability - return alignment - - def prob_t_a_given_s(self, alignment_info): - """ - Probability of target sentence and an alignment given the - source sentence - """ - probability = 1.0 - MIN_PROB = IBMModel.MIN_PROB - slots = Slots(len(alignment_info.trg_sentence) - 1) - - def null_generation_term(): - # Binomial distribution: B(m - null_fertility, p1) - value = 1.0 - p1 = self.p1 - p0 = 1 - p1 - null_fertility = alignment_info.fertility_of_i(0) - m = len(alignment_info.trg_sentence) - 1 - value *= pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility) - if value < MIN_PROB: - return MIN_PROB - - # Combination: (m - null_fertility) choose null_fertility - for i in range(1, null_fertility + 1): - value *= (m - null_fertility - i + 1) / i - return value - - def fertility_term(): - value = 1.0 - src_sentence = alignment_info.src_sentence - for i in range(1, len(src_sentence)): - fertility = alignment_info.fertility_of_i(i) - value *= ( - factorial(fertility) - * self.fertility_table[fertility][src_sentence[i]] - ) - if value < MIN_PROB: - return MIN_PROB - return value - - def lexical_translation_term(j): - t = alignment_info.trg_sentence[j] - i = alignment_info.alignment[j] - s = alignment_info.src_sentence[i] - return 
self.translation_table[t][s] - - def vacancy_term(i): - value = 1.0 - tablet = alignment_info.cepts[i] - tablet_length = len(tablet) - total_vacancies = slots.vacancies_at(len(slots)) - - # case 1: NULL-aligned words - if tablet_length == 0: - return value - - # case 2: head word - j = tablet[0] - previous_cept = alignment_info.previous_cept(j) - previous_center = alignment_info.center_of_cept(previous_cept) - dv = slots.vacancies_at(j) - slots.vacancies_at(previous_center) - max_v = total_vacancies - tablet_length + 1 - trg_class = self.trg_classes[alignment_info.trg_sentence[j]] - value *= self.head_vacancy_table[dv][max_v][trg_class] - slots.occupy(j) # mark position as occupied - total_vacancies -= 1 - if value < MIN_PROB: - return MIN_PROB - - # case 3: non-head words - for k in range(1, tablet_length): - previous_position = tablet[k - 1] - previous_vacancies = slots.vacancies_at(previous_position) - j = tablet[k] - dv = slots.vacancies_at(j) - previous_vacancies - max_v = total_vacancies - tablet_length + k + 1 - previous_vacancies - trg_class = self.trg_classes[alignment_info.trg_sentence[j]] - value *= self.non_head_vacancy_table[dv][max_v][trg_class] - slots.occupy(j) # mark position as occupied - total_vacancies -= 1 - if value < MIN_PROB: - return MIN_PROB - - return value - - # end nested functions - - # Abort computation whenever probability falls below MIN_PROB at - # any point, since MIN_PROB can be considered as zero - probability *= null_generation_term() - if probability < MIN_PROB: - return MIN_PROB - - probability *= fertility_term() - if probability < MIN_PROB: - return MIN_PROB - - for j in range(1, len(alignment_info.trg_sentence)): - probability *= lexical_translation_term(j) - if probability < MIN_PROB: - return MIN_PROB - - for i in range(1, len(alignment_info.src_sentence)): - probability *= vacancy_term(i) - if probability < MIN_PROB: - return MIN_PROB - - return probability - - def maximize_vacancy_probabilities(self, counts): - MIN_PROB = IBMModel.MIN_PROB - head_vacancy_table = self.head_vacancy_table - for dv, max_vs in counts.head_vacancy.items(): - for max_v, trg_classes in max_vs.items(): - for t_cls in trg_classes: - estimate = ( - counts.head_vacancy[dv][max_v][t_cls] - / counts.head_vacancy_for_any_dv[max_v][t_cls] - ) - head_vacancy_table[dv][max_v][t_cls] = max(estimate, MIN_PROB) - - non_head_vacancy_table = self.non_head_vacancy_table - for dv, max_vs in counts.non_head_vacancy.items(): - for max_v, trg_classes in max_vs.items(): - for t_cls in trg_classes: - estimate = ( - counts.non_head_vacancy[dv][max_v][t_cls] - / counts.non_head_vacancy_for_any_dv[max_v][t_cls] - ) - non_head_vacancy_table[dv][max_v][t_cls] = max(estimate, MIN_PROB) - - -class Model5Counts(Counts): - """ - Data object to store counts of various parameters during training. - Includes counts for vacancies. 
- """ - - def __init__(self): - super().__init__() - self.head_vacancy = defaultdict( - lambda: defaultdict(lambda: defaultdict(lambda: 0.0)) - ) - self.head_vacancy_for_any_dv = defaultdict(lambda: defaultdict(lambda: 0.0)) - self.non_head_vacancy = defaultdict( - lambda: defaultdict(lambda: defaultdict(lambda: 0.0)) - ) - self.non_head_vacancy_for_any_dv = defaultdict(lambda: defaultdict(lambda: 0.0)) - - def update_vacancy(self, count, alignment_info, i, trg_classes, slots): - """ - :param count: Value to add to the vacancy counts - :param alignment_info: Alignment under consideration - :param i: Source word position under consideration - :param trg_classes: Target word classes - :param slots: Vacancy states of the slots in the target sentence. - Output parameter that will be modified as new words are placed - in the target sentence. - """ - tablet = alignment_info.cepts[i] - tablet_length = len(tablet) - total_vacancies = slots.vacancies_at(len(slots)) - - # case 1: NULL aligned words - if tablet_length == 0: - return # ignore zero fertility words - - # case 2: head word - j = tablet[0] - previous_cept = alignment_info.previous_cept(j) - previous_center = alignment_info.center_of_cept(previous_cept) - dv = slots.vacancies_at(j) - slots.vacancies_at(previous_center) - max_v = total_vacancies - tablet_length + 1 - trg_class = trg_classes[alignment_info.trg_sentence[j]] - self.head_vacancy[dv][max_v][trg_class] += count - self.head_vacancy_for_any_dv[max_v][trg_class] += count - slots.occupy(j) # mark position as occupied - total_vacancies -= 1 - - # case 3: non-head words - for k in range(1, tablet_length): - previous_position = tablet[k - 1] - previous_vacancies = slots.vacancies_at(previous_position) - j = tablet[k] - dv = slots.vacancies_at(j) - previous_vacancies - max_v = total_vacancies - tablet_length + k + 1 - previous_vacancies - trg_class = trg_classes[alignment_info.trg_sentence[j]] - self.non_head_vacancy[dv][max_v][trg_class] += count - self.non_head_vacancy_for_any_dv[max_v][trg_class] += count - slots.occupy(j) # mark position as occupied - total_vacancies -= 1 - - -class Slots: - """ - Represents positions in a target sentence. Used to keep track of - which slot (position) is occupied. - """ - - def __init__(self, target_sentence_length): - self._slots = [False] * (target_sentence_length + 1) # 1-indexed - - def occupy(self, position): - """ - :return: Mark slot at ``position`` as occupied - """ - self._slots[position] = True - - def vacancies_at(self, position): - """ - :return: Number of vacant slots up to, and including, ``position`` - """ - vacancies = 0 - for k in range(1, position + 1): - if not self._slots[k]: - vacancies += 1 - return vacancies - - def __len__(self): - return len(self._slots) - 1 # exclude dummy zeroeth element diff --git a/pipeline/nltk/translate/ibm_model.py b/pipeline/nltk/translate/ibm_model.py deleted file mode 100644 index ed9a49408638605f8d4d627883d51e04816877d7..0000000000000000000000000000000000000000 --- a/pipeline/nltk/translate/ibm_model.py +++ /dev/null @@ -1,549 +0,0 @@ -# Natural Language Toolkit: IBM Model Core -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Tah Wei Hoon -# URL: -# For license information, see LICENSE.TXT - -""" -Common methods and classes for all IBM models. See ``IBMModel1``, -``IBMModel2``, ``IBMModel3``, ``IBMModel4``, and ``IBMModel5`` -for specific implementations. 
- -The IBM models are a series of generative models that learn lexical -translation probabilities, p(target language word|source language word), -given a sentence-aligned parallel corpus. - -The models increase in sophistication from model 1 to 5. Typically, the -output of lower models is used to seed the higher models. All models -use the Expectation-Maximization (EM) algorithm to learn various -probability tables. - -Words in a sentence are one-indexed. The first word of a sentence has -position 1, not 0. Index 0 is reserved in the source sentence for the -NULL token. The concept of position does not apply to NULL, but it is -indexed at 0 by convention. - -Each target word is aligned to exactly one source word or the NULL -token. - -References: -Philipp Koehn. 2010. Statistical Machine Translation. -Cambridge University Press, New York. - -Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and -Robert L. Mercer. 1993. The Mathematics of Statistical Machine -Translation: Parameter Estimation. Computational Linguistics, 19 (2), -263-311. -""" - -from bisect import insort_left -from collections import defaultdict -from copy import deepcopy -from math import ceil - - -def longest_target_sentence_length(sentence_aligned_corpus): - """ - :param sentence_aligned_corpus: Parallel corpus under consideration - :type sentence_aligned_corpus: list(AlignedSent) - :return: Number of words in the longest target language sentence - of ``sentence_aligned_corpus`` - """ - max_m = 0 - for aligned_sentence in sentence_aligned_corpus: - m = len(aligned_sentence.words) - max_m = max(m, max_m) - return max_m - - -class IBMModel: - """ - Abstract base class for all IBM models - """ - - # Avoid division by zero and precision errors by imposing a minimum - # value for probabilities. Note that this approach is theoretically - # incorrect, since it may create probabilities that sum to more - # than 1. In practice, the contribution of probabilities with MIN_PROB - # is tiny enough that the value of MIN_PROB can be treated as zero. - MIN_PROB = 1.0e-12 # GIZA++ is more liberal and uses 1.0e-7 - - def __init__(self, sentence_aligned_corpus): - self.init_vocab(sentence_aligned_corpus) - self.reset_probabilities() - - def reset_probabilities(self): - self.translation_table = defaultdict( - lambda: defaultdict(lambda: IBMModel.MIN_PROB) - ) - """ - dict[str][str]: float. Probability(target word | source word). - Values accessed as ``translation_table[target_word][source_word]``. - """ - - self.alignment_table = defaultdict( - lambda: defaultdict( - lambda: defaultdict(lambda: defaultdict(lambda: IBMModel.MIN_PROB)) - ) - ) - """ - dict[int][int][int][int]: float. Probability(i | j,l,m). - Values accessed as ``alignment_table[i][j][l][m]``. - Used in model 2 and hill climbing in models 3 and above - """ - - self.fertility_table = defaultdict(lambda: defaultdict(lambda: self.MIN_PROB)) - """ - dict[int][str]: float. Probability(fertility | source word). - Values accessed as ``fertility_table[fertility][source_word]``. - Used in model 3 and higher. - """ - - self.p1 = 0.5 - """ - Probability that a generated word requires another target word - that is aligned to NULL. - Used in model 3 and higher. - """ - - def set_uniform_probabilities(self, sentence_aligned_corpus): - """ - Initialize probability tables to a uniform distribution - - Derived classes should implement this accordingly. 
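# [Editor's sketch -- not part of the deleted file.] The probability tables above are
# nested defaultdicts, so any (target, source) pair never seen in training silently
# falls back to MIN_PROB instead of raising a KeyError:
from collections import defaultdict

MIN_PROB = 1.0e-12
translation_table = defaultdict(lambda: defaultdict(lambda: MIN_PROB))
translation_table["haus"]["house"] = 0.8
assert translation_table["haus"]["house"] == 0.8
assert translation_table["haus"]["book"] == MIN_PROB  # unseen pair -> floor value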
- """ - pass - - def init_vocab(self, sentence_aligned_corpus): - src_vocab = set() - trg_vocab = set() - for aligned_sentence in sentence_aligned_corpus: - trg_vocab.update(aligned_sentence.words) - src_vocab.update(aligned_sentence.mots) - # Add the NULL token - src_vocab.add(None) - - self.src_vocab = src_vocab - """ - set(str): All source language words used in training - """ - - self.trg_vocab = trg_vocab - """ - set(str): All target language words used in training - """ - - def sample(self, sentence_pair): - """ - Sample the most probable alignments from the entire alignment - space - - First, determine the best alignment according to IBM Model 2. - With this initial alignment, use hill climbing to determine the - best alignment according to a higher IBM Model. Add this - alignment and its neighbors to the sample set. Repeat this - process with other initial alignments obtained by pegging an - alignment point. - - Hill climbing may be stuck in a local maxima, hence the pegging - and trying out of different alignments. - - :param sentence_pair: Source and target language sentence pair - to generate a sample of alignments from - :type sentence_pair: AlignedSent - - :return: A set of best alignments represented by their ``AlignmentInfo`` - and the best alignment of the set for convenience - :rtype: set(AlignmentInfo), AlignmentInfo - """ - sampled_alignments = set() - l = len(sentence_pair.mots) - m = len(sentence_pair.words) - - # Start from the best model 2 alignment - initial_alignment = self.best_model2_alignment(sentence_pair) - potential_alignment = self.hillclimb(initial_alignment) - sampled_alignments.update(self.neighboring(potential_alignment)) - best_alignment = potential_alignment - - # Start from other model 2 alignments, - # with the constraint that j is aligned (pegged) to i - for j in range(1, m + 1): - for i in range(0, l + 1): - initial_alignment = self.best_model2_alignment(sentence_pair, j, i) - potential_alignment = self.hillclimb(initial_alignment, j) - neighbors = self.neighboring(potential_alignment, j) - sampled_alignments.update(neighbors) - if potential_alignment.score > best_alignment.score: - best_alignment = potential_alignment - - return sampled_alignments, best_alignment - - def best_model2_alignment(self, sentence_pair, j_pegged=None, i_pegged=0): - """ - Finds the best alignment according to IBM Model 2 - - Used as a starting point for hill climbing in Models 3 and - above, because it is easier to compute than the best alignments - in higher models - - :param sentence_pair: Source and target language sentence pair - to be word-aligned - :type sentence_pair: AlignedSent - - :param j_pegged: If specified, the alignment point of j_pegged - will be fixed to i_pegged - :type j_pegged: int - - :param i_pegged: Alignment point to j_pegged - :type i_pegged: int - """ - src_sentence = [None] + sentence_pair.mots - trg_sentence = ["UNUSED"] + sentence_pair.words # 1-indexed - - l = len(src_sentence) - 1 # exclude NULL - m = len(trg_sentence) - 1 - - alignment = [0] * (m + 1) # init all alignments to NULL - cepts = [[] for i in range(l + 1)] # init all cepts to empty list - - for j in range(1, m + 1): - if j == j_pegged: - # use the pegged alignment instead of searching for best one - best_i = i_pegged - else: - best_i = 0 - max_alignment_prob = IBMModel.MIN_PROB - t = trg_sentence[j] - - for i in range(0, l + 1): - s = src_sentence[i] - alignment_prob = ( - self.translation_table[t][s] * self.alignment_table[i][j][l][m] - ) - - if alignment_prob >= 
max_alignment_prob: - max_alignment_prob = alignment_prob - best_i = i - - alignment[j] = best_i - cepts[best_i].append(j) - - return AlignmentInfo( - tuple(alignment), tuple(src_sentence), tuple(trg_sentence), cepts - ) - - def hillclimb(self, alignment_info, j_pegged=None): - """ - Starting from the alignment in ``alignment_info``, look at - neighboring alignments iteratively for the best one - - There is no guarantee that the best alignment in the alignment - space will be found, because the algorithm might be stuck in a - local maximum. - - :param j_pegged: If specified, the search will be constrained to - alignments where ``j_pegged`` remains unchanged - :type j_pegged: int - - :return: The best alignment found from hill climbing - :rtype: AlignmentInfo - """ - alignment = alignment_info # alias with shorter name - max_probability = self.prob_t_a_given_s(alignment) - - while True: - old_alignment = alignment - for neighbor_alignment in self.neighboring(alignment, j_pegged): - neighbor_probability = self.prob_t_a_given_s(neighbor_alignment) - - if neighbor_probability > max_probability: - alignment = neighbor_alignment - max_probability = neighbor_probability - - if alignment == old_alignment: - # Until there are no better alignments - break - - alignment.score = max_probability - return alignment - - def neighboring(self, alignment_info, j_pegged=None): - """ - Determine the neighbors of ``alignment_info``, obtained by - moving or swapping one alignment point - - :param j_pegged: If specified, neighbors that have a different - alignment point from j_pegged will not be considered - :type j_pegged: int - - :return: A set neighboring alignments represented by their - ``AlignmentInfo`` - :rtype: set(AlignmentInfo) - """ - neighbors = set() - - l = len(alignment_info.src_sentence) - 1 # exclude NULL - m = len(alignment_info.trg_sentence) - 1 - original_alignment = alignment_info.alignment - original_cepts = alignment_info.cepts - - for j in range(1, m + 1): - if j != j_pegged: - # Add alignments that differ by one alignment point - for i in range(0, l + 1): - new_alignment = list(original_alignment) - new_cepts = deepcopy(original_cepts) - old_i = original_alignment[j] - - # update alignment - new_alignment[j] = i - - # update cepts - insort_left(new_cepts[i], j) - new_cepts[old_i].remove(j) - - new_alignment_info = AlignmentInfo( - tuple(new_alignment), - alignment_info.src_sentence, - alignment_info.trg_sentence, - new_cepts, - ) - neighbors.add(new_alignment_info) - - for j in range(1, m + 1): - if j != j_pegged: - # Add alignments that have two alignment points swapped - for other_j in range(1, m + 1): - if other_j != j_pegged and other_j != j: - new_alignment = list(original_alignment) - new_cepts = deepcopy(original_cepts) - other_i = original_alignment[other_j] - i = original_alignment[j] - - # update alignments - new_alignment[j] = other_i - new_alignment[other_j] = i - - # update cepts - new_cepts[other_i].remove(other_j) - insort_left(new_cepts[other_i], j) - new_cepts[i].remove(j) - insort_left(new_cepts[i], other_j) - - new_alignment_info = AlignmentInfo( - tuple(new_alignment), - alignment_info.src_sentence, - alignment_info.trg_sentence, - new_cepts, - ) - neighbors.add(new_alignment_info) - - return neighbors - - def maximize_lexical_translation_probabilities(self, counts): - for t, src_words in counts.t_given_s.items(): - for s in src_words: - estimate = counts.t_given_s[t][s] / counts.any_t_given_s[s] - self.translation_table[t][s] = max(estimate, IBMModel.MIN_PROB) - - 
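# [Editor's worked example with hypothetical counts -- not part of the deleted file.]
# The M step above turns expected counts into probabilities by normalising against the
# total expected count for the source word and then clamping from below with MIN_PROB:
t_given_s = {"haus": {"house": 1.5}}        # expected count of 'haus' given 'house'
any_t_given_s = {"house": 2.0}              # total expected count for 'house'
MIN_PROB = 1.0e-12
estimate = t_given_s["haus"]["house"] / any_t_given_s["house"]
assert max(estimate, MIN_PROB) == 0.75      # new value of translation_table['haus']['house']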
def maximize_fertility_probabilities(self, counts): - for phi, src_words in counts.fertility.items(): - for s in src_words: - estimate = counts.fertility[phi][s] / counts.fertility_for_any_phi[s] - self.fertility_table[phi][s] = max(estimate, IBMModel.MIN_PROB) - - def maximize_null_generation_probabilities(self, counts): - p1_estimate = counts.p1 / (counts.p1 + counts.p0) - p1_estimate = max(p1_estimate, IBMModel.MIN_PROB) - # Clip p1 if it is too large, because p0 = 1 - p1 should not be - # smaller than MIN_PROB - self.p1 = min(p1_estimate, 1 - IBMModel.MIN_PROB) - - def prob_of_alignments(self, alignments): - probability = 0 - for alignment_info in alignments: - probability += self.prob_t_a_given_s(alignment_info) - return probability - - def prob_t_a_given_s(self, alignment_info): - """ - Probability of target sentence and an alignment given the - source sentence - - All required information is assumed to be in ``alignment_info`` - and self. - - Derived classes should override this method - """ - return 0.0 - - -class AlignmentInfo: - """ - Helper data object for training IBM Models 3 and up - - Read-only. For a source sentence and its counterpart in the target - language, this class holds information about the sentence pair's - alignment, cepts, and fertility. - - Warning: Alignments are one-indexed here, in contrast to - nltk.translate.Alignment and AlignedSent, which are zero-indexed - This class is not meant to be used outside of IBM models. - """ - - def __init__(self, alignment, src_sentence, trg_sentence, cepts): - if not isinstance(alignment, tuple): - raise TypeError( - "The alignment must be a tuple because it is used " - "to uniquely identify AlignmentInfo objects." - ) - - self.alignment = alignment - """ - tuple(int): Alignment function. ``alignment[j]`` is the position - in the source sentence that is aligned to the position j in the - target sentence. - """ - - self.src_sentence = src_sentence - """ - tuple(str): Source sentence referred to by this object. - Should include NULL token (None) in index 0. - """ - - self.trg_sentence = trg_sentence - """ - tuple(str): Target sentence referred to by this object. - Should have a dummy element in index 0 so that the first word - starts from index 1. - """ - - self.cepts = cepts - """ - list(list(int)): The positions of the target words, in - ascending order, aligned to a source word position. For example, - cepts[4] = (2, 3, 7) means that words in positions 2, 3 and 7 - of the target sentence are aligned to the word in position 4 of - the source sentence - """ - - self.score = None - """ - float: Optional. 
Probability of alignment, as defined by the - IBM model that assesses this alignment - """ - - def fertility_of_i(self, i): - """ - Fertility of word in position ``i`` of the source sentence - """ - return len(self.cepts[i]) - - def is_head_word(self, j): - """ - :return: Whether the word in position ``j`` of the target - sentence is a head word - """ - i = self.alignment[j] - return self.cepts[i][0] == j - - def center_of_cept(self, i): - """ - :return: The ceiling of the average positions of the words in - the tablet of cept ``i``, or 0 if ``i`` is None - """ - if i is None: - return 0 - - average_position = sum(self.cepts[i]) / len(self.cepts[i]) - return int(ceil(average_position)) - - def previous_cept(self, j): - """ - :return: The previous cept of ``j``, or None if ``j`` belongs to - the first cept - """ - i = self.alignment[j] - if i == 0: - raise ValueError( - "Words aligned to NULL cannot have a previous " - "cept because NULL has no position" - ) - previous_cept = i - 1 - while previous_cept > 0 and self.fertility_of_i(previous_cept) == 0: - previous_cept -= 1 - - if previous_cept <= 0: - previous_cept = None - return previous_cept - - def previous_in_tablet(self, j): - """ - :return: The position of the previous word that is in the same - tablet as ``j``, or None if ``j`` is the first word of the - tablet - """ - i = self.alignment[j] - tablet_position = self.cepts[i].index(j) - if tablet_position == 0: - return None - return self.cepts[i][tablet_position - 1] - - def zero_indexed_alignment(self): - """ - :return: Zero-indexed alignment, suitable for use in external - ``nltk.translate`` modules like ``nltk.translate.Alignment`` - :rtype: list(tuple) - """ - zero_indexed_alignment = [] - for j in range(1, len(self.trg_sentence)): - i = self.alignment[j] - 1 - if i < 0: - i = None # alignment to NULL token - zero_indexed_alignment.append((j - 1, i)) - return zero_indexed_alignment - - def __eq__(self, other): - return self.alignment == other.alignment - - def __ne__(self, other): - return not self == other - - def __hash__(self): - return hash(self.alignment) - - -class Counts: - """ - Data object to store counts of various parameters during training - """ - - def __init__(self): - self.t_given_s = defaultdict(lambda: defaultdict(lambda: 0.0)) - self.any_t_given_s = defaultdict(lambda: 0.0) - self.p0 = 0.0 - self.p1 = 0.0 - self.fertility = defaultdict(lambda: defaultdict(lambda: 0.0)) - self.fertility_for_any_phi = defaultdict(lambda: 0.0) - - def update_lexical_translation(self, count, alignment_info, j): - i = alignment_info.alignment[j] - t = alignment_info.trg_sentence[j] - s = alignment_info.src_sentence[i] - self.t_given_s[t][s] += count - self.any_t_given_s[s] += count - - def update_null_generation(self, count, alignment_info): - m = len(alignment_info.trg_sentence) - 1 - fertility_of_null = alignment_info.fertility_of_i(0) - self.p1 += fertility_of_null * count - self.p0 += (m - 2 * fertility_of_null) * count - - def update_fertility(self, count, alignment_info): - for i in range(0, len(alignment_info.src_sentence)): - s = alignment_info.src_sentence[i] - phi = alignment_info.fertility_of_i(i) - self.fertility[phi][s] += count - self.fertility_for_any_phi[s] += count diff --git a/pipeline/nltk/translate/meteor_score.py b/pipeline/nltk/translate/meteor_score.py deleted file mode 100644 index 847f2ad19205816f71caff5623b1d992ef2dbfda..0000000000000000000000000000000000000000 --- a/pipeline/nltk/translate/meteor_score.py +++ /dev/null @@ -1,409 +0,0 @@ -# Natural Language 
Toolkit: Machine Translation -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Uday Krishna -# Contributor: Tom Aarsen -# URL: -# For license information, see LICENSE.TXT - - -from itertools import chain, product -from typing import Callable, Iterable, List, Tuple - -from nltk.corpus import WordNetCorpusReader, wordnet -from nltk.stem.api import StemmerI -from nltk.stem.porter import PorterStemmer - - -def _generate_enums( - hypothesis: Iterable[str], - reference: Iterable[str], - preprocess: Callable[[str], str] = str.lower, -) -> Tuple[List[Tuple[int, str]], List[Tuple[int, str]]]: - """ - Takes in pre-tokenized inputs for hypothesis and reference and returns - enumerated word lists for each of them - - :param hypothesis: pre-tokenized hypothesis - :param reference: pre-tokenized reference - :preprocess: preprocessing method (default str.lower) - :return: enumerated words list - """ - if isinstance(hypothesis, str): - raise TypeError( - f'"hypothesis" expects pre-tokenized hypothesis (Iterable[str]): {hypothesis}' - ) - - if isinstance(reference, str): - raise TypeError( - f'"reference" expects pre-tokenized reference (Iterable[str]): {reference}' - ) - - enum_hypothesis_list = list(enumerate(map(preprocess, hypothesis))) - enum_reference_list = list(enumerate(map(preprocess, reference))) - return enum_hypothesis_list, enum_reference_list - - -def exact_match( - hypothesis: Iterable[str], reference: Iterable[str] -) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]: - """ - matches exact words in hypothesis and reference - and returns a word mapping based on the enumerated - word id between hypothesis and reference - - :param hypothesis: pre-tokenized hypothesis - :param reference: pre-tokenized reference - :return: enumerated matched tuples, enumerated unmatched hypothesis tuples, - enumerated unmatched reference tuples - """ - enum_hypothesis_list, enum_reference_list = _generate_enums(hypothesis, reference) - return _match_enums(enum_hypothesis_list, enum_reference_list) - - -def _match_enums( - enum_hypothesis_list: List[Tuple[int, str]], - enum_reference_list: List[Tuple[int, str]], -) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]: - """ - matches exact words in hypothesis and reference and returns - a word mapping between enum_hypothesis_list and enum_reference_list - based on the enumerated word id. - - :param enum_hypothesis_list: enumerated hypothesis list - :param enum_reference_list: enumerated reference list - :return: enumerated matched tuples, enumerated unmatched hypothesis tuples, - enumerated unmatched reference tuples - """ - word_match = [] - for i in range(len(enum_hypothesis_list))[::-1]: - for j in range(len(enum_reference_list))[::-1]: - if enum_hypothesis_list[i][1] == enum_reference_list[j][1]: - word_match.append( - (enum_hypothesis_list[i][0], enum_reference_list[j][0]) - ) - enum_hypothesis_list.pop(i) - enum_reference_list.pop(j) - break - return word_match, enum_hypothesis_list, enum_reference_list - - -def _enum_stem_match( - enum_hypothesis_list: List[Tuple[int, str]], - enum_reference_list: List[Tuple[int, str]], - stemmer: StemmerI = PorterStemmer(), -) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]: - """ - Stems each word and matches them in hypothesis and reference - and returns a word mapping between enum_hypothesis_list and - enum_reference_list based on the enumerated word id. 
The function also - returns a enumerated list of unmatched words for hypothesis and reference. - - :param enum_hypothesis_list: enumerated hypothesis list - :param enum_reference_list: enumerated reference list - :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer()) - :return: enumerated matched tuples, enumerated unmatched hypothesis tuples, - enumerated unmatched reference tuples - """ - stemmed_enum_hypothesis_list = [ - (word_pair[0], stemmer.stem(word_pair[1])) for word_pair in enum_hypothesis_list - ] - - stemmed_enum_reference_list = [ - (word_pair[0], stemmer.stem(word_pair[1])) for word_pair in enum_reference_list - ] - - return _match_enums(stemmed_enum_hypothesis_list, stemmed_enum_reference_list) - - -def stem_match( - hypothesis: Iterable[str], - reference: Iterable[str], - stemmer: StemmerI = PorterStemmer(), -) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]: - """ - Stems each word and matches them in hypothesis and reference - and returns a word mapping between hypothesis and reference - - :param hypothesis: pre-tokenized hypothesis - :param reference: pre-tokenized reference - :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer()) - :return: enumerated matched tuples, enumerated unmatched hypothesis tuples, - enumerated unmatched reference tuples - """ - enum_hypothesis_list, enum_reference_list = _generate_enums(hypothesis, reference) - return _enum_stem_match(enum_hypothesis_list, enum_reference_list, stemmer=stemmer) - - -def _enum_wordnetsyn_match( - enum_hypothesis_list: List[Tuple[int, str]], - enum_reference_list: List[Tuple[int, str]], - wordnet: WordNetCorpusReader = wordnet, -) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]: - """ - Matches each word in reference to a word in hypothesis - if any synonym of a hypothesis word is the exact match - to the reference word. - - :param enum_hypothesis_list: enumerated hypothesis list - :param enum_reference_list: enumerated reference list - :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet) - """ - word_match = [] - for i in range(len(enum_hypothesis_list))[::-1]: - hypothesis_syns = set( - chain.from_iterable( - ( - lemma.name() - for lemma in synset.lemmas() - if lemma.name().find("_") < 0 - ) - for synset in wordnet.synsets(enum_hypothesis_list[i][1]) - ) - ).union({enum_hypothesis_list[i][1]}) - for j in range(len(enum_reference_list))[::-1]: - if enum_reference_list[j][1] in hypothesis_syns: - word_match.append( - (enum_hypothesis_list[i][0], enum_reference_list[j][0]) - ) - enum_hypothesis_list.pop(i) - enum_reference_list.pop(j) - break - return word_match, enum_hypothesis_list, enum_reference_list - - -def wordnetsyn_match( - hypothesis: Iterable[str], - reference: Iterable[str], - wordnet: WordNetCorpusReader = wordnet, -) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]: - """ - Matches each word in reference to a word in hypothesis if any synonym - of a hypothesis word is the exact match to the reference word. 
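# [Editor's sketch with hypothetical tokens -- not part of the deleted file.] The
# enumerated matching used by the functions above pairs identical words by their
# enumerated ids and removes them from both lists, so later matching stages only see
# the words that are still unmatched:
hyp = [(0, "the"), (1, "cat"), (2, "sat")]
ref = [(0, "the"), (1, "cat"), (2, "slept")]
matches = []
for i, h_word in list(hyp):
    for j, r_word in list(ref):
        if h_word == r_word:
            matches.append((i, j))
            hyp.remove((i, h_word))
            ref.remove((j, r_word))
            break
assert matches == [(0, 0), (1, 1)]
assert hyp == [(2, "sat")] and ref == [(2, "slept")]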
- - :param hypothesis: pre-tokenized hypothesis - :param reference: pre-tokenized reference - :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet) - :return: list of mapped tuples - """ - enum_hypothesis_list, enum_reference_list = _generate_enums(hypothesis, reference) - return _enum_wordnetsyn_match( - enum_hypothesis_list, enum_reference_list, wordnet=wordnet - ) - - -def _enum_align_words( - enum_hypothesis_list: List[Tuple[int, str]], - enum_reference_list: List[Tuple[int, str]], - stemmer: StemmerI = PorterStemmer(), - wordnet: WordNetCorpusReader = wordnet, -) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]: - """ - Aligns/matches words in the hypothesis to reference by sequentially - applying exact match, stemmed match and wordnet based synonym match. - in case there are multiple matches the match which has the least number - of crossing is chosen. Takes enumerated list as input instead of - string input - - :param enum_hypothesis_list: enumerated hypothesis list - :param enum_reference_list: enumerated reference list - :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer()) - :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet) - :return: sorted list of matched tuples, unmatched hypothesis list, - unmatched reference list - """ - exact_matches, enum_hypothesis_list, enum_reference_list = _match_enums( - enum_hypothesis_list, enum_reference_list - ) - - stem_matches, enum_hypothesis_list, enum_reference_list = _enum_stem_match( - enum_hypothesis_list, enum_reference_list, stemmer=stemmer - ) - - wns_matches, enum_hypothesis_list, enum_reference_list = _enum_wordnetsyn_match( - enum_hypothesis_list, enum_reference_list, wordnet=wordnet - ) - - return ( - sorted( - exact_matches + stem_matches + wns_matches, key=lambda wordpair: wordpair[0] - ), - enum_hypothesis_list, - enum_reference_list, - ) - - -def align_words( - hypothesis: Iterable[str], - reference: Iterable[str], - stemmer: StemmerI = PorterStemmer(), - wordnet: WordNetCorpusReader = wordnet, -) -> Tuple[List[Tuple[int, int]], List[Tuple[int, str]], List[Tuple[int, str]]]: - """ - Aligns/matches words in the hypothesis to reference by sequentially - applying exact match, stemmed match and wordnet based synonym match. - In case there are multiple matches the match which has the least number - of crossing is chosen. - - :param hypothesis: pre-tokenized hypothesis - :param reference: pre-tokenized reference - :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer()) - :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet) - :return: sorted list of matched tuples, unmatched hypothesis list, unmatched reference list - """ - enum_hypothesis_list, enum_reference_list = _generate_enums(hypothesis, reference) - return _enum_align_words( - enum_hypothesis_list, enum_reference_list, stemmer=stemmer, wordnet=wordnet - ) - - -def _count_chunks(matches: List[Tuple[int, int]]) -> int: - """ - Counts the fewest possible number of chunks such that matched unigrams - of each chunk are adjacent to each other. This is used to calculate the - fragmentation part of the metric. 
- - :param matches: list containing a mapping of matched words (output of align_words) - :return: Number of chunks a sentence is divided into post alignment - """ - i = 0 - chunks = 1 - while i < len(matches) - 1: - if (matches[i + 1][0] == matches[i][0] + 1) and ( - matches[i + 1][1] == matches[i][1] + 1 - ): - i += 1 - continue - i += 1 - chunks += 1 - return chunks - - -def single_meteor_score( - reference: Iterable[str], - hypothesis: Iterable[str], - preprocess: Callable[[str], str] = str.lower, - stemmer: StemmerI = PorterStemmer(), - wordnet: WordNetCorpusReader = wordnet, - alpha: float = 0.9, - beta: float = 3.0, - gamma: float = 0.5, -) -> float: - """ - Calculates METEOR score for single hypothesis and reference as per - "Meteor: An Automatic Metric for MT Evaluation with HighLevels of - Correlation with Human Judgments" by Alon Lavie and Abhaya Agarwal, - in Proceedings of ACL. - https://www.cs.cmu.edu/~alavie/METEOR/pdf/Lavie-Agarwal-2007-METEOR.pdf - - - >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures', 'that', 'the', 'military', 'always', 'obeys', 'the', 'commands', 'of', 'the', 'party'] - - >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures', 'that', 'the', 'military', 'will', 'forever', 'heed', 'Party', 'commands'] - - - >>> round(single_meteor_score(reference1, hypothesis1),4) - 0.6944 - - If there is no words match during the alignment the method returns the - score as 0. We can safely return a zero instead of raising a - division by zero error as no match usually implies a bad translation. - - >>> round(single_meteor_score(['this', 'is', 'a', 'cat'], ['non', 'matching', 'hypothesis']),4) - 0.0 - - :param reference: pre-tokenized reference - :param hypothesis: pre-tokenized hypothesis - :param preprocess: preprocessing function (default str.lower) - :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer()) - :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet) - :param alpha: parameter for controlling relative weights of precision and recall. - :param beta: parameter for controlling shape of penalty as a - function of as a function of fragmentation. - :param gamma: relative weight assigned to fragmentation penalty. - :return: The sentence-level METEOR score. 
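# [Editor's worked example with hypothetical counts -- not part of the deleted file.]
# Plugging numbers into the formula implemented below: 6 matched unigrams, an 8-word
# hypothesis, a 7-word reference and 2 chunks, with the defaults alpha=0.9, beta=3.0,
# gamma=0.5:
matches_count, translation_length, reference_length, chunk_count = 6, 8, 7, 2
alpha, beta, gamma = 0.9, 3.0, 0.5
precision = matches_count / translation_length                         # 0.75
recall = matches_count / reference_length                              # ~0.857
fmean = (precision * recall) / (alpha * precision + (1 - alpha) * recall)
penalty = gamma * (chunk_count / matches_count) ** beta
print(round((1 - penalty) * fmean, 4))                                 # ~0.8294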
- """ - enum_hypothesis, enum_reference = _generate_enums( - hypothesis, reference, preprocess=preprocess - ) - translation_length = len(enum_hypothesis) - reference_length = len(enum_reference) - matches, _, _ = _enum_align_words( - enum_hypothesis, enum_reference, stemmer=stemmer, wordnet=wordnet - ) - matches_count = len(matches) - try: - precision = float(matches_count) / translation_length - recall = float(matches_count) / reference_length - fmean = (precision * recall) / (alpha * precision + (1 - alpha) * recall) - chunk_count = float(_count_chunks(matches)) - frag_frac = chunk_count / matches_count - except ZeroDivisionError: - return 0.0 - penalty = gamma * frag_frac**beta - return (1 - penalty) * fmean - - -def meteor_score( - references: Iterable[Iterable[str]], - hypothesis: Iterable[str], - preprocess: Callable[[str], str] = str.lower, - stemmer: StemmerI = PorterStemmer(), - wordnet: WordNetCorpusReader = wordnet, - alpha: float = 0.9, - beta: float = 3.0, - gamma: float = 0.5, -) -> float: - """ - Calculates METEOR score for hypothesis with multiple references as - described in "Meteor: An Automatic Metric for MT Evaluation with - HighLevels of Correlation with Human Judgments" by Alon Lavie and - Abhaya Agarwal, in Proceedings of ACL. - https://www.cs.cmu.edu/~alavie/METEOR/pdf/Lavie-Agarwal-2007-METEOR.pdf - - - In case of multiple references the best score is chosen. This method - iterates over single_meteor_score and picks the best pair among all - the references for a given hypothesis - - >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures', 'that', 'the', 'military', 'always', 'obeys', 'the', 'commands', 'of', 'the', 'party'] - >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', 'forever', 'hearing', 'the', 'activity', 'guidebook', 'that', 'party', 'direct'] - - >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures', 'that', 'the', 'military', 'will', 'forever', 'heed', 'Party', 'commands'] - >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', 'guarantees', 'the', 'military', 'forces', 'always', 'being', 'under', 'the', 'command', 'of', 'the', 'Party'] - >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', 'army', 'always', 'to', 'heed', 'the', 'directions', 'of', 'the', 'party'] - - >>> round(meteor_score([reference1, reference2, reference3], hypothesis1),4) - 0.6944 - - If there is no words match during the alignment the method returns the - score as 0. We can safely return a zero instead of raising a - division by zero error as no match usually implies a bad translation. - - >>> round(meteor_score([['this', 'is', 'a', 'cat']], ['non', 'matching', 'hypothesis']),4) - 0.0 - - :param references: pre-tokenized reference sentences - :param hypothesis: a pre-tokenized hypothesis sentence - :param preprocess: preprocessing function (default str.lower) - :param stemmer: nltk.stem.api.StemmerI object (default PorterStemmer()) - :param wordnet: a wordnet corpus reader object (default nltk.corpus.wordnet) - :param alpha: parameter for controlling relative weights of precision and recall. - :param beta: parameter for controlling shape of penalty as a function - of as a function of fragmentation. - :param gamma: relative weight assigned to fragmentation penalty. - :return: The sentence-level METEOR score. 
- """ - return max( - single_meteor_score( - reference, - hypothesis, - preprocess=preprocess, - stemmer=stemmer, - wordnet=wordnet, - alpha=alpha, - beta=beta, - gamma=gamma, - ) - for reference in references - ) diff --git a/pipeline/nltk/translate/metrics.py b/pipeline/nltk/translate/metrics.py deleted file mode 100644 index 88444087f65395428c87a6c5d805c682958b6e55..0000000000000000000000000000000000000000 --- a/pipeline/nltk/translate/metrics.py +++ /dev/null @@ -1,41 +0,0 @@ -# Natural Language Toolkit: Translation metrics -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Will Zhang -# Guan Gui -# Steven Bird -# URL: -# For license information, see LICENSE.TXT - - -def alignment_error_rate(reference, hypothesis, possible=None): - """ - Return the Alignment Error Rate (AER) of an alignment - with respect to a "gold standard" reference alignment. - Return an error rate between 0.0 (perfect alignment) and 1.0 (no - alignment). - - >>> from nltk.translate import Alignment - >>> ref = Alignment([(0, 0), (1, 1), (2, 2)]) - >>> test = Alignment([(0, 0), (1, 2), (2, 1)]) - >>> alignment_error_rate(ref, test) # doctest: +ELLIPSIS - 0.6666666666666667 - - :type reference: Alignment - :param reference: A gold standard alignment (sure alignments) - :type hypothesis: Alignment - :param hypothesis: A hypothesis alignment (aka. candidate alignments) - :type possible: Alignment or None - :param possible: A gold standard reference of possible alignments - (defaults to *reference* if None) - :rtype: float or None - """ - - if possible is None: - possible = reference - else: - assert reference.issubset(possible) # sanity check - - return 1.0 - (len(hypothesis & reference) + len(hypothesis & possible)) / float( - len(hypothesis) + len(reference) - ) diff --git a/pipeline/nltk/translate/nist_score.py b/pipeline/nltk/translate/nist_score.py deleted file mode 100644 index 0035a9dcdae5f1acf703c2c957353f880db22615..0000000000000000000000000000000000000000 --- a/pipeline/nltk/translate/nist_score.py +++ /dev/null @@ -1,195 +0,0 @@ -# Natural Language Toolkit: NIST Score -# -# Copyright (C) 2001-2023 NLTK Project -# Authors: -# Contributors: -# URL: -# For license information, see LICENSE.TXT - -"""NIST score implementation.""" - -import fractions -import math -from collections import Counter - -from nltk.util import ngrams - - -def sentence_nist(references, hypothesis, n=5): - """ - Calculate NIST score from - George Doddington. 2002. "Automatic evaluation of machine translation quality - using n-gram co-occurrence statistics." Proceedings of HLT. - Morgan Kaufmann Publishers Inc. https://dl.acm.org/citation.cfm?id=1289189.1289273 - - DARPA commissioned NIST to develop an MT evaluation facility based on the BLEU - score. The official script used by NIST to compute BLEU and NIST score is - mteval-14.pl. The main differences are: - - - BLEU uses geometric mean of the ngram overlaps, NIST uses arithmetic mean. - - NIST has a different brevity penalty - - NIST score from mteval-14.pl has a self-contained tokenizer - - Note: The mteval-14.pl includes a smoothing function for BLEU score that is NOT - used in the NIST score computation. - - >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', - ... 'ensures', 'that', 'the', 'military', 'always', - ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] - - >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', - ... 'forever', 'hearing', 'the', 'activity', 'guidebook', - ... 
'that', 'party', 'direct'] - - >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', - ... 'ensures', 'that', 'the', 'military', 'will', 'forever', - ... 'heed', 'Party', 'commands'] - - >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', - ... 'guarantees', 'the', 'military', 'forces', 'always', - ... 'being', 'under', 'the', 'command', 'of', 'the', - ... 'Party'] - - >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', - ... 'army', 'always', 'to', 'heed', 'the', 'directions', - ... 'of', 'the', 'party'] - - >>> sentence_nist([reference1, reference2, reference3], hypothesis1) # doctest: +ELLIPSIS - 3.3709... - - >>> sentence_nist([reference1, reference2, reference3], hypothesis2) # doctest: +ELLIPSIS - 1.4619... - - :param references: reference sentences - :type references: list(list(str)) - :param hypothesis: a hypothesis sentence - :type hypothesis: list(str) - :param n: highest n-gram order - :type n: int - """ - return corpus_nist([references], [hypothesis], n) - - -def corpus_nist(list_of_references, hypotheses, n=5): - """ - Calculate a single corpus-level NIST score (aka. system-level BLEU) for all - the hypotheses and their respective references. - - :param references: a corpus of lists of reference sentences, w.r.t. hypotheses - :type references: list(list(list(str))) - :param hypotheses: a list of hypothesis sentences - :type hypotheses: list(list(str)) - :param n: highest n-gram order - :type n: int - """ - # Before proceeding to compute NIST, perform sanity checks. - assert len(list_of_references) == len( - hypotheses - ), "The number of hypotheses and their reference(s) should be the same" - - # Collect the ngram coounts from the reference sentences. - ngram_freq = Counter() - total_reference_words = 0 - for ( - references - ) in list_of_references: # For each source sent, there's a list of reference sents. - for reference in references: - # For each order of ngram, count the ngram occurrences. - for i in range(1, n + 1): - ngram_freq.update(ngrams(reference, i)) - total_reference_words += len(reference) - - # Compute the information weights based on the reference sentences. - # Eqn 2 in Doddington (2002): - # Info(w_1 ... w_n) = log_2 [ (# of occurrences of w_1 ... w_n-1) / (# of occurrences of w_1 ... w_n) ] - information_weights = {} - for _ngram in ngram_freq: # w_1 ... w_n - _mgram = _ngram[:-1] # w_1 ... w_n-1 - # From https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v13a.pl#L546 - # it's computed as such: - # denominator = ngram_freq[_mgram] if _mgram and _mgram in ngram_freq else denominator = total_reference_words - # information_weights[_ngram] = -1 * math.log(ngram_freq[_ngram]/denominator) / math.log(2) - # - # Mathematically, it's equivalent to the our implementation: - if _mgram and _mgram in ngram_freq: - numerator = ngram_freq[_mgram] - else: - numerator = total_reference_words - information_weights[_ngram] = math.log(numerator / ngram_freq[_ngram], 2) - - # Micro-average. - nist_precision_numerator_per_ngram = Counter() - nist_precision_denominator_per_ngram = Counter() - l_ref, l_sys = 0, 0 - # For each order of ngram. - for i in range(1, n + 1): - # Iterate through each hypothesis and their corresponding references. - for references, hypothesis in zip(list_of_references, hypotheses): - hyp_len = len(hypothesis) - - # Find reference with the best NIST score. - nist_score_per_ref = [] - for reference in references: - _ref_len = len(reference) - # Counter of ngrams in hypothesis. 
- hyp_ngrams = ( - Counter(ngrams(hypothesis, i)) - if len(hypothesis) >= i - else Counter() - ) - ref_ngrams = ( - Counter(ngrams(reference, i)) if len(reference) >= i else Counter() - ) - ngram_overlaps = hyp_ngrams & ref_ngrams - # Precision part of the score in Eqn 3 - _numerator = sum( - information_weights[_ngram] * count - for _ngram, count in ngram_overlaps.items() - ) - _denominator = sum(hyp_ngrams.values()) - _precision = 0 if _denominator == 0 else _numerator / _denominator - nist_score_per_ref.append( - (_precision, _numerator, _denominator, _ref_len) - ) - # Best reference. - precision, numerator, denominator, ref_len = max(nist_score_per_ref) - nist_precision_numerator_per_ngram[i] += numerator - nist_precision_denominator_per_ngram[i] += denominator - l_ref += ref_len - l_sys += hyp_len - - # Final NIST micro-average mean aggregation. - nist_precision = 0 - for i in nist_precision_numerator_per_ngram: - precision = ( - nist_precision_numerator_per_ngram[i] - / nist_precision_denominator_per_ngram[i] - ) - nist_precision += precision - # Eqn 3 in Doddington(2002) - return nist_precision * nist_length_penalty(l_ref, l_sys) - - -def nist_length_penalty(ref_len, hyp_len): - """ - Calculates the NIST length penalty, from Eq. 3 in Doddington (2002) - - penalty = exp( beta * log( min( len(hyp)/len(ref) , 1.0 ))) - - where, - - `beta` is chosen to make the brevity penalty factor = 0.5 when the - no. of words in the system output (hyp) is 2/3 of the average - no. of words in the reference translation (ref) - - The NIST penalty is different from BLEU's such that it minimize the impact - of the score of small variations in the length of a translation. - See Fig. 4 in Doddington (2002) - """ - ratio = hyp_len / ref_len - if 0 < ratio < 1: - ratio_x, score_x = 1.5, 0.5 - beta = math.log(score_x) / math.log(ratio_x) ** 2 - return math.exp(beta * math.log(ratio) ** 2) - else: # ratio <= 0 or ratio >= 1 - return max(min(ratio, 1.0), 0.0) diff --git a/pipeline/nltk/translate/phrase_based.py b/pipeline/nltk/translate/phrase_based.py deleted file mode 100644 index 3fd85109ad26055023c502d6bd233a220d28e7e4..0000000000000000000000000000000000000000 --- a/pipeline/nltk/translate/phrase_based.py +++ /dev/null @@ -1,193 +0,0 @@ -# Natural Language Toolkit: Phrase Extraction Algorithm -# -# Copyright (C) 2001-2023 NLTK Project -# Authors: Liling Tan, Fredrik Hedman, Petra Barancikova -# URL: -# For license information, see LICENSE.TXT - - -def extract( - f_start, - f_end, - e_start, - e_end, - alignment, - f_aligned, - srctext, - trgtext, - srclen, - trglen, - max_phrase_length, -): - """ - This function checks for alignment point consistency and extracts - phrases using the chunk of consistent phrases. - - A phrase pair (e, f ) is consistent with an alignment A if and only if: - - (i) No English words in the phrase pair are aligned to words outside it. - - ∀e i ∈ e, (e i , f j ) ∈ A ⇒ f j ∈ f - - (ii) No Foreign words in the phrase pair are aligned to words outside it. - - ∀f j ∈ f , (e i , f j ) ∈ A ⇒ e i ∈ e - - (iii) The phrase pair contains at least one alignment point. - - ∃e i ∈ e ̄ , f j ∈ f ̄ s.t. 
(e i , f j ) ∈ A - - :type f_start: int - :param f_start: Starting index of the possible foreign language phrases - :type f_end: int - :param f_end: End index of the possible foreign language phrases - :type e_start: int - :param e_start: Starting index of the possible source language phrases - :type e_end: int - :param e_end: End index of the possible source language phrases - :type srctext: list - :param srctext: The source language tokens, a list of string. - :type trgtext: list - :param trgtext: The target language tokens, a list of string. - :type srclen: int - :param srclen: The number of tokens in the source language tokens. - :type trglen: int - :param trglen: The number of tokens in the target language tokens. - """ - - if f_end < 0: # 0-based indexing. - return {} - # Check if alignment points are consistent. - for e, f in alignment: - if (f_start <= f <= f_end) and (e < e_start or e > e_end): - return {} - - # Add phrase pairs (incl. additional unaligned f) - phrases = set() - fs = f_start - while True: - fe = min(f_end, f_start + max_phrase_length - 1) - while True: - # add phrase pair ([e_start, e_end], [fs, fe]) to set E - # Need to +1 in range to include the end-point. - src_phrase = " ".join(srctext[e_start : e_end + 1]) - trg_phrase = " ".join(trgtext[fs : fe + 1]) - # Include more data for later ordering. - phrases.add(((e_start, e_end + 1), (fs, fe + 1), src_phrase, trg_phrase)) - fe += 1 - if fe in f_aligned or fe >= trglen: - break - fs -= 1 - if fs in f_aligned or fs < 0: - break - return phrases - - -def phrase_extraction(srctext, trgtext, alignment, max_phrase_length=0): - """ - Phrase extraction algorithm extracts all consistent phrase pairs from - a word-aligned sentence pair. - - The idea is to loop over all possible source language (e) phrases and find - the minimal foreign phrase (f) that matches each of them. Matching is done - by identifying all alignment points for the source phrase and finding the - shortest foreign phrase that includes all the foreign counterparts for the - source words. - - In short, a phrase alignment has to - (a) contain all alignment points for all covered words - (b) contain at least one alignment point - - >>> srctext = "michael assumes that he will stay in the house" - >>> trgtext = "michael geht davon aus , dass er im haus bleibt" - >>> alignment = [(0,0), (1,1), (1,2), (1,3), (2,5), (3,6), (4,9), - ... (5,9), (6,7), (7,7), (8,8)] - >>> phrases = phrase_extraction(srctext, trgtext, alignment) - >>> for i in sorted(phrases): - ... print(i) - ... 
- ((0, 1), (0, 1), 'michael', 'michael') - ((0, 2), (0, 4), 'michael assumes', 'michael geht davon aus') - ((0, 2), (0, 5), 'michael assumes', 'michael geht davon aus ,') - ((0, 3), (0, 6), 'michael assumes that', 'michael geht davon aus , dass') - ((0, 4), (0, 7), 'michael assumes that he', 'michael geht davon aus , dass er') - ((0, 9), (0, 10), 'michael assumes that he will stay in the house', 'michael geht davon aus , dass er im haus bleibt') - ((1, 2), (1, 4), 'assumes', 'geht davon aus') - ((1, 2), (1, 5), 'assumes', 'geht davon aus ,') - ((1, 3), (1, 6), 'assumes that', 'geht davon aus , dass') - ((1, 4), (1, 7), 'assumes that he', 'geht davon aus , dass er') - ((1, 9), (1, 10), 'assumes that he will stay in the house', 'geht davon aus , dass er im haus bleibt') - ((2, 3), (4, 6), 'that', ', dass') - ((2, 3), (5, 6), 'that', 'dass') - ((2, 4), (4, 7), 'that he', ', dass er') - ((2, 4), (5, 7), 'that he', 'dass er') - ((2, 9), (4, 10), 'that he will stay in the house', ', dass er im haus bleibt') - ((2, 9), (5, 10), 'that he will stay in the house', 'dass er im haus bleibt') - ((3, 4), (6, 7), 'he', 'er') - ((3, 9), (6, 10), 'he will stay in the house', 'er im haus bleibt') - ((4, 6), (9, 10), 'will stay', 'bleibt') - ((4, 9), (7, 10), 'will stay in the house', 'im haus bleibt') - ((6, 8), (7, 8), 'in the', 'im') - ((6, 9), (7, 9), 'in the house', 'im haus') - ((8, 9), (8, 9), 'house', 'haus') - - :type srctext: str - :param srctext: The sentence string from the source language. - :type trgtext: str - :param trgtext: The sentence string from the target language. - :type alignment: list(tuple) - :param alignment: The word alignment outputs as list of tuples, where - the first elements of tuples are the source words' indices and - second elements are the target words' indices. This is also the output - format of nltk.translate.ibm1 - :rtype: list(tuple) - :return: A list of tuples, each element in a list is a phrase and each - phrase is a tuple made up of (i) its source location, (ii) its target - location, (iii) the source phrase and (iii) the target phrase. The phrase - list of tuples represents all the possible phrases extracted from the - word alignments. - :type max_phrase_length: int - :param max_phrase_length: maximal phrase length, if 0 or not specified - it is set to a length of the longer sentence (srctext or trgtext). - """ - - srctext = srctext.split() # e - trgtext = trgtext.split() # f - srclen = len(srctext) # len(e) - trglen = len(trgtext) # len(f) - # Keeps an index of which source/target words that are aligned. 
- f_aligned = [j for _, j in alignment] - max_phrase_length = max_phrase_length or max(srclen, trglen) - - # set of phrase pairs BP - bp = set() - - for e_start in range(srclen): - max_idx = min(srclen, e_start + max_phrase_length) - for e_end in range(e_start, max_idx): - # // find the minimally matching foreign phrase - # (f start , f end ) = ( length(f), 0 ) - # f_start ∈ [0, len(f) - 1]; f_end ∈ [0, len(f) - 1] - f_start, f_end = trglen - 1, -1 # 0-based indexing - - for e, f in alignment: - if e_start <= e <= e_end: - f_start = min(f, f_start) - f_end = max(f, f_end) - # add extract (f start , f end , e start , e end ) to set BP - phrases = extract( - f_start, - f_end, - e_start, - e_end, - alignment, - f_aligned, - srctext, - trgtext, - srclen, - trglen, - max_phrase_length, - ) - if phrases: - bp.update(phrases) - return bp diff --git a/pipeline/nltk/translate/ribes_score.py b/pipeline/nltk/translate/ribes_score.py deleted file mode 100644 index f5d0bb5f14590082fb74e4a2c3613a40b6e168f1..0000000000000000000000000000000000000000 --- a/pipeline/nltk/translate/ribes_score.py +++ /dev/null @@ -1,330 +0,0 @@ -# Natural Language Toolkit: RIBES Score -# -# Copyright (C) 2001-2023 NLTK Project -# Contributors: Katsuhito Sudoh, Liling Tan, Kasramvd, J.F.Sebastian -# Mark Byers, ekhumoro, P. Ortiz -# URL: -# For license information, see LICENSE.TXT -""" RIBES score implementation """ - -import math -from itertools import islice - -from nltk.util import choose, ngrams - - -def sentence_ribes(references, hypothesis, alpha=0.25, beta=0.10): - """ - The RIBES (Rank-based Intuitive Bilingual Evaluation Score) from - Hideki Isozaki, Tsutomu Hirao, Kevin Duh, Katsuhito Sudoh and - Hajime Tsukada. 2010. "Automatic Evaluation of Translation Quality for - Distant Language Pairs". In Proceedings of EMNLP. - https://www.aclweb.org/anthology/D/D10/D10-1092.pdf - - The generic RIBES scores used in shared task, e.g. Workshop for - Asian Translation (WAT) uses the following RIBES calculations: - - RIBES = kendall_tau * (alpha**p1) * (beta**bp) - - Please note that this re-implementation differs from the official - RIBES implementation and though it emulates the results as describe - in the original paper, there are further optimization implemented - in the official RIBES script. - - Users are encouraged to use the official RIBES script instead of this - implementation when evaluating your machine translation system. Refer - to https://www.kecl.ntt.co.jp/icl/lirg/ribes/ for the official script. - - :param references: a list of reference sentences - :type references: list(list(str)) - :param hypothesis: a hypothesis sentence - :type hypothesis: list(str) - :param alpha: hyperparameter used as a prior for the unigram precision. - :type alpha: float - :param beta: hyperparameter used as a prior for the brevity penalty. - :type beta: float - :return: The best ribes score from one of the references. - :rtype: float - """ - best_ribes = -1.0 - # Calculates RIBES for each reference and returns the best score. - for reference in references: - # Collects the *worder* from the ranked correlation alignments. - worder = word_rank_alignment(reference, hypothesis) - nkt = kendall_tau(worder) - - # Calculates the brevity penalty - bp = min(1.0, math.exp(1.0 - len(reference) / len(hypothesis))) - - # Calculates the unigram precision, *p1* - p1 = len(worder) / len(hypothesis) - - _ribes = nkt * (p1**alpha) * (bp**beta) - - if _ribes > best_ribes: # Keeps the best score. 
- best_ribes = _ribes - - return best_ribes - - -def corpus_ribes(list_of_references, hypotheses, alpha=0.25, beta=0.10): - """ - This function "calculates RIBES for a system output (hypothesis) with - multiple references, and returns "best" score among multi-references and - individual scores. The scores are corpus-wise, i.e., averaged by the number - of sentences." (c.f. RIBES version 1.03.1 code). - - Different from BLEU's micro-average precision, RIBES calculates the - macro-average precision by averaging the best RIBES score for each pair of - hypothesis and its corresponding references - - >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', - ... 'ensures', 'that', 'the', 'military', 'always', - ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] - >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', - ... 'ensures', 'that', 'the', 'military', 'will', 'forever', - ... 'heed', 'Party', 'commands'] - >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which', - ... 'guarantees', 'the', 'military', 'forces', 'always', - ... 'being', 'under', 'the', 'command', 'of', 'the', 'Party'] - >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', - ... 'army', 'always', 'to', 'heed', 'the', 'directions', - ... 'of', 'the', 'party'] - - >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was', - ... 'interested', 'in', 'world', 'history'] - >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history', - ... 'because', 'he', 'read', 'the', 'book'] - - >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]] - >>> hypotheses = [hyp1, hyp2] - >>> round(corpus_ribes(list_of_references, hypotheses),4) - 0.3597 - - :param references: a corpus of lists of reference sentences, w.r.t. hypotheses - :type references: list(list(list(str))) - :param hypotheses: a list of hypothesis sentences - :type hypotheses: list(list(str)) - :param alpha: hyperparameter used as a prior for the unigram precision. - :type alpha: float - :param beta: hyperparameter used as a prior for the brevity penalty. - :type beta: float - :return: The best ribes score from one of the references. - :rtype: float - """ - corpus_best_ribes = 0.0 - # Iterate through each hypothesis and their corresponding references. - for references, hypothesis in zip(list_of_references, hypotheses): - corpus_best_ribes += sentence_ribes(references, hypothesis, alpha, beta) - return corpus_best_ribes / len(hypotheses) - - -def position_of_ngram(ngram, sentence): - """ - This function returns the position of the first instance of the ngram - appearing in a sentence. - - Note that one could also use string as follows but the code is a little - convoluted with type casting back and forth: - - char_pos = ' '.join(sent)[:' '.join(sent).index(' '.join(ngram))] - word_pos = char_pos.count(' ') - - Another way to conceive this is: - - return next(i for i, ng in enumerate(ngrams(sentence, len(ngram))) - if ng == ngram) - - :param ngram: The ngram that needs to be searched - :type ngram: tuple - :param sentence: The list of tokens to search from. - :type sentence: list(str) - """ - # Iterates through the ngrams in sentence. - for i, sublist in enumerate(ngrams(sentence, len(ngram))): - # Returns the index of the word when ngram matches. - if ngram == sublist: - return i - - -def word_rank_alignment(reference, hypothesis, character_based=False): - """ - This is the word rank alignment algorithm described in the paper to produce - the *worder* list, i.e. a list of word indices of the hypothesis word orders - w.r.t. 
the list of reference words. - - Below is (H0, R0) example from the Isozaki et al. 2010 paper, - note the examples are indexed from 1 but the results here are indexed from 0: - - >>> ref = str('he was interested in world history because he ' - ... 'read the book').split() - >>> hyp = str('he read the book because he was interested in world ' - ... 'history').split() - >>> word_rank_alignment(ref, hyp) - [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5] - - The (H1, R1) example from the paper, note the 0th index: - - >>> ref = 'John hit Bob yesterday'.split() - >>> hyp = 'Bob hit John yesterday'.split() - >>> word_rank_alignment(ref, hyp) - [2, 1, 0, 3] - - Here is the (H2, R2) example from the paper, note the 0th index here too: - - >>> ref = 'the boy read the book'.split() - >>> hyp = 'the book was read by the boy'.split() - >>> word_rank_alignment(ref, hyp) - [3, 4, 2, 0, 1] - - :param reference: a reference sentence - :type reference: list(str) - :param hypothesis: a hypothesis sentence - :type hypothesis: list(str) - """ - worder = [] - hyp_len = len(hypothesis) - # Stores a list of possible ngrams from the reference sentence. - # This is used for matching context window later in the algorithm. - ref_ngrams = [] - hyp_ngrams = [] - for n in range(1, len(reference) + 1): - for ng in ngrams(reference, n): - ref_ngrams.append(ng) - for ng in ngrams(hypothesis, n): - hyp_ngrams.append(ng) - for i, h_word in enumerate(hypothesis): - # If word is not in the reference, continue. - if h_word not in reference: - continue - # If we can determine one-to-one word correspondence for unigrams that - # only appear once in both the reference and hypothesis. - elif hypothesis.count(h_word) == reference.count(h_word) == 1: - worder.append(reference.index(h_word)) - else: - max_window_size = max(i, hyp_len - i + 1) - for window in range(1, max_window_size): - if i + window < hyp_len: # If searching the right context is possible. - # Retrieve the right context window. - right_context_ngram = tuple(islice(hypothesis, i, i + window + 1)) - num_times_in_ref = ref_ngrams.count(right_context_ngram) - num_times_in_hyp = hyp_ngrams.count(right_context_ngram) - # If ngram appears only once in both ref and hyp. - if num_times_in_ref == num_times_in_hyp == 1: - # Find the position of ngram that matched the reference. - pos = position_of_ngram(right_context_ngram, reference) - worder.append(pos) # Add the positions of the ngram. - break - if window <= i: # If searching the left context is possible. - # Retrieve the left context window. - left_context_ngram = tuple(islice(hypothesis, i - window, i + 1)) - num_times_in_ref = ref_ngrams.count(left_context_ngram) - num_times_in_hyp = hyp_ngrams.count(left_context_ngram) - if num_times_in_ref == num_times_in_hyp == 1: - # Find the position of ngram that matched the reference. - pos = position_of_ngram(left_context_ngram, reference) - # Add the positions of the ngram. - worder.append(pos + len(left_context_ngram) - 1) - break - return worder - - -def find_increasing_sequences(worder): - """ - Given the *worder* list, this function groups monotonic +1 sequences. 
- - >>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5] - >>> list(find_increasing_sequences(worder)) - [(7, 8, 9, 10), (0, 1, 2, 3, 4, 5)] - - :param worder: The worder list output from word_rank_alignment - :param type: list(int) - """ - items = iter(worder) - a, b = None, next(items, None) - result = [b] - while b is not None: - a, b = b, next(items, None) - if b is not None and a + 1 == b: - result.append(b) - else: - if len(result) > 1: - yield tuple(result) - result = [b] - - -def kendall_tau(worder, normalize=True): - """ - Calculates the Kendall's Tau correlation coefficient given the *worder* - list of word alignments from word_rank_alignment(), using the formula: - - tau = 2 * num_increasing_pairs / num_possible_pairs -1 - - Note that the no. of increasing pairs can be discontinuous in the *worder* - list and each each increasing sequence can be tabulated as choose(len(seq), 2) - no. of increasing pairs, e.g. - - >>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5] - >>> number_possible_pairs = choose(len(worder), 2) - >>> round(kendall_tau(worder, normalize=False),3) - -0.236 - >>> round(kendall_tau(worder),3) - 0.382 - - :param worder: The worder list output from word_rank_alignment - :type worder: list(int) - :param normalize: Flag to indicate normalization to between 0.0 and 1.0. - :type normalize: boolean - :return: The Kendall's Tau correlation coefficient. - :rtype: float - """ - worder_len = len(worder) - # With worder_len < 2, `choose(worder_len, 2)` will be 0. - # As we divide by this, it will give a ZeroDivisionError. - # To avoid this, we can just return the lowest possible score. - if worder_len < 2: - tau = -1 - else: - # Extract the groups of increasing/monotonic sequences. - increasing_sequences = find_increasing_sequences(worder) - # Calculate no. of increasing_pairs in *worder* list. - num_increasing_pairs = sum(choose(len(seq), 2) for seq in increasing_sequences) - # Calculate no. of possible pairs. - num_possible_pairs = choose(worder_len, 2) - # Kendall's Tau computation. - tau = 2 * num_increasing_pairs / num_possible_pairs - 1 - if normalize: # If normalized, the tau output falls between 0.0 to 1.0 - return (tau + 1) / 2 - else: # Otherwise, the tau outputs falls between -1.0 to +1.0 - return tau - - -def spearman_rho(worder, normalize=True): - """ - Calculates the Spearman's Rho correlation coefficient given the *worder* - list of word alignment from word_rank_alignment(), using the formula: - - rho = 1 - sum(d**2) / choose(len(worder)+1, 3) - - Given that d is the sum of difference between the *worder* list of indices - and the original word indices from the reference sentence. 
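# Worked arithmetic for the formula above (illustrative, using the (H0, R0)
# worder list from the examples that follow): d_i = worder[i] - i gives
# [7, 7, 7, 7, 2, -5, -5, -5, -5, -5, -5], so sum(d**2) = 350 and
# choose(len(worder) + 1, 3) = choose(12, 3) = 220, hence
# rho = 1 - 350/220 ≈ -0.591, or (rho + 1)/2 ≈ 0.205 after normalization.
worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5]
sum_d_square = sum((w - i) ** 2 for i, w in enumerate(worder))  # 350
rho = 1 - sum_d_square / 220                                    # ≈ -0.5909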
- - Using the (H0,R0) and (H5, R5) example from the paper - - >>> worder = [7, 8, 9, 10, 6, 0, 1, 2, 3, 4, 5] - >>> round(spearman_rho(worder, normalize=False), 3) - -0.591 - >>> round(spearman_rho(worder), 3) - 0.205 - - :param worder: The worder list output from word_rank_alignment - :param type: list(int) - """ - worder_len = len(worder) - sum_d_square = sum((wi - i) ** 2 for wi, i in zip(worder, range(worder_len))) - rho = 1 - sum_d_square / choose(worder_len + 1, 3) - - if normalize: # If normalized, the rho output falls between 0.0 to 1.0 - return (rho + 1) / 2 - else: # Otherwise, the rho outputs falls between -1.0 to +1.0 - return rho diff --git a/pipeline/nltk/translate/stack_decoder.py b/pipeline/nltk/translate/stack_decoder.py deleted file mode 100644 index 29c6c99ff8d39848e3e17d413e9b40296bd5dc71..0000000000000000000000000000000000000000 --- a/pipeline/nltk/translate/stack_decoder.py +++ /dev/null @@ -1,515 +0,0 @@ -# Natural Language Toolkit: Stack decoder -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Tah Wei Hoon -# URL: -# For license information, see LICENSE.TXT - -""" -A decoder that uses stacks to implement phrase-based translation. - -In phrase-based translation, the source sentence is segmented into -phrases of one or more words, and translations for those phrases are -used to build the target sentence. - -Hypothesis data structures are used to keep track of the source words -translated so far and the partial output. A hypothesis can be expanded -by selecting an untranslated phrase, looking up its translation in a -phrase table, and appending that translation to the partial output. -Translation is complete when a hypothesis covers all source words. - -The search space is huge because the source sentence can be segmented -in different ways, the source phrases can be selected in any order, -and there could be multiple translations for the same source phrase in -the phrase table. To make decoding tractable, stacks are used to limit -the number of candidate hypotheses by doing histogram and/or threshold -pruning. - -Hypotheses with the same number of words translated are placed in the -same stack. In histogram pruning, each stack has a size limit, and -the hypothesis with the lowest score is removed when the stack is full. -In threshold pruning, hypotheses that score below a certain threshold -of the best hypothesis in that stack are removed. - -Hypothesis scoring can include various factors such as phrase -translation probability, language model probability, length of -translation, cost of remaining words to be translated, and so on. - - -References: -Philipp Koehn. 2010. Statistical Machine Translation. -Cambridge University Press, New York. 
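# A compact sketch of the two pruning rules described above (hypothetical
# helper, not from this module): scores are log-probabilities, so the
# relative beam threshold turns into an additive log-space cutoff, just as
# the _Stack class further below does it.
from math import log

def prune_stack(scores, stack_size=100, beam_threshold=0.25):
    if not scores:
        return []
    kept = sorted(scores, reverse=True)[:stack_size]  # histogram pruning
    cutoff = kept[0] + log(beam_threshold)            # threshold pruning
    return [s for s in kept if s >= cutoff]

# prune_stack([-1.0, -1.5, -4.0], stack_size=2) -> [-1.0, -1.5]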
-""" - -import warnings -from collections import defaultdict -from math import log - - -class StackDecoder: - """ - Phrase-based stack decoder for machine translation - - >>> from nltk.translate import PhraseTable - >>> phrase_table = PhraseTable() - >>> phrase_table.add(('niemand',), ('nobody',), log(0.8)) - >>> phrase_table.add(('niemand',), ('no', 'one'), log(0.2)) - >>> phrase_table.add(('erwartet',), ('expects',), log(0.8)) - >>> phrase_table.add(('erwartet',), ('expecting',), log(0.2)) - >>> phrase_table.add(('niemand', 'erwartet'), ('one', 'does', 'not', 'expect'), log(0.1)) - >>> phrase_table.add(('die', 'spanische', 'inquisition'), ('the', 'spanish', 'inquisition'), log(0.8)) - >>> phrase_table.add(('!',), ('!',), log(0.8)) - - >>> # nltk.model should be used here once it is implemented - >>> from collections import defaultdict - >>> language_prob = defaultdict(lambda: -999.0) - >>> language_prob[('nobody',)] = log(0.5) - >>> language_prob[('expects',)] = log(0.4) - >>> language_prob[('the', 'spanish', 'inquisition')] = log(0.2) - >>> language_prob[('!',)] = log(0.1) - >>> language_model = type('',(object,),{'probability_change': lambda self, context, phrase: language_prob[phrase], 'probability': lambda self, phrase: language_prob[phrase]})() - - >>> stack_decoder = StackDecoder(phrase_table, language_model) - - >>> stack_decoder.translate(['niemand', 'erwartet', 'die', 'spanische', 'inquisition', '!']) - ['nobody', 'expects', 'the', 'spanish', 'inquisition', '!'] - - """ - - def __init__(self, phrase_table, language_model): - """ - :param phrase_table: Table of translations for source language - phrases and the log probabilities for those translations. - :type phrase_table: PhraseTable - - :param language_model: Target language model. Must define a - ``probability_change`` method that calculates the change in - log probability of a sentence, if a given string is appended - to it. - This interface is experimental and will likely be replaced - with nltk.model once it is implemented. - :type language_model: object - """ - self.phrase_table = phrase_table - self.language_model = language_model - - self.word_penalty = 0.0 - """ - float: Influences the translation length exponentially. - If positive, shorter translations are preferred. - If negative, longer translations are preferred. - If zero, no penalty is applied. - """ - - self.beam_threshold = 0.0 - """ - float: Hypotheses that score below this factor of the best - hypothesis in a stack are dropped from consideration. - Value between 0.0 and 1.0. - """ - - self.stack_size = 100 - """ - int: Maximum number of hypotheses to consider in a stack. - Higher values increase the likelihood of a good translation, - but increases processing time. - """ - - self.__distortion_factor = 0.5 - self.__compute_log_distortion() - - @property - def distortion_factor(self): - """ - float: Amount of reordering of source phrases. - Lower values favour monotone translation, suitable when - word order is similar for both source and target languages. - Value between 0.0 and 1.0. Default 0.5. 
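# Illustrative arithmetic for the reordering penalty this factor controls
# (see distortion_score further below, which computes
# abs(next_start - prev_end) * log(distortion_factor)): with the default 0.5,
# jumping from a phrase whose span ends at source index 2 to one starting at
# index 5 costs 3 * log(0.5) ≈ -2.079 in log-probability.
from math import log
reordering_cost = abs(5 - 2) * log(0.5)  # ≈ -2.0794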
- """ - return self.__distortion_factor - - @distortion_factor.setter - def distortion_factor(self, d): - self.__distortion_factor = d - self.__compute_log_distortion() - - def __compute_log_distortion(self): - # cache log(distortion_factor) so we don't have to recompute it - # when scoring hypotheses - if self.__distortion_factor == 0.0: - self.__log_distortion_factor = log(1e-9) # 1e-9 is almost zero - else: - self.__log_distortion_factor = log(self.__distortion_factor) - - def translate(self, src_sentence): - """ - :param src_sentence: Sentence to be translated - :type src_sentence: list(str) - - :return: Translated sentence - :rtype: list(str) - """ - sentence = tuple(src_sentence) # prevent accidental modification - sentence_length = len(sentence) - stacks = [ - _Stack(self.stack_size, self.beam_threshold) - for _ in range(0, sentence_length + 1) - ] - empty_hypothesis = _Hypothesis() - stacks[0].push(empty_hypothesis) - - all_phrases = self.find_all_src_phrases(sentence) - future_score_table = self.compute_future_scores(sentence) - for stack in stacks: - for hypothesis in stack: - possible_expansions = StackDecoder.valid_phrases( - all_phrases, hypothesis - ) - for src_phrase_span in possible_expansions: - src_phrase = sentence[src_phrase_span[0] : src_phrase_span[1]] - for translation_option in self.phrase_table.translations_for( - src_phrase - ): - raw_score = self.expansion_score( - hypothesis, translation_option, src_phrase_span - ) - new_hypothesis = _Hypothesis( - raw_score=raw_score, - src_phrase_span=src_phrase_span, - trg_phrase=translation_option.trg_phrase, - previous=hypothesis, - ) - new_hypothesis.future_score = self.future_score( - new_hypothesis, future_score_table, sentence_length - ) - total_words = new_hypothesis.total_translated_words() - stacks[total_words].push(new_hypothesis) - - if not stacks[sentence_length]: - warnings.warn( - "Unable to translate all words. " - "The source sentence contains words not in " - "the phrase table" - ) - # Instead of returning empty output, perhaps a partial - # translation could be returned - return [] - - best_hypothesis = stacks[sentence_length].best() - return best_hypothesis.translation_so_far() - - def find_all_src_phrases(self, src_sentence): - """ - Finds all subsequences in src_sentence that have a phrase - translation in the translation table - - :type src_sentence: tuple(str) - - :return: Subsequences that have a phrase translation, - represented as a table of lists of end positions. - For example, if result[2] is [5, 6, 9], then there are - three phrases starting from position 2 in ``src_sentence``, - ending at positions 5, 6, and 9 exclusive. The list of - ending positions are in ascending order. - :rtype: list(list(int)) - """ - sentence_length = len(src_sentence) - phrase_indices = [[] for _ in src_sentence] - for start in range(0, sentence_length): - for end in range(start + 1, sentence_length + 1): - potential_phrase = src_sentence[start:end] - if potential_phrase in self.phrase_table: - phrase_indices[start].append(end) - return phrase_indices - - def compute_future_scores(self, src_sentence): - """ - Determines the approximate scores for translating every - subsequence in ``src_sentence`` - - Future scores can be used a look-ahead to determine the - difficulty of translating the remaining parts of a src_sentence. - - :type src_sentence: tuple(str) - - :return: Scores of subsequences referenced by their start and - end positions. 
For example, result[2][5] is the score of the - subsequence covering positions 2, 3, and 4. - :rtype: dict(int: (dict(int): float)) - """ - scores = defaultdict(lambda: defaultdict(lambda: float("-inf"))) - for seq_length in range(1, len(src_sentence) + 1): - for start in range(0, len(src_sentence) - seq_length + 1): - end = start + seq_length - phrase = src_sentence[start:end] - if phrase in self.phrase_table: - score = self.phrase_table.translations_for(phrase)[ - 0 - ].log_prob # pick best (first) translation - # Warning: API of language_model is subject to change - score += self.language_model.probability(phrase) - scores[start][end] = score - - # check if a better score can be obtained by combining - # two child subsequences - for mid in range(start + 1, end): - combined_score = scores[start][mid] + scores[mid][end] - if combined_score > scores[start][end]: - scores[start][end] = combined_score - return scores - - def future_score(self, hypothesis, future_score_table, sentence_length): - """ - Determines the approximate score for translating the - untranslated words in ``hypothesis`` - """ - score = 0.0 - for span in hypothesis.untranslated_spans(sentence_length): - score += future_score_table[span[0]][span[1]] - return score - - def expansion_score(self, hypothesis, translation_option, src_phrase_span): - """ - Calculate the score of expanding ``hypothesis`` with - ``translation_option`` - - :param hypothesis: Hypothesis being expanded - :type hypothesis: _Hypothesis - - :param translation_option: Information about the proposed expansion - :type translation_option: PhraseTableEntry - - :param src_phrase_span: Word position span of the source phrase - :type src_phrase_span: tuple(int, int) - """ - score = hypothesis.raw_score - score += translation_option.log_prob - # The API of language_model is subject to change; it could accept - # a string, a list of words, and/or some other type - score += self.language_model.probability_change( - hypothesis, translation_option.trg_phrase - ) - score += self.distortion_score(hypothesis, src_phrase_span) - score -= self.word_penalty * len(translation_option.trg_phrase) - return score - - def distortion_score(self, hypothesis, next_src_phrase_span): - if not hypothesis.src_phrase_span: - return 0.0 - next_src_phrase_start = next_src_phrase_span[0] - prev_src_phrase_end = hypothesis.src_phrase_span[1] - distortion_distance = next_src_phrase_start - prev_src_phrase_end - return abs(distortion_distance) * self.__log_distortion_factor - - @staticmethod - def valid_phrases(all_phrases_from, hypothesis): - """ - Extract phrases from ``all_phrases_from`` that contains words - that have not been translated by ``hypothesis`` - - :param all_phrases_from: Phrases represented by their spans, in - the same format as the return value of - ``find_all_src_phrases`` - :type all_phrases_from: list(list(int)) - - :type hypothesis: _Hypothesis - - :return: A list of phrases, represented by their spans, that - cover untranslated positions. 
- :rtype: list(tuple(int, int)) - """ - untranslated_spans = hypothesis.untranslated_spans(len(all_phrases_from)) - valid_phrases = [] - for available_span in untranslated_spans: - start = available_span[0] - available_end = available_span[1] - while start < available_end: - for phrase_end in all_phrases_from[start]: - if phrase_end > available_end: - # Subsequent elements in all_phrases_from[start] - # will also be > available_end, since the - # elements are in ascending order - break - valid_phrases.append((start, phrase_end)) - start += 1 - return valid_phrases - - -class _Hypothesis: - """ - Partial solution to a translation. - - Records the word positions of the phrase being translated, its - translation, raw score, and the cost of the untranslated parts of - the sentence. When the next phrase is selected to build upon the - partial solution, a new _Hypothesis object is created, with a back - pointer to the previous hypothesis. - - To find out which words have been translated so far, look at the - ``src_phrase_span`` in the hypothesis chain. Similarly, the - translation output can be found by traversing up the chain. - """ - - def __init__( - self, - raw_score=0.0, - src_phrase_span=(), - trg_phrase=(), - previous=None, - future_score=0.0, - ): - """ - :param raw_score: Likelihood of hypothesis so far. - Higher is better. Does not account for untranslated words. - :type raw_score: float - - :param src_phrase_span: Span of word positions covered by the - source phrase in this hypothesis expansion. For example, - (2, 5) means that the phrase is from the second word up to, - but not including the fifth word in the source sentence. - :type src_phrase_span: tuple(int) - - :param trg_phrase: Translation of the source phrase in this - hypothesis expansion - :type trg_phrase: tuple(str) - - :param previous: Previous hypothesis before expansion to this one - :type previous: _Hypothesis - - :param future_score: Approximate score for translating the - remaining words not covered by this hypothesis. Higher means - that the remaining words are easier to translate. - :type future_score: float - """ - self.raw_score = raw_score - self.src_phrase_span = src_phrase_span - self.trg_phrase = trg_phrase - self.previous = previous - self.future_score = future_score - - def score(self): - """ - Overall score of hypothesis after accounting for local and - global features - """ - return self.raw_score + self.future_score - - def untranslated_spans(self, sentence_length): - """ - Starting from each untranslated word, find the longest - continuous span of untranslated positions - - :param sentence_length: Length of source sentence being - translated by the hypothesis - :type sentence_length: int - - :rtype: list(tuple(int, int)) - """ - translated_positions = self.translated_positions() - translated_positions.sort() - translated_positions.append(sentence_length) # add sentinel position - - untranslated_spans = [] - start = 0 - # each untranslated span must end in one of the translated_positions - for end in translated_positions: - if start < end: - untranslated_spans.append((start, end)) - start = end + 1 - - return untranslated_spans - - def translated_positions(self): - """ - List of positions in the source sentence of words already - translated. The list is not sorted. 
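# Worked example of the span bookkeeping above (standalone re-implementation
# with hypothetical values, mirroring untranslated_spans): for a six-word
# source sentence where only positions 2 and 3 have been translated, the
# remaining stretches are [(0, 2), (4, 6)].
def remaining_spans(translated_positions, sentence_length):
    positions = sorted(translated_positions) + [sentence_length]  # sentinel
    spans, start = [], 0
    for end in positions:
        if start < end:
            spans.append((start, end))
        start = end + 1
    return spans

assert remaining_spans([2, 3], 6) == [(0, 2), (4, 6)]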
- - :rtype: list(int) - """ - translated_positions = [] - current_hypothesis = self - while current_hypothesis.previous is not None: - translated_span = current_hypothesis.src_phrase_span - translated_positions.extend(range(translated_span[0], translated_span[1])) - current_hypothesis = current_hypothesis.previous - return translated_positions - - def total_translated_words(self): - return len(self.translated_positions()) - - def translation_so_far(self): - translation = [] - self.__build_translation(self, translation) - return translation - - def __build_translation(self, hypothesis, output): - if hypothesis.previous is None: - return - self.__build_translation(hypothesis.previous, output) - output.extend(hypothesis.trg_phrase) - - -class _Stack: - """ - Collection of _Hypothesis objects - """ - - def __init__(self, max_size=100, beam_threshold=0.0): - """ - :param beam_threshold: Hypotheses that score less than this - factor of the best hypothesis are discarded from the stack. - Value must be between 0.0 and 1.0. - :type beam_threshold: float - """ - self.max_size = max_size - self.items = [] - - if beam_threshold == 0.0: - self.__log_beam_threshold = float("-inf") - else: - self.__log_beam_threshold = log(beam_threshold) - - def push(self, hypothesis): - """ - Add ``hypothesis`` to the stack. - Removes lowest scoring hypothesis if the stack is full. - After insertion, hypotheses that score less than - ``beam_threshold`` times the score of the best hypothesis - are removed. - """ - self.items.append(hypothesis) - self.items.sort(key=lambda h: h.score(), reverse=True) - while len(self.items) > self.max_size: - self.items.pop() - self.threshold_prune() - - def threshold_prune(self): - if not self.items: - return - # log(score * beam_threshold) = log(score) + log(beam_threshold) - threshold = self.items[0].score() + self.__log_beam_threshold - for hypothesis in reversed(self.items): - if hypothesis.score() < threshold: - self.items.pop() - else: - break - - def best(self): - """ - :return: Hypothesis with the highest score in the stack - :rtype: _Hypothesis - """ - if self.items: - return self.items[0] - return None - - def __iter__(self): - return iter(self.items) - - def __contains__(self, hypothesis): - return hypothesis in self.items - - def __bool__(self): - return len(self.items) != 0 - - __nonzero__ = __bool__ diff --git a/pipeline/nltk/tree/__init__.py b/pipeline/nltk/tree/__init__.py deleted file mode 100644 index 39eb7195124c71050b5a7fb4dd030fdadf410967..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tree/__init__.py +++ /dev/null @@ -1,52 +0,0 @@ -# Natural Language Toolkit: Machine Translation -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird -# Peter Ljunglöf -# Tom Aarsen <> -# URL: -# For license information, see LICENSE.TXT - -""" -NLTK Tree Package - -This package may be used for representing hierarchical language -structures, such as syntax trees and morphological trees. 
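# A minimal usage sketch for the package described above (standard nltk.tree
# API, assuming the regular nltk distribution; the bracketed string is just
# an example parse):
from nltk.tree import Tree

t = Tree.fromstring("(S (NP Alice) (VP (V saw) (NP Bob)))")
t.label()     # 'S'
t.leaves()    # ['Alice', 'saw', 'Bob']
t[1].label()  # 'VP'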
-""" - -# TODO: add LabelledTree (can be used for dependency trees) - -from nltk.tree.immutable import ( - ImmutableMultiParentedTree, - ImmutableParentedTree, - ImmutableProbabilisticTree, - ImmutableTree, -) -from nltk.tree.parented import MultiParentedTree, ParentedTree -from nltk.tree.parsing import bracket_parse, sinica_parse -from nltk.tree.prettyprinter import TreePrettyPrinter -from nltk.tree.probabilistic import ProbabilisticTree -from nltk.tree.transforms import ( - chomsky_normal_form, - collapse_unary, - un_chomsky_normal_form, -) -from nltk.tree.tree import Tree - -__all__ = [ - "ImmutableMultiParentedTree", - "ImmutableParentedTree", - "ImmutableProbabilisticTree", - "ImmutableTree", - "MultiParentedTree", - "ParentedTree", - "bracket_parse", - "sinica_parse", - "TreePrettyPrinter", - "ProbabilisticTree", - "chomsky_normal_form", - "collapse_unary", - "un_chomsky_normal_form", - "Tree", -] diff --git a/pipeline/nltk/tree/__pycache__/__init__.cpython-39.pyc b/pipeline/nltk/tree/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index 47f554caf467aa029f8206819f1259e589954064..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tree/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tree/__pycache__/immutable.cpython-39.pyc b/pipeline/nltk/tree/__pycache__/immutable.cpython-39.pyc deleted file mode 100644 index cd84ca7f894a0637c22b3b8315b8c384d65a18fa..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tree/__pycache__/immutable.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tree/__pycache__/parented.cpython-39.pyc b/pipeline/nltk/tree/__pycache__/parented.cpython-39.pyc deleted file mode 100644 index 9271a6a2a11cf110d103dc472158fcec7e76d13e..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tree/__pycache__/parented.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tree/__pycache__/parsing.cpython-39.pyc b/pipeline/nltk/tree/__pycache__/parsing.cpython-39.pyc deleted file mode 100644 index 74f3bcca9695cc48232687642b57380c12a2bd6c..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tree/__pycache__/parsing.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tree/__pycache__/prettyprinter.cpython-39.pyc b/pipeline/nltk/tree/__pycache__/prettyprinter.cpython-39.pyc deleted file mode 100644 index 20a13f68b5bafd9ebdbd7bae713dbd48eaf72f71..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tree/__pycache__/prettyprinter.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tree/__pycache__/probabilistic.cpython-39.pyc b/pipeline/nltk/tree/__pycache__/probabilistic.cpython-39.pyc deleted file mode 100644 index fb90712674185854adc7175e21164940586e1664..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tree/__pycache__/probabilistic.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tree/__pycache__/transforms.cpython-39.pyc b/pipeline/nltk/tree/__pycache__/transforms.cpython-39.pyc deleted file mode 100644 index 5eceee92b76fd18faa35aaeb539afab47a907e98..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/tree/__pycache__/transforms.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tree/__pycache__/tree.cpython-39.pyc b/pipeline/nltk/tree/__pycache__/tree.cpython-39.pyc deleted file mode 100644 index c457e5c0529cb8cd9d2181c5afb7630ea4f520ff..0000000000000000000000000000000000000000 Binary files 
a/pipeline/nltk/tree/__pycache__/tree.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/tree/immutable.py b/pipeline/nltk/tree/immutable.py deleted file mode 100644 index 94e7ef9473a9c02b988db5318f3b282eb153e439..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tree/immutable.py +++ /dev/null @@ -1,124 +0,0 @@ -# Natural Language Toolkit: Text Trees -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird -# Peter Ljunglöf -# Tom Aarsen <> -# URL: -# For license information, see LICENSE.TXT - -from nltk.probability import ProbabilisticMixIn -from nltk.tree.parented import MultiParentedTree, ParentedTree -from nltk.tree.tree import Tree - - -class ImmutableTree(Tree): - def __init__(self, node, children=None): - super().__init__(node, children) - # Precompute our hash value. This ensures that we're really - # immutable. It also means we only have to calculate it once. - try: - self._hash = hash((self._label, tuple(self))) - except (TypeError, ValueError) as e: - raise ValueError( - "%s: node value and children " "must be immutable" % type(self).__name__ - ) from e - - def __setitem__(self, index, value): - raise ValueError("%s may not be modified" % type(self).__name__) - - def __setslice__(self, i, j, value): - raise ValueError("%s may not be modified" % type(self).__name__) - - def __delitem__(self, index): - raise ValueError("%s may not be modified" % type(self).__name__) - - def __delslice__(self, i, j): - raise ValueError("%s may not be modified" % type(self).__name__) - - def __iadd__(self, other): - raise ValueError("%s may not be modified" % type(self).__name__) - - def __imul__(self, other): - raise ValueError("%s may not be modified" % type(self).__name__) - - def append(self, v): - raise ValueError("%s may not be modified" % type(self).__name__) - - def extend(self, v): - raise ValueError("%s may not be modified" % type(self).__name__) - - def pop(self, v=None): - raise ValueError("%s may not be modified" % type(self).__name__) - - def remove(self, v): - raise ValueError("%s may not be modified" % type(self).__name__) - - def reverse(self): - raise ValueError("%s may not be modified" % type(self).__name__) - - def sort(self): - raise ValueError("%s may not be modified" % type(self).__name__) - - def __hash__(self): - return self._hash - - def set_label(self, value): - """ - Set the node label. This will only succeed the first time the - node label is set, which should occur in ImmutableTree.__init__(). 
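# Usage sketch for the class above (assumes the standard nltk distribution):
# the precomputed hash makes the tree usable as a dict key or set member,
# and any structural change raises ValueError, as defined in the methods above.
from nltk.tree import ImmutableTree, Tree

frozen = ImmutableTree.convert(Tree.fromstring("(NP (DT the) (NN cat))"))
hash(frozen)  # works, so the tree can sit in a set or serve as a dict key
try:
    frozen.append(Tree("JJ", ["black"]))
except ValueError:
    pass      # mutation attempts are rejected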
- """ - if hasattr(self, "_label"): - raise ValueError("%s may not be modified" % type(self).__name__) - self._label = value - - -class ImmutableProbabilisticTree(ImmutableTree, ProbabilisticMixIn): - def __init__(self, node, children=None, **prob_kwargs): - ImmutableTree.__init__(self, node, children) - ProbabilisticMixIn.__init__(self, **prob_kwargs) - self._hash = hash((self._label, tuple(self), self.prob())) - - # We have to patch up these methods to make them work right: - def _frozen_class(self): - return ImmutableProbabilisticTree - - def __repr__(self): - return f"{Tree.__repr__(self)} [{self.prob()}]" - - def __str__(self): - return f"{self.pformat(margin=60)} [{self.prob()}]" - - def copy(self, deep=False): - if not deep: - return type(self)(self._label, self, prob=self.prob()) - else: - return type(self).convert(self) - - @classmethod - def convert(cls, val): - if isinstance(val, Tree): - children = [cls.convert(child) for child in val] - if isinstance(val, ProbabilisticMixIn): - return cls(val._label, children, prob=val.prob()) - else: - return cls(val._label, children, prob=1.0) - else: - return val - - -class ImmutableParentedTree(ImmutableTree, ParentedTree): - pass - - -class ImmutableMultiParentedTree(ImmutableTree, MultiParentedTree): - pass - - -__all__ = [ - "ImmutableProbabilisticTree", - "ImmutableTree", - "ImmutableParentedTree", - "ImmutableMultiParentedTree", -] diff --git a/pipeline/nltk/tree/parented.py b/pipeline/nltk/tree/parented.py deleted file mode 100644 index c43788f1a79902d02c10b0f699cd2f6026b2d646..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tree/parented.py +++ /dev/null @@ -1,590 +0,0 @@ -# Natural Language Toolkit: Text Trees -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird -# Peter Ljunglöf -# Tom Aarsen <> -# URL: -# For license information, see LICENSE.TXT - -import warnings -from abc import ABCMeta, abstractmethod - -from nltk.tree.tree import Tree -from nltk.util import slice_bounds - - -###################################################################### -## Parented trees -###################################################################### -class AbstractParentedTree(Tree, metaclass=ABCMeta): - """ - An abstract base class for a ``Tree`` that automatically maintains - pointers to parent nodes. These parent pointers are updated - whenever any change is made to a tree's structure. Two subclasses - are currently defined: - - - ``ParentedTree`` is used for tree structures where each subtree - has at most one parent. This class should be used in cases - where there is no"sharing" of subtrees. - - - ``MultiParentedTree`` is used for tree structures where a - subtree may have zero or more parents. This class should be - used in cases where subtrees may be shared. - - Subclassing - =========== - The ``AbstractParentedTree`` class redefines all operations that - modify a tree's structure to call two methods, which are used by - subclasses to update parent information: - - - ``_setparent()`` is called whenever a new child is added. - - ``_delparent()`` is called whenever a child is removed. - """ - - def __init__(self, node, children=None): - super().__init__(node, children) - # If children is None, the tree is read from node, and - # all parents will be set during parsing. - if children is not None: - # Otherwise we have to set the parent of the children. - # Iterate over self, and *not* children, because children - # might be an iterator. 
- for i, child in enumerate(self): - if isinstance(child, Tree): - self._setparent(child, i, dry_run=True) - for i, child in enumerate(self): - if isinstance(child, Tree): - self._setparent(child, i) - - # //////////////////////////////////////////////////////////// - # Parent management - # //////////////////////////////////////////////////////////// - @abstractmethod - def _setparent(self, child, index, dry_run=False): - """ - Update the parent pointer of ``child`` to point to ``self``. This - method is only called if the type of ``child`` is ``Tree``; - i.e., it is not called when adding a leaf to a tree. This method - is always called before the child is actually added to the - child list of ``self``. - - :type child: Tree - :type index: int - :param index: The index of ``child`` in ``self``. - :raise TypeError: If ``child`` is a tree with an impropriate - type. Typically, if ``child`` is a tree, then its type needs - to match the type of ``self``. This prevents mixing of - different tree types (single-parented, multi-parented, and - non-parented). - :param dry_run: If true, the don't actually set the child's - parent pointer; just check for any error conditions, and - raise an exception if one is found. - """ - - @abstractmethod - def _delparent(self, child, index): - """ - Update the parent pointer of ``child`` to not point to self. This - method is only called if the type of ``child`` is ``Tree``; i.e., it - is not called when removing a leaf from a tree. This method - is always called before the child is actually removed from the - child list of ``self``. - - :type child: Tree - :type index: int - :param index: The index of ``child`` in ``self``. - """ - - # //////////////////////////////////////////////////////////// - # Methods that add/remove children - # //////////////////////////////////////////////////////////// - # Every method that adds or removes a child must make - # appropriate calls to _setparent() and _delparent(). - - def __delitem__(self, index): - # del ptree[start:stop] - if isinstance(index, slice): - start, stop, step = slice_bounds(self, index, allow_step=True) - # Clear all the children pointers. - for i in range(start, stop, step): - if isinstance(self[i], Tree): - self._delparent(self[i], i) - # Delete the children from our child list. - super().__delitem__(index) - - # del ptree[i] - elif isinstance(index, int): - if index < 0: - index += len(self) - if index < 0: - raise IndexError("index out of range") - # Clear the child's parent pointer. - if isinstance(self[index], Tree): - self._delparent(self[index], index) - # Remove the child from our child list. - super().__delitem__(index) - - elif isinstance(index, (list, tuple)): - # del ptree[()] - if len(index) == 0: - raise IndexError("The tree position () may not be deleted.") - # del ptree[(i,)] - elif len(index) == 1: - del self[index[0]] - # del ptree[i1, i2, i3] - else: - del self[index[0]][index[1:]] - - else: - raise TypeError( - "%s indices must be integers, not %s" - % (type(self).__name__, type(index).__name__) - ) - - def __setitem__(self, index, value): - # ptree[start:stop] = value - if isinstance(index, slice): - start, stop, step = slice_bounds(self, index, allow_step=True) - # make a copy of value, in case it's an iterator - if not isinstance(value, (list, tuple)): - value = list(value) - # Check for any error conditions, so we can avoid ending - # up in an inconsistent state if an error does occur. 
- for i, child in enumerate(value): - if isinstance(child, Tree): - self._setparent(child, start + i * step, dry_run=True) - # clear the child pointers of all parents we're removing - for i in range(start, stop, step): - if isinstance(self[i], Tree): - self._delparent(self[i], i) - # set the child pointers of the new children. We do this - # after clearing *all* child pointers, in case we're e.g. - # reversing the elements in a tree. - for i, child in enumerate(value): - if isinstance(child, Tree): - self._setparent(child, start + i * step) - # finally, update the content of the child list itself. - super().__setitem__(index, value) - - # ptree[i] = value - elif isinstance(index, int): - if index < 0: - index += len(self) - if index < 0: - raise IndexError("index out of range") - # if the value is not changing, do nothing. - if value is self[index]: - return - # Set the new child's parent pointer. - if isinstance(value, Tree): - self._setparent(value, index) - # Remove the old child's parent pointer - if isinstance(self[index], Tree): - self._delparent(self[index], index) - # Update our child list. - super().__setitem__(index, value) - - elif isinstance(index, (list, tuple)): - # ptree[()] = value - if len(index) == 0: - raise IndexError("The tree position () may not be assigned to.") - # ptree[(i,)] = value - elif len(index) == 1: - self[index[0]] = value - # ptree[i1, i2, i3] = value - else: - self[index[0]][index[1:]] = value - - else: - raise TypeError( - "%s indices must be integers, not %s" - % (type(self).__name__, type(index).__name__) - ) - - def append(self, child): - if isinstance(child, Tree): - self._setparent(child, len(self)) - super().append(child) - - def extend(self, children): - for child in children: - if isinstance(child, Tree): - self._setparent(child, len(self)) - super().append(child) - - def insert(self, index, child): - # Handle negative indexes. Note that if index < -len(self), - # we do *not* raise an IndexError, unlike __getitem__. This - # is done for consistency with list.__getitem__ and list.index. - if index < 0: - index += len(self) - if index < 0: - index = 0 - # Set the child's parent, and update our child list. - if isinstance(child, Tree): - self._setparent(child, index) - super().insert(index, child) - - def pop(self, index=-1): - if index < 0: - index += len(self) - if index < 0: - raise IndexError("index out of range") - if isinstance(self[index], Tree): - self._delparent(self[index], index) - return super().pop(index) - - # n.b.: like `list`, this is done by equality, not identity! - # To remove a specific child, use del ptree[i]. - def remove(self, child): - index = self.index(child) - if isinstance(self[index], Tree): - self._delparent(self[index], index) - super().remove(child) - - # We need to implement __getslice__ and friends, even though - # they're deprecated, because otherwise list.__getslice__ will get - # called (since we're subclassing from list). Just delegate to - # __getitem__ etc., but use max(0, start) and max(0, stop) because - # because negative indices are already handled *before* - # __getslice__ is called; and we don't want to double-count them. 
- if hasattr(list, "__getslice__"): - - def __getslice__(self, start, stop): - return self.__getitem__(slice(max(0, start), max(0, stop))) - - def __delslice__(self, start, stop): - return self.__delitem__(slice(max(0, start), max(0, stop))) - - def __setslice__(self, start, stop, value): - return self.__setitem__(slice(max(0, start), max(0, stop)), value) - - def __getnewargs__(self): - """Method used by the pickle module when un-pickling. - This method provides the arguments passed to ``__new__`` - upon un-pickling. Without this method, ParentedTree instances - cannot be pickled and unpickled in Python 3.7+ onwards. - - :return: Tuple of arguments for ``__new__``, i.e. the label - and the children of this node. - :rtype: Tuple[Any, List[AbstractParentedTree]] - """ - return (self._label, list(self)) - - -class ParentedTree(AbstractParentedTree): - """ - A ``Tree`` that automatically maintains parent pointers for - single-parented trees. The following are methods for querying - the structure of a parented tree: ``parent``, ``parent_index``, - ``left_sibling``, ``right_sibling``, ``root``, ``treeposition``. - - Each ``ParentedTree`` may have at most one parent. In - particular, subtrees may not be shared. Any attempt to reuse a - single ``ParentedTree`` as a child of more than one parent (or - as multiple children of the same parent) will cause a - ``ValueError`` exception to be raised. - - ``ParentedTrees`` should never be used in the same tree as ``Trees`` - or ``MultiParentedTrees``. Mixing tree implementations may result - in incorrect parent pointers and in ``TypeError`` exceptions. - """ - - def __init__(self, node, children=None): - self._parent = None - """The parent of this Tree, or None if it has no parent.""" - super().__init__(node, children) - if children is None: - # If children is None, the tree is read from node. - # After parsing, the parent of the immediate children - # will point to an intermediate tree, not self. - # We fix this by brute force: - for i, child in enumerate(self): - if isinstance(child, Tree): - child._parent = None - self._setparent(child, i) - - def _frozen_class(self): - from nltk.tree.immutable import ImmutableParentedTree - - return ImmutableParentedTree - - def copy(self, deep=False): - if not deep: - warnings.warn( - f"{self.__class__.__name__} objects do not support shallow copies. Defaulting to a deep copy." - ) - return super().copy(deep=True) - - # ///////////////////////////////////////////////////////////////// - # Methods - # ///////////////////////////////////////////////////////////////// - - def parent(self): - """The parent of this tree, or None if it has no parent.""" - return self._parent - - def parent_index(self): - """ - The index of this tree in its parent. I.e., - ``ptree.parent()[ptree.parent_index()] is ptree``. Note that - ``ptree.parent_index()`` is not necessarily equal to - ``ptree.parent.index(ptree)``, since the ``index()`` method - returns the first child that is equal to its argument. - """ - if self._parent is None: - return None - for i, child in enumerate(self._parent): - if child is self: - return i - assert False, "expected to find self in self._parent!" 
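For quick reference while reviewing this deletion, here is a minimal usage sketch of the single-parent API documented above (parent, parent_index, root, treeposition, and the ValueError on re-parenting). It is not part of the removed file; it assumes the upstream nltk package stays importable once this vendored copy is gone, and only mirrors behaviour the docstrings themselves describe.

    from nltk.tree import ParentedTree

    ptree = ParentedTree.fromstring("(S (NP I) (VP (V saw) (NP him)))")
    vp = ptree[1]
    print(vp.parent().label())   # S
    print(vp.parent_index())     # 1
    print(vp[1].treeposition())  # (1, 1) -- the inner NP
    print(vp.root() is ptree)    # True

    # A subtree that already has a parent cannot be attached elsewhere
    # (ValueError); detach it first, which clears its parent pointer.
    del ptree[1]
    print(vp.parent())           # None
    ptree.append(vp)
    print(vp.parent_index())     # 1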
- - def left_sibling(self): - """The left sibling of this tree, or None if it has none.""" - parent_index = self.parent_index() - if self._parent and parent_index > 0: - return self._parent[parent_index - 1] - return None # no left sibling - - def right_sibling(self): - """The right sibling of this tree, or None if it has none.""" - parent_index = self.parent_index() - if self._parent and parent_index < (len(self._parent) - 1): - return self._parent[parent_index + 1] - return None # no right sibling - - def root(self): - """ - The root of this tree. I.e., the unique ancestor of this tree - whose parent is None. If ``ptree.parent()`` is None, then - ``ptree`` is its own root. - """ - root = self - while root.parent() is not None: - root = root.parent() - return root - - def treeposition(self): - """ - The tree position of this tree, relative to the root of the - tree. I.e., ``ptree.root[ptree.treeposition] is ptree``. - """ - if self.parent() is None: - return () - else: - return self.parent().treeposition() + (self.parent_index(),) - - # ///////////////////////////////////////////////////////////////// - # Parent Management - # ///////////////////////////////////////////////////////////////// - - def _delparent(self, child, index): - # Sanity checks - assert isinstance(child, ParentedTree) - assert self[index] is child - assert child._parent is self - - # Delete child's parent pointer. - child._parent = None - - def _setparent(self, child, index, dry_run=False): - # If the child's type is incorrect, then complain. - if not isinstance(child, ParentedTree): - raise TypeError("Can not insert a non-ParentedTree into a ParentedTree") - - # If child already has a parent, then complain. - if hasattr(child, "_parent") and child._parent is not None: - raise ValueError("Can not insert a subtree that already has a parent.") - - # Set child's parent pointer & index. - if not dry_run: - child._parent = self - - -class MultiParentedTree(AbstractParentedTree): - """ - A ``Tree`` that automatically maintains parent pointers for - multi-parented trees. The following are methods for querying the - structure of a multi-parented tree: ``parents()``, ``parent_indices()``, - ``left_siblings()``, ``right_siblings()``, ``roots``, ``treepositions``. - - Each ``MultiParentedTree`` may have zero or more parents. In - particular, subtrees may be shared. If a single - ``MultiParentedTree`` is used as multiple children of the same - parent, then that parent will appear multiple times in its - ``parents()`` method. - - ``MultiParentedTrees`` should never be used in the same tree as - ``Trees`` or ``ParentedTrees``. Mixing tree implementations may - result in incorrect parent pointers and in ``TypeError`` exceptions. - """ - - def __init__(self, node, children=None): - self._parents = [] - """A list of this tree's parents. This list should not - contain duplicates, even if a parent contains this tree - multiple times.""" - super().__init__(node, children) - if children is None: - # If children is None, the tree is read from node. - # After parsing, the parent(s) of the immediate children - # will point to an intermediate tree, not self. 
- # We fix this by brute force: - for i, child in enumerate(self): - if isinstance(child, Tree): - child._parents = [] - self._setparent(child, i) - - def _frozen_class(self): - from nltk.tree.immutable import ImmutableMultiParentedTree - - return ImmutableMultiParentedTree - - # ///////////////////////////////////////////////////////////////// - # Methods - # ///////////////////////////////////////////////////////////////// - - def parents(self): - """ - The set of parents of this tree. If this tree has no parents, - then ``parents`` is the empty set. To check if a tree is used - as multiple children of the same parent, use the - ``parent_indices()`` method. - - :type: list(MultiParentedTree) - """ - return list(self._parents) - - def left_siblings(self): - """ - A list of all left siblings of this tree, in any of its parent - trees. A tree may be its own left sibling if it is used as - multiple contiguous children of the same parent. A tree may - appear multiple times in this list if it is the left sibling - of this tree with respect to multiple parents. - - :type: list(MultiParentedTree) - """ - return [ - parent[index - 1] - for (parent, index) in self._get_parent_indices() - if index > 0 - ] - - def right_siblings(self): - """ - A list of all right siblings of this tree, in any of its parent - trees. A tree may be its own right sibling if it is used as - multiple contiguous children of the same parent. A tree may - appear multiple times in this list if it is the right sibling - of this tree with respect to multiple parents. - - :type: list(MultiParentedTree) - """ - return [ - parent[index + 1] - for (parent, index) in self._get_parent_indices() - if index < (len(parent) - 1) - ] - - def _get_parent_indices(self): - return [ - (parent, index) - for parent in self._parents - for index, child in enumerate(parent) - if child is self - ] - - def roots(self): - """ - The set of all roots of this tree. This set is formed by - tracing all possible parent paths until trees with no parents - are found. - - :type: list(MultiParentedTree) - """ - return list(self._get_roots_helper({}).values()) - - def _get_roots_helper(self, result): - if self._parents: - for parent in self._parents: - parent._get_roots_helper(result) - else: - result[id(self)] = self - return result - - def parent_indices(self, parent): - """ - Return a list of the indices where this tree occurs as a child - of ``parent``. If this child does not occur as a child of - ``parent``, then the empty list is returned. The following is - always true:: - - for parent_index in ptree.parent_indices(parent): - parent[parent_index] is ptree - """ - if parent not in self._parents: - return [] - else: - return [index for (index, child) in enumerate(parent) if child is self] - - def treepositions(self, root): - """ - Return a list of all tree positions that can be used to reach - this multi-parented tree starting from ``root``. 
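The multi-parented variant allows genuine sharing of subtrees; a small sketch of parents(), roots() and treepositions() as documented above (again written against the upstream nltk package rather than this removed copy):

    from nltk.tree import MultiParentedTree

    np = MultiParentedTree("NP", ["John"])
    s1 = MultiParentedTree("S", [np, MultiParentedTree("VP", ["sleeps"])])
    s2 = MultiParentedTree("S", [np, MultiParentedTree("VP", ["runs"])])

    print(len(np.parents()))     # 2 -- np hangs from both s1 and s2
    print(len(np.roots()))       # 2 -- both root trees are reachable
    print(np.treepositions(s2))  # [(0,)]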
I.e., the - following is always true:: - - for treepos in ptree.treepositions(root): - root[treepos] is ptree - """ - if self is root: - return [()] - else: - return [ - treepos + (index,) - for parent in self._parents - for treepos in parent.treepositions(root) - for (index, child) in enumerate(parent) - if child is self - ] - - # ///////////////////////////////////////////////////////////////// - # Parent Management - # ///////////////////////////////////////////////////////////////// - - def _delparent(self, child, index): - # Sanity checks - assert isinstance(child, MultiParentedTree) - assert self[index] is child - assert len([p for p in child._parents if p is self]) == 1 - - # If the only copy of child in self is at index, then delete - # self from child's parent list. - for i, c in enumerate(self): - if c is child and i != index: - break - else: - child._parents.remove(self) - - def _setparent(self, child, index, dry_run=False): - # If the child's type is incorrect, then complain. - if not isinstance(child, MultiParentedTree): - raise TypeError( - "Can not insert a non-MultiParentedTree into a MultiParentedTree" - ) - - # Add self as a parent pointer if it's not already listed. - if not dry_run: - for parent in child._parents: - if parent is self: - break - else: - child._parents.append(self) - - -__all__ = [ - "ParentedTree", - "MultiParentedTree", -] diff --git a/pipeline/nltk/tree/parsing.py b/pipeline/nltk/tree/parsing.py deleted file mode 100644 index c2df4e166832b940eb5e961a9e7bf685d95b5e63..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tree/parsing.py +++ /dev/null @@ -1,66 +0,0 @@ -# Natural Language Toolkit: Text Trees -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird -# Peter Ljunglöf -# Tom Aarsen <> -# URL: -# For license information, see LICENSE.TXT - -import re - -from nltk.tree.tree import Tree - -###################################################################### -## Parsing -###################################################################### - - -def bracket_parse(s): - """ - Use Tree.read(s, remove_empty_top_bracketing=True) instead. - """ - raise NameError("Use Tree.read(s, remove_empty_top_bracketing=True) instead.") - - -def sinica_parse(s): - """ - Parse a Sinica Treebank string and return a tree. Trees are represented as nested brackettings, - as shown in the following example (X represents a Chinese character): - S(goal:NP(Head:Nep:XX)|theme:NP(Head:Nhaa:X)|quantity:Dab:X|Head:VL2:X)#0(PERIODCATEGORY) - - :return: A tree corresponding to the string representation. 
- :rtype: Tree - :param s: The string to be converted - :type s: str - """ - tokens = re.split(r"([()| ])", s) - for i in range(len(tokens)): - if tokens[i] == "(": - tokens[i - 1], tokens[i] = ( - tokens[i], - tokens[i - 1], - ) # pull nonterminal inside parens - elif ":" in tokens[i]: - fields = tokens[i].split(":") - if len(fields) == 2: # non-terminal - tokens[i] = fields[1] - else: - tokens[i] = "(" + fields[-2] + " " + fields[-1] + ")" - elif tokens[i] == "|": - tokens[i] = "" - - treebank_string = " ".join(tokens) - return Tree.fromstring(treebank_string, remove_empty_top_bracketing=True) - - -# s = re.sub(r'^#[^\s]*\s', '', s) # remove leading identifier -# s = re.sub(r'\w+:', '', s) # remove role tags - -# return s - -__all__ = [ - "bracket_parse", - "sinica_parse", -] diff --git a/pipeline/nltk/tree/prettyprinter.py b/pipeline/nltk/tree/prettyprinter.py deleted file mode 100644 index 33e1a93208b17a48e3ff4bbcbbb1d4017e89f198..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tree/prettyprinter.py +++ /dev/null @@ -1,627 +0,0 @@ -# Natural Language Toolkit: ASCII visualization of NLTK trees -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Andreas van Cranenburgh -# Peter Ljunglöf -# URL: -# For license information, see LICENSE.TXT - -""" -Pretty-printing of discontinuous trees. -Adapted from the disco-dop project, by Andreas van Cranenburgh. -https://github.com/andreasvc/disco-dop - -Interesting reference (not used for this code): -T. Eschbach et al., Orth. Hypergraph Drawing, Journal of -Graph Algorithms and Applications, 10(2) 141--157 (2006)149. -https://jgaa.info/accepted/2006/EschbachGuentherBecker2006.10.2.pdf -""" - -import re - -try: - from html import escape -except ImportError: - from cgi import escape - -from collections import defaultdict -from operator import itemgetter - -from nltk.tree.tree import Tree -from nltk.util import OrderedDict - -ANSICOLOR = { - "black": 30, - "red": 31, - "green": 32, - "yellow": 33, - "blue": 34, - "magenta": 35, - "cyan": 36, - "white": 37, -} - - -class TreePrettyPrinter: - """ - Pretty-print a tree in text format, either as ASCII or Unicode. - The tree can be a normal tree, or discontinuous. - - ``TreePrettyPrinter(tree, sentence=None, highlight=())`` - creates an object from which different visualizations can be created. - - :param tree: a Tree object. - :param sentence: a list of words (strings). If `sentence` is given, - `tree` must contain integers as leaves, which are taken as indices - in `sentence`. Using this you can display a discontinuous tree. - :param highlight: Optionally, a sequence of Tree objects in `tree` which - should be highlighted. Has the effect of only applying colors to nodes - in this sequence (nodes should be given as Tree objects, terminals as - indices). - - >>> from nltk.tree import Tree - >>> tree = Tree.fromstring('(S (NP Mary) (VP walks))') - >>> print(TreePrettyPrinter(tree).text()) - ... # doctest: +NORMALIZE_WHITESPACE - S - ____|____ - NP VP - | | - Mary walks - """ - - def __init__(self, tree, sentence=None, highlight=()): - if sentence is None: - leaves = tree.leaves() - if ( - leaves - and all(len(a) > 0 for a in tree.subtrees()) - and all(isinstance(a, int) for a in leaves) - ): - sentence = [str(a) for a in leaves] - else: - # this deals with empty nodes (frontier non-terminals) - # and multiple/mixed terminals under non-terminals. 
- tree = tree.copy(True) - sentence = [] - for a in tree.subtrees(): - if len(a) == 0: - a.append(len(sentence)) - sentence.append(None) - elif any(not isinstance(b, Tree) for b in a): - for n, b in enumerate(a): - if not isinstance(b, Tree): - a[n] = len(sentence) - if type(b) == tuple: - b = "/".join(b) - sentence.append("%s" % b) - self.nodes, self.coords, self.edges, self.highlight = self.nodecoords( - tree, sentence, highlight - ) - - def __str__(self): - return self.text() - - def __repr__(self): - return "" % len(self.nodes) - - @staticmethod - def nodecoords(tree, sentence, highlight): - """ - Produce coordinates of nodes on a grid. - - Objective: - - - Produce coordinates for a non-overlapping placement of nodes and - horizontal lines. - - Order edges so that crossing edges cross a minimal number of previous - horizontal lines (never vertical lines). - - Approach: - - - bottom up level order traversal (start at terminals) - - at each level, identify nodes which cannot be on the same row - - identify nodes which cannot be in the same column - - place nodes into a grid at (row, column) - - order child-parent edges with crossing edges last - - Coordinates are (row, column); the origin (0, 0) is at the top left; - the root node is on row 0. Coordinates do not consider the size of a - node (which depends on font, &c), so the width of a column of the grid - should be automatically determined by the element with the greatest - width in that column. Alternatively, the integer coordinates could be - converted to coordinates in which the distances between adjacent nodes - are non-uniform. - - Produces tuple (nodes, coords, edges, highlighted) where: - - - nodes[id]: Tree object for the node with this integer id - - coords[id]: (n, m) coordinate where to draw node with id in the grid - - edges[id]: parent id of node with this id (ordered dictionary) - - highlighted: set of ids that should be highlighted - """ - - def findcell(m, matrix, startoflevel, children): - """ - Find vacant row, column index for node ``m``. - Iterate over current rows for this level (try lowest first) - and look for cell between first and last child of this node, - add new row to level if no free row available. 
- """ - candidates = [a for _, a in children[m]] - minidx, maxidx = min(candidates), max(candidates) - leaves = tree[m].leaves() - center = scale * sum(leaves) // len(leaves) # center of gravity - if minidx < maxidx and not minidx < center < maxidx: - center = sum(candidates) // len(candidates) - if max(candidates) - min(candidates) > 2 * scale: - center -= center % scale # round to unscaled coordinate - if minidx < maxidx and not minidx < center < maxidx: - center += scale - if ids[m] == 0: - startoflevel = len(matrix) - for rowidx in range(startoflevel, len(matrix) + 1): - if rowidx == len(matrix): # need to add a new row - matrix.append( - [ - vertline if a not in (corner, None) else None - for a in matrix[-1] - ] - ) - row = matrix[rowidx] - if len(children[m]) == 1: # place unaries directly above child - return rowidx, next(iter(children[m]))[1] - elif all( - a is None or a == vertline - for a in row[min(candidates) : max(candidates) + 1] - ): - # find free column - for n in range(scale): - i = j = center + n - while j > minidx or i < maxidx: - if i < maxidx and ( - matrix[rowidx][i] is None or i in candidates - ): - return rowidx, i - elif j > minidx and ( - matrix[rowidx][j] is None or j in candidates - ): - return rowidx, j - i += scale - j -= scale - raise ValueError( - "could not find a free cell for:\n%s\n%s" - "min=%d; max=%d" % (tree[m], minidx, maxidx, dumpmatrix()) - ) - - def dumpmatrix(): - """Dump matrix contents for debugging purposes.""" - return "\n".join( - "%2d: %s" % (n, " ".join(("%2r" % i)[:2] for i in row)) - for n, row in enumerate(matrix) - ) - - leaves = tree.leaves() - if not all(isinstance(n, int) for n in leaves): - raise ValueError("All leaves must be integer indices.") - if len(leaves) != len(set(leaves)): - raise ValueError("Indices must occur at most once.") - if not all(0 <= n < len(sentence) for n in leaves): - raise ValueError( - "All leaves must be in the interval 0..n " - "with n=len(sentence)\ntokens: %d indices: " - "%r\nsentence: %s" % (len(sentence), tree.leaves(), sentence) - ) - vertline, corner = -1, -2 # constants - tree = tree.copy(True) - for a in tree.subtrees(): - a.sort(key=lambda n: min(n.leaves()) if isinstance(n, Tree) else n) - scale = 2 - crossed = set() - # internal nodes and lexical nodes (no frontiers) - positions = tree.treepositions() - maxdepth = max(map(len, positions)) + 1 - childcols = defaultdict(set) - matrix = [[None] * (len(sentence) * scale)] - nodes = {} - ids = {a: n for n, a in enumerate(positions)} - highlighted_nodes = { - n for a, n in ids.items() if not highlight or tree[a] in highlight - } - levels = {n: [] for n in range(maxdepth - 1)} - terminals = [] - for a in positions: - node = tree[a] - if isinstance(node, Tree): - levels[maxdepth - node.height()].append(a) - else: - terminals.append(a) - - for n in levels: - levels[n].sort(key=lambda n: max(tree[n].leaves()) - min(tree[n].leaves())) - terminals.sort() - positions = set(positions) - - for m in terminals: - i = int(tree[m]) * scale - assert matrix[0][i] is None, (matrix[0][i], m, i) - matrix[0][i] = ids[m] - nodes[ids[m]] = sentence[tree[m]] - if nodes[ids[m]] is None: - nodes[ids[m]] = "..." - highlighted_nodes.discard(ids[m]) - positions.remove(m) - childcols[m[:-1]].add((0, i)) - - # add other nodes centered on their children, - # if the center is already taken, back off - # to the left and right alternately, until an empty cell is found. 
- for n in sorted(levels, reverse=True): - nodesatdepth = levels[n] - startoflevel = len(matrix) - matrix.append( - [vertline if a not in (corner, None) else None for a in matrix[-1]] - ) - for m in nodesatdepth: # [::-1]: - if n < maxdepth - 1 and childcols[m]: - _, pivot = min(childcols[m], key=itemgetter(1)) - if { - a[:-1] - for row in matrix[:-1] - for a in row[:pivot] - if isinstance(a, tuple) - } & { - a[:-1] - for row in matrix[:-1] - for a in row[pivot:] - if isinstance(a, tuple) - }: - crossed.add(m) - - rowidx, i = findcell(m, matrix, startoflevel, childcols) - positions.remove(m) - - # block positions where children of this node branch out - for _, x in childcols[m]: - matrix[rowidx][x] = corner - # assert m == () or matrix[rowidx][i] in (None, corner), ( - # matrix[rowidx][i], m, str(tree), ' '.join(sentence)) - # node itself - matrix[rowidx][i] = ids[m] - nodes[ids[m]] = tree[m] - # add column to the set of children for its parent - if len(m) > 0: - childcols[m[:-1]].add((rowidx, i)) - assert len(positions) == 0 - - # remove unused columns, right to left - for m in range(scale * len(sentence) - 1, -1, -1): - if not any(isinstance(row[m], (Tree, int)) for row in matrix): - for row in matrix: - del row[m] - - # remove unused rows, reverse - matrix = [ - row - for row in reversed(matrix) - if not all(a is None or a == vertline for a in row) - ] - - # collect coordinates of nodes - coords = {} - for n, _ in enumerate(matrix): - for m, i in enumerate(matrix[n]): - if isinstance(i, int) and i >= 0: - coords[i] = n, m - - # move crossed edges last - positions = sorted( - (a for level in levels.values() for a in level), - key=lambda a: a[:-1] in crossed, - ) - - # collect edges from node to node - edges = OrderedDict() - for i in reversed(positions): - for j, _ in enumerate(tree[i]): - edges[ids[i + (j,)]] = ids[i] - - return nodes, coords, edges, highlighted_nodes - - def text( - self, - nodedist=1, - unicodelines=False, - html=False, - ansi=False, - nodecolor="blue", - leafcolor="red", - funccolor="green", - abbreviate=None, - maxwidth=16, - ): - """ - :return: ASCII art for a discontinuous tree. - - :param unicodelines: whether to use Unicode line drawing characters - instead of plain (7-bit) ASCII. - :param html: whether to wrap output in html code (default plain text). - :param ansi: whether to produce colors with ANSI escape sequences - (only effective when html==False). - :param leafcolor, nodecolor: specify colors of leaves and phrasal - nodes; effective when either html or ansi is True. - :param abbreviate: if True, abbreviate labels longer than 5 characters. - If integer, abbreviate labels longer than `abbr` characters. - :param maxwidth: maximum number of characters before a label starts to - wrap; pass None to disable. - """ - if abbreviate == True: - abbreviate = 5 - if unicodelines: - horzline = "\u2500" - leftcorner = "\u250c" - rightcorner = "\u2510" - vertline = " \u2502 " - tee = horzline + "\u252C" + horzline - bottom = horzline + "\u2534" + horzline - cross = horzline + "\u253c" + horzline - ellipsis = "\u2026" - else: - horzline = "_" - leftcorner = rightcorner = " " - vertline = " | " - tee = 3 * horzline - cross = bottom = "_|_" - ellipsis = "." 
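As a usage sketch of the rendering options listed above (the Unicode/ASCII switch, nodedist, maxwidth), based on the class doctest; the import path assumes the upstream nltk package provides the same nltk.tree.prettyprinter module as the copy removed here:

    from nltk.tree import Tree
    from nltk.tree.prettyprinter import TreePrettyPrinter

    tree = Tree.fromstring("(S (NP Mary) (VP walks))")
    tpp = TreePrettyPrinter(tree)
    print(tpp.text())                        # plain ASCII branches, as in the doctest
    print(tpp.text(unicodelines=True))       # box-drawing characters instead of _ and |
    print(tpp.text(nodedist=2, maxwidth=8))  # wider spacing, wrap long labels
    svg_markup = tpp.svg()                   # the same tree as an SVG string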
- - def crosscell(cur, x=vertline): - """Overwrite center of this cell with a vertical branch.""" - splitl = len(cur) - len(cur) // 2 - len(x) // 2 - 1 - lst = list(cur) - lst[splitl : splitl + len(x)] = list(x) - return "".join(lst) - - result = [] - matrix = defaultdict(dict) - maxnodewith = defaultdict(lambda: 3) - maxnodeheight = defaultdict(lambda: 1) - maxcol = 0 - minchildcol = {} - maxchildcol = {} - childcols = defaultdict(set) - labels = {} - wrapre = re.compile( - "(.{%d,%d}\\b\\W*|.{%d})" % (maxwidth - 4, maxwidth, maxwidth) - ) - # collect labels and coordinates - for a in self.nodes: - row, column = self.coords[a] - matrix[row][column] = a - maxcol = max(maxcol, column) - label = ( - self.nodes[a].label() - if isinstance(self.nodes[a], Tree) - else self.nodes[a] - ) - if abbreviate and len(label) > abbreviate: - label = label[:abbreviate] + ellipsis - if maxwidth and len(label) > maxwidth: - label = wrapre.sub(r"\1\n", label).strip() - label = label.split("\n") - maxnodeheight[row] = max(maxnodeheight[row], len(label)) - maxnodewith[column] = max(maxnodewith[column], max(map(len, label))) - labels[a] = label - if a not in self.edges: - continue # e.g., root - parent = self.edges[a] - childcols[parent].add((row, column)) - minchildcol[parent] = min(minchildcol.get(parent, column), column) - maxchildcol[parent] = max(maxchildcol.get(parent, column), column) - # bottom up level order traversal - for row in sorted(matrix, reverse=True): - noderows = [ - ["".center(maxnodewith[col]) for col in range(maxcol + 1)] - for _ in range(maxnodeheight[row]) - ] - branchrow = ["".center(maxnodewith[col]) for col in range(maxcol + 1)] - for col in matrix[row]: - n = matrix[row][col] - node = self.nodes[n] - text = labels[n] - if isinstance(node, Tree): - # draw horizontal branch towards children for this node - if n in minchildcol and minchildcol[n] < maxchildcol[n]: - i, j = minchildcol[n], maxchildcol[n] - a, b = (maxnodewith[i] + 1) // 2 - 1, maxnodewith[j] // 2 - branchrow[i] = ((" " * a) + leftcorner).ljust( - maxnodewith[i], horzline - ) - branchrow[j] = (rightcorner + (" " * b)).rjust( - maxnodewith[j], horzline - ) - for i in range(minchildcol[n] + 1, maxchildcol[n]): - if i == col and any(a == i for _, a in childcols[n]): - line = cross - elif i == col: - line = bottom - elif any(a == i for _, a in childcols[n]): - line = tee - else: - line = horzline - branchrow[i] = line.center(maxnodewith[i], horzline) - else: # if n and n in minchildcol: - branchrow[col] = crosscell(branchrow[col]) - text = [a.center(maxnodewith[col]) for a in text] - color = nodecolor if isinstance(node, Tree) else leafcolor - if isinstance(node, Tree) and node.label().startswith("-"): - color = funccolor - if html: - text = [escape(a, quote=False) for a in text] - if n in self.highlight: - text = [f"{a}" for a in text] - elif ansi and n in self.highlight: - text = ["\x1b[%d;1m%s\x1b[0m" % (ANSICOLOR[color], a) for a in text] - for x in range(maxnodeheight[row]): - # draw vertical lines in partially filled multiline node - # labels, but only if it's not a frontier node. - noderows[x][col] = ( - text[x] - if x < len(text) - else (vertline if childcols[n] else " ").center( - maxnodewith[col], " " - ) - ) - # for each column, if there is a node below us which has a parent - # above us, draw a vertical branch in that column. 
- if row != max(matrix): - for n, (childrow, col) in self.coords.items(): - if n > 0 and self.coords[self.edges[n]][0] < row < childrow: - branchrow[col] = crosscell(branchrow[col]) - if col not in matrix[row]: - for noderow in noderows: - noderow[col] = crosscell(noderow[col]) - branchrow = [ - a + ((a[-1] if a[-1] != " " else b[0]) * nodedist) - for a, b in zip(branchrow, branchrow[1:] + [" "]) - ] - result.append("".join(branchrow)) - result.extend( - (" " * nodedist).join(noderow) for noderow in reversed(noderows) - ) - return "\n".join(reversed(result)) + "\n" - - def svg(self, nodecolor="blue", leafcolor="red", funccolor="green"): - """ - :return: SVG representation of a tree. - """ - fontsize = 12 - hscale = 40 - vscale = 25 - hstart = vstart = 20 - width = max(col for _, col in self.coords.values()) - height = max(row for row, _ in self.coords.values()) - result = [ - '' - % ( - width * 3, - height * 2.5, - -hstart, - -vstart, - width * hscale + 3 * hstart, - height * vscale + 3 * vstart, - ) - ] - - children = defaultdict(set) - for n in self.nodes: - if n: - children[self.edges[n]].add(n) - - # horizontal branches from nodes to children - for node in self.nodes: - if not children[node]: - continue - y, x = self.coords[node] - x *= hscale - y *= vscale - x += hstart - y += vstart + fontsize // 2 - childx = [self.coords[c][1] for c in children[node]] - xmin = hstart + hscale * min(childx) - xmax = hstart + hscale * max(childx) - result.append( - '\t' % (xmin, y, xmax, y) - ) - result.append( - '\t' % (x, y, x, y - fontsize // 3) - ) - - # vertical branches from children to parents - for child, parent in self.edges.items(): - y, _ = self.coords[parent] - y *= vscale - y += vstart + fontsize // 2 - childy, childx = self.coords[child] - childx *= hscale - childy *= vscale - childx += hstart - childy += vstart - fontsize - result += [ - '\t' % (childx, childy, childx, y + 5), - '\t' % (childx, childy, childx, y), - ] - - # write nodes with coordinates - for n, (row, column) in self.coords.items(): - node = self.nodes[n] - x = column * hscale + hstart - y = row * vscale + vstart - if n in self.highlight: - color = nodecolor if isinstance(node, Tree) else leafcolor - if isinstance(node, Tree) and node.label().startswith("-"): - color = funccolor - else: - color = "black" - result += [ - '\t%s' - % ( - color, - fontsize, - x, - y, - escape( - node.label() if isinstance(node, Tree) else node, quote=False - ), - ) - ] - - result += [""] - return "\n".join(result) - - -def test(): - """Do some tree drawing tests.""" - - def print_tree(n, tree, sentence=None, ansi=True, **xargs): - print() - print('{}: "{}"'.format(n, " ".join(sentence or tree.leaves()))) - print(tree) - print() - drawtree = TreePrettyPrinter(tree, sentence) - try: - print(drawtree.text(unicodelines=ansi, ansi=ansi, **xargs)) - except (UnicodeDecodeError, UnicodeEncodeError): - print(drawtree.text(unicodelines=False, ansi=False, **xargs)) - - from nltk.corpus import treebank - - for n in [0, 1440, 1591, 2771, 2170]: - tree = treebank.parsed_sents()[n] - print_tree(n, tree, nodedist=2, maxwidth=8) - print() - print("ASCII version:") - print(TreePrettyPrinter(tree).text(nodedist=2)) - - tree = Tree.fromstring( - "(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) (inf (verb 6) " - "(conj (inf (pp (prep 2) (np (det 3) (noun 4))) (verb 7)) (inf (verb 9)) " - "(vg 10) (inf (verb 11)))))) (punct 12))", - read_leaf=int, - ) - sentence = ( - "Ze had met haar moeder kunnen gaan winkelen ," - " zwemmen of terrassen .".split() - ) - 
print_tree("Discontinuous tree", tree, sentence, nodedist=2) - - -__all__ = ["TreePrettyPrinter"] - -if __name__ == "__main__": - test() diff --git a/pipeline/nltk/tree/probabilistic.py b/pipeline/nltk/tree/probabilistic.py deleted file mode 100644 index 79a4c798ad5f73b7c515e20456a7149232958f17..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tree/probabilistic.py +++ /dev/null @@ -1,74 +0,0 @@ -# Natural Language Toolkit: Text Trees -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird -# Peter Ljunglöf -# Tom Aarsen <> -# URL: -# For license information, see LICENSE.TXT - - -from nltk.internals import raise_unorderable_types -from nltk.probability import ProbabilisticMixIn -from nltk.tree.immutable import ImmutableProbabilisticTree -from nltk.tree.tree import Tree - -###################################################################### -## Probabilistic trees -###################################################################### - - -class ProbabilisticTree(Tree, ProbabilisticMixIn): - def __init__(self, node, children=None, **prob_kwargs): - Tree.__init__(self, node, children) - ProbabilisticMixIn.__init__(self, **prob_kwargs) - - # We have to patch up these methods to make them work right: - def _frozen_class(self): - return ImmutableProbabilisticTree - - def __repr__(self): - return f"{Tree.__repr__(self)} (p={self.prob()!r})" - - def __str__(self): - return f"{self.pformat(margin=60)} (p={self.prob():.6g})" - - def copy(self, deep=False): - if not deep: - return type(self)(self._label, self, prob=self.prob()) - else: - return type(self).convert(self) - - @classmethod - def convert(cls, val): - if isinstance(val, Tree): - children = [cls.convert(child) for child in val] - if isinstance(val, ProbabilisticMixIn): - return cls(val._label, children, prob=val.prob()) - else: - return cls(val._label, children, prob=1.0) - else: - return val - - def __eq__(self, other): - return self.__class__ is other.__class__ and ( - self._label, - list(self), - self.prob(), - ) == (other._label, list(other), other.prob()) - - def __lt__(self, other): - if not isinstance(other, Tree): - raise_unorderable_types("<", self, other) - if self.__class__ is other.__class__: - return (self._label, list(self), self.prob()) < ( - other._label, - list(other), - other.prob(), - ) - else: - return self.__class__.__name__ < other.__class__.__name__ - - -__all__ = ["ProbabilisticTree"] diff --git a/pipeline/nltk/tree/transforms.py b/pipeline/nltk/tree/transforms.py deleted file mode 100644 index 99cd6893ce9f168ffa024f2bb8c39177617dced2..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tree/transforms.py +++ /dev/null @@ -1,338 +0,0 @@ -# Natural Language Toolkit: Tree Transformations -# -# Copyright (C) 2005-2007 Oregon Graduate Institute -# Author: Nathan Bodenstab -# URL: -# For license information, see LICENSE.TXT - -r""" -A collection of methods for tree (grammar) transformations used -in parsing natural language. - -Although many of these methods are technically grammar transformations -(ie. Chomsky Norm Form), when working with treebanks it is much more -natural to visualize these modifications in a tree structure. Hence, -we will do all transformation directly to the tree itself. -Transforming the tree directly also allows us to do parent annotation. -A grammar can then be simply induced from the modified tree. - -The following is a short tutorial on the available transformations. - - 1. 
Chomsky Normal Form (binarization) - - It is well known that any grammar has a Chomsky Normal Form (CNF) - equivalent grammar where CNF is defined by every production having - either two non-terminals or one terminal on its right hand side. - When we have hierarchically structured data (ie. a treebank), it is - natural to view this in terms of productions where the root of every - subtree is the head (left hand side) of the production and all of - its children are the right hand side constituents. In order to - convert a tree into CNF, we simply need to ensure that every subtree - has either two subtrees as children (binarization), or one leaf node - (non-terminal). In order to binarize a subtree with more than two - children, we must introduce artificial nodes. - - There are two popular methods to convert a tree into CNF: left - factoring and right factoring. The following example demonstrates - the difference between them. Example:: - - Original Right-Factored Left-Factored - - A A A - / | \ / \ / \ - B C D ==> B A| OR A| D - / \ / \ - C D B C - - 2. Parent Annotation - - In addition to binarizing the tree, there are two standard - modifications to node labels we can do in the same traversal: parent - annotation and Markov order-N smoothing (or sibling smoothing). - - The purpose of parent annotation is to refine the probabilities of - productions by adding a small amount of context. With this simple - addition, a CYK (inside-outside, dynamic programming chart parse) - can improve from 74% to 79% accuracy. A natural generalization from - parent annotation is to grandparent annotation and beyond. The - tradeoff becomes accuracy gain vs. computational complexity. We - must also keep in mind data sparcity issues. Example:: - - Original Parent Annotation - - A A^ - / | \ / \ - B C D ==> B^
    A|^ where ? is the - / \ parent of A - C^ D^ - - - 3. Markov order-N smoothing - - Markov smoothing combats data sparcity issues as well as decreasing - computational requirements by limiting the number of children - included in artificial nodes. In practice, most people use an order - 2 grammar. Example:: - - Original No Smoothing Markov order 1 Markov order 2 etc. - - __A__ A A A - / /|\ \ / \ / \ / \ - B C D E F ==> B A| ==> B A| ==> B A| - / \ / \ / \ - C ... C ... C ... - - - - Annotation decisions can be thought about in the vertical direction - (parent, grandparent, etc) and the horizontal direction (number of - siblings to keep). Parameters to the following functions specify - these values. For more information see: - - Dan Klein and Chris Manning (2003) "Accurate Unlexicalized - Parsing", ACL-03. https://www.aclweb.org/anthology/P03-1054 - - 4. Unary Collapsing - - Collapse unary productions (ie. subtrees with a single child) into a - new non-terminal (Tree node). This is useful when working with - algorithms that do not allow unary productions, yet you do not wish - to lose the parent information. Example:: - - A - | - B ==> A+B - / \ / \ - C D C D - -""" - -from nltk.tree.tree import Tree - - -def chomsky_normal_form( - tree, factor="right", horzMarkov=None, vertMarkov=0, childChar="|", parentChar="^" -): - # assume all subtrees have homogeneous children - # assume all terminals have no siblings - - # A semi-hack to have elegant looking code below. As a result, - # any subtree with a branching factor greater than 999 will be incorrectly truncated. - if horzMarkov is None: - horzMarkov = 999 - - # Traverse the tree depth-first keeping a list of ancestor nodes to the root. - # I chose not to use the tree.treepositions() method since it requires - # two traversals of the tree (one to get the positions, one to iterate - # over them) and node access time is proportional to the height of the node. - # This method is 7x faster which helps when parsing 40,000 sentences. 
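A compact sketch of the transformations this tutorial describes, via the Tree methods that wrap these functions (upstream nltk assumed; the expected prints follow from the right-factoring rule above):

    from nltk.tree import Tree

    t = Tree.fromstring("(A (B b) (C c) (D d))")
    t.chomsky_normal_form(horzMarkov=2)  # right-factored binarization
    print(t)                             # (A (B b) (A|<C-D> (C c) (D d)))
    t.un_chomsky_normal_form()           # undo, back to the flat tree
    print(t)                             # (A (B b) (C c) (D d))

    u = Tree.fromstring("(S (NP (NNP John)) (VP (V runs)))")
    u.collapse_unary(collapsePOS=True)   # unary chains joined with '+'
    print(u)                             # (S (NP+NNP John) (VP+V runs))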
- - nodeList = [(tree, [tree.label()])] - while nodeList != []: - node, parent = nodeList.pop() - if isinstance(node, Tree): - - # parent annotation - parentString = "" - originalNode = node.label() - if vertMarkov != 0 and node != tree and isinstance(node[0], Tree): - parentString = "{}<{}>".format(parentChar, "-".join(parent)) - node.set_label(node.label() + parentString) - parent = [originalNode] + parent[: vertMarkov - 1] - - # add children to the agenda before we mess with them - for child in node: - nodeList.append((child, parent)) - - # chomsky normal form factorization - if len(node) > 2: - childNodes = [child.label() for child in node] - nodeCopy = node.copy() - node[0:] = [] # delete the children - - curNode = node - numChildren = len(nodeCopy) - for i in range(1, numChildren - 1): - if factor == "right": - newHead = "{}{}<{}>{}".format( - originalNode, - childChar, - "-".join( - childNodes[i : min([i + horzMarkov, numChildren])] - ), - parentString, - ) # create new head - newNode = Tree(newHead, []) - curNode[0:] = [nodeCopy.pop(0), newNode] - else: - newHead = "{}{}<{}>{}".format( - originalNode, - childChar, - "-".join( - childNodes[max([numChildren - i - horzMarkov, 0]) : -i] - ), - parentString, - ) - newNode = Tree(newHead, []) - curNode[0:] = [newNode, nodeCopy.pop()] - - curNode = newNode - - curNode[0:] = [child for child in nodeCopy] - - -def un_chomsky_normal_form( - tree, expandUnary=True, childChar="|", parentChar="^", unaryChar="+" -): - # Traverse the tree-depth first keeping a pointer to the parent for modification purposes. - nodeList = [(tree, [])] - while nodeList != []: - node, parent = nodeList.pop() - if isinstance(node, Tree): - # if the node contains the 'childChar' character it means that - # it is an artificial node and can be removed, although we still need - # to move its children to its parent - childIndex = node.label().find(childChar) - if childIndex != -1: - nodeIndex = parent.index(node) - parent.remove(parent[nodeIndex]) - # Generated node was on the left if the nodeIndex is 0 which - # means the grammar was left factored. We must insert the children - # at the beginning of the parent's children - if nodeIndex == 0: - parent.insert(0, node[0]) - parent.insert(1, node[1]) - else: - parent.extend([node[0], node[1]]) - - # parent is now the current node so the children of parent will be added to the agenda - node = parent - else: - parentIndex = node.label().find(parentChar) - if parentIndex != -1: - # strip the node name of the parent annotation - node.set_label(node.label()[:parentIndex]) - - # expand collapsed unary productions - if expandUnary == True: - unaryIndex = node.label().find(unaryChar) - if unaryIndex != -1: - newNode = Tree( - node.label()[unaryIndex + 1 :], [i for i in node] - ) - node.set_label(node.label()[:unaryIndex]) - node[0:] = [newNode] - - for child in node: - nodeList.append((child, node)) - - -def collapse_unary(tree, collapsePOS=False, collapseRoot=False, joinChar="+"): - """ - Collapse subtrees with a single child (ie. unary productions) - into a new non-terminal (Tree node) joined by 'joinChar'. - This is useful when working with algorithms that do not allow - unary productions, and completely removing the unary productions - would require loss of useful information. The Tree is modified - directly (since it is passed by reference) and no value is returned. - - :param tree: The Tree to be collapsed - :type tree: Tree - :param collapsePOS: 'False' (default) will not collapse the parent of leaf nodes (ie. 
- Part-of-Speech tags) since they are always unary productions - :type collapsePOS: bool - :param collapseRoot: 'False' (default) will not modify the root production - if it is unary. For the Penn WSJ treebank corpus, this corresponds - to the TOP -> productions. - :type collapseRoot: bool - :param joinChar: A string used to connect collapsed node values (default = "+") - :type joinChar: str - """ - - if collapseRoot == False and isinstance(tree, Tree) and len(tree) == 1: - nodeList = [tree[0]] - else: - nodeList = [tree] - - # depth-first traversal of tree - while nodeList != []: - node = nodeList.pop() - if isinstance(node, Tree): - if ( - len(node) == 1 - and isinstance(node[0], Tree) - and (collapsePOS == True or isinstance(node[0, 0], Tree)) - ): - node.set_label(node.label() + joinChar + node[0].label()) - node[0:] = [child for child in node[0]] - # since we assigned the child's children to the current node, - # evaluate the current node again - nodeList.append(node) - else: - for child in node: - nodeList.append(child) - - -################################################################# -# Demonstration -################################################################# - - -def demo(): - """ - A demonstration showing how each tree transform can be used. - """ - - from copy import deepcopy - - from nltk.draw.tree import draw_trees - from nltk.tree.tree import Tree - - # original tree from WSJ bracketed text - sentence = """(TOP - (S - (S - (VP - (VBN Turned) - (ADVP (RB loose)) - (PP - (IN in) - (NP - (NP (NNP Shane) (NNP Longman) (POS 's)) - (NN trading) - (NN room))))) - (, ,) - (NP (DT the) (NN yuppie) (NNS dealers)) - (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))) - (. .)))""" - t = Tree.fromstring(sentence, remove_empty_top_bracketing=True) - - # collapse subtrees with only one child - collapsedTree = deepcopy(t) - collapse_unary(collapsedTree) - - # convert the tree to CNF - cnfTree = deepcopy(collapsedTree) - chomsky_normal_form(cnfTree) - - # convert the tree to CNF with parent annotation (one level) and horizontal smoothing of order two - parentTree = deepcopy(collapsedTree) - chomsky_normal_form(parentTree, horzMarkov=2, vertMarkov=1) - - # convert the tree back to its original form (used to make CYK results comparable) - original = deepcopy(parentTree) - un_chomsky_normal_form(original) - - # convert tree back to bracketed text - sentence2 = original.pprint() - print(sentence) - print(sentence2) - print("Sentences the same? ", sentence == sentence2) - - draw_trees(t, collapsedTree, cnfTree, parentTree, original) - - -if __name__ == "__main__": - demo() - -__all__ = ["chomsky_normal_form", "un_chomsky_normal_form", "collapse_unary"] diff --git a/pipeline/nltk/tree/tree.py b/pipeline/nltk/tree/tree.py deleted file mode 100644 index d21be4091c5f6855a42a12dcddc53f21aafa16ea..0000000000000000000000000000000000000000 --- a/pipeline/nltk/tree/tree.py +++ /dev/null @@ -1,982 +0,0 @@ -# Natural Language Toolkit: Text Trees -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Edward Loper -# Steven Bird -# Peter Ljunglöf -# Nathan Bodenstab (tree transforms) -# Eric Kafe (Tree.fromlist()) -# Mohaned mashaly (Deprecating methods) -# URL: -# For license information, see LICENSE.TXT - -""" -Class for representing hierarchical language structures, such as -syntax trees and morphological trees. 
-""" - -import re - -from nltk.grammar import Nonterminal, Production -from nltk.internals import deprecated - -###################################################################### -## Trees -###################################################################### - - -class Tree(list): - r""" - A Tree represents a hierarchical grouping of leaves and subtrees. - For example, each constituent in a syntax tree is represented by a single Tree. - - A tree's children are encoded as a list of leaves and subtrees, - where a leaf is a basic (non-tree) value; and a subtree is a - nested Tree. - - >>> from nltk.tree import Tree - >>> print(Tree(1, [2, Tree(3, [4]), 5])) - (1 2 (3 4) 5) - >>> vp = Tree('VP', [Tree('V', ['saw']), - ... Tree('NP', ['him'])]) - >>> s = Tree('S', [Tree('NP', ['I']), vp]) - >>> print(s) - (S (NP I) (VP (V saw) (NP him))) - >>> print(s[1]) - (VP (V saw) (NP him)) - >>> print(s[1,1]) - (NP him) - >>> t = Tree.fromstring("(S (NP I) (VP (V saw) (NP him)))") - >>> s == t - True - >>> t[1][1].set_label('X') - >>> t[1][1].label() - 'X' - >>> print(t) - (S (NP I) (VP (V saw) (X him))) - >>> t[0], t[1,1] = t[1,1], t[0] - >>> print(t) - (S (X him) (VP (V saw) (NP I))) - - The length of a tree is the number of children it has. - - >>> len(t) - 2 - - The set_label() and label() methods allow individual constituents - to be labeled. For example, syntax trees use this label to specify - phrase tags, such as "NP" and "VP". - - Several Tree methods use "tree positions" to specify - children or descendants of a tree. Tree positions are defined as - follows: - - - The tree position *i* specifies a Tree's *i*\ th child. - - The tree position ``()`` specifies the Tree itself. - - If *p* is the tree position of descendant *d*, then - *p+i* specifies the *i*\ th child of *d*. - - I.e., every tree position is either a single index *i*, - specifying ``tree[i]``; or a sequence *i1, i2, ..., iN*, - specifying ``tree[i1][i2]...[iN]``. - - Construct a new tree. This constructor can be called in one - of two ways: - - - ``Tree(label, children)`` constructs a new tree with the - specified label and list of children. - - - ``Tree.fromstring(s)`` constructs a new tree by parsing the string ``s``. 
- """ - - def __init__(self, node, children=None): - if children is None: - raise TypeError( - "%s: Expected a node value and child list " % type(self).__name__ - ) - elif isinstance(children, str): - raise TypeError( - "%s() argument 2 should be a list, not a " - "string" % type(self).__name__ - ) - else: - list.__init__(self, children) - self._label = node - - # //////////////////////////////////////////////////////////// - # Comparison operators - # //////////////////////////////////////////////////////////// - - def __eq__(self, other): - return self.__class__ is other.__class__ and (self._label, list(self)) == ( - other._label, - list(other), - ) - - def __lt__(self, other): - if not isinstance(other, Tree): - # raise_unorderable_types("<", self, other) - # Sometimes children can be pure strings, - # so we need to be able to compare with non-trees: - return self.__class__.__name__ < other.__class__.__name__ - elif self.__class__ is other.__class__: - return (self._label, list(self)) < (other._label, list(other)) - else: - return self.__class__.__name__ < other.__class__.__name__ - - # @total_ordering doesn't work here, since the class inherits from a builtin class - __ne__ = lambda self, other: not self == other - __gt__ = lambda self, other: not (self < other or self == other) - __le__ = lambda self, other: self < other or self == other - __ge__ = lambda self, other: not self < other - - # //////////////////////////////////////////////////////////// - # Disabled list operations - # //////////////////////////////////////////////////////////// - - def __mul__(self, v): - raise TypeError("Tree does not support multiplication") - - def __rmul__(self, v): - raise TypeError("Tree does not support multiplication") - - def __add__(self, v): - raise TypeError("Tree does not support addition") - - def __radd__(self, v): - raise TypeError("Tree does not support addition") - - # //////////////////////////////////////////////////////////// - # Indexing (with support for tree positions) - # //////////////////////////////////////////////////////////// - - def __getitem__(self, index): - if isinstance(index, (int, slice)): - return list.__getitem__(self, index) - elif isinstance(index, (list, tuple)): - if len(index) == 0: - return self - elif len(index) == 1: - return self[index[0]] - else: - return self[index[0]][index[1:]] - else: - raise TypeError( - "%s indices must be integers, not %s" - % (type(self).__name__, type(index).__name__) - ) - - def __setitem__(self, index, value): - if isinstance(index, (int, slice)): - return list.__setitem__(self, index, value) - elif isinstance(index, (list, tuple)): - if len(index) == 0: - raise IndexError("The tree position () may not be " "assigned to.") - elif len(index) == 1: - self[index[0]] = value - else: - self[index[0]][index[1:]] = value - else: - raise TypeError( - "%s indices must be integers, not %s" - % (type(self).__name__, type(index).__name__) - ) - - def __delitem__(self, index): - if isinstance(index, (int, slice)): - return list.__delitem__(self, index) - elif isinstance(index, (list, tuple)): - if len(index) == 0: - raise IndexError("The tree position () may not be deleted.") - elif len(index) == 1: - del self[index[0]] - else: - del self[index[0]][index[1:]] - else: - raise TypeError( - "%s indices must be integers, not %s" - % (type(self).__name__, type(index).__name__) - ) - - # //////////////////////////////////////////////////////////// - # Basic tree operations - # //////////////////////////////////////////////////////////// - 
@deprecated("Use label() instead") - def _get_node(self): - """Outdated method to access the node value; use the label() method instead.""" - - @deprecated("Use set_label() instead") - def _set_node(self, value): - """Outdated method to set the node value; use the set_label() method instead.""" - - node = property(_get_node, _set_node) - - def label(self): - """ - Return the node label of the tree. - - >>> t = Tree.fromstring('(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))') - >>> t.label() - 'S' - - :return: the node label (typically a string) - :rtype: any - """ - return self._label - - def set_label(self, label): - """ - Set the node label of the tree. - - >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") - >>> t.set_label("T") - >>> print(t) - (T (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat)))) - - :param label: the node label (typically a string) - :type label: any - """ - self._label = label - - def leaves(self): - """ - Return the leaves of the tree. - - >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") - >>> t.leaves() - ['the', 'dog', 'chased', 'the', 'cat'] - - :return: a list containing this tree's leaves. - The order reflects the order of the - leaves in the tree's hierarchical structure. - :rtype: list - """ - leaves = [] - for child in self: - if isinstance(child, Tree): - leaves.extend(child.leaves()) - else: - leaves.append(child) - return leaves - - def flatten(self): - """ - Return a flat version of the tree, with all non-root non-terminals removed. - - >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") - >>> print(t.flatten()) - (S the dog chased the cat) - - :return: a tree consisting of this tree's root connected directly to - its leaves, omitting all intervening non-terminal nodes. - :rtype: Tree - """ - return Tree(self.label(), self.leaves()) - - def height(self): - """ - Return the height of the tree. - - >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") - >>> t.height() - 5 - >>> print(t[0,0]) - (D the) - >>> t[0,0].height() - 2 - - :return: The height of this tree. The height of a tree - containing no children is 1; the height of a tree - containing only leaves is 2; and the height of any other - tree is one plus the maximum of its children's - heights. - :rtype: int - """ - max_child_height = 0 - for child in self: - if isinstance(child, Tree): - max_child_height = max(max_child_height, child.height()) - else: - max_child_height = max(max_child_height, 1) - return 1 + max_child_height - - def treepositions(self, order="preorder"): - """ - >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") - >>> t.treepositions() # doctest: +ELLIPSIS - [(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1,), (1, 0), (1, 0, 0), ...] - >>> for pos in t.treepositions('leaves'): - ... t[pos] = t[pos][::-1].upper() - >>> print(t) - (S (NP (D EHT) (N GOD)) (VP (V DESAHC) (NP (D EHT) (N TAC)))) - - :param order: One of: ``preorder``, ``postorder``, ``bothorder``, - ``leaves``. 
- """ - positions = [] - if order in ("preorder", "bothorder"): - positions.append(()) - for i, child in enumerate(self): - if isinstance(child, Tree): - childpos = child.treepositions(order) - positions.extend((i,) + p for p in childpos) - else: - positions.append((i,)) - if order in ("postorder", "bothorder"): - positions.append(()) - return positions - - def subtrees(self, filter=None): - """ - Generate all the subtrees of this tree, optionally restricted - to trees matching the filter function. - - >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") - >>> for s in t.subtrees(lambda t: t.height() == 2): - ... print(s) - (D the) - (N dog) - (V chased) - (D the) - (N cat) - - :type filter: function - :param filter: the function to filter all local trees - """ - if not filter or filter(self): - yield self - for child in self: - if isinstance(child, Tree): - yield from child.subtrees(filter) - - def productions(self): - """ - Generate the productions that correspond to the non-terminal nodes of the tree. - For each subtree of the form (P: C1 C2 ... Cn) this produces a production of the - form P -> C1 C2 ... Cn. - - >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") - >>> t.productions() # doctest: +NORMALIZE_WHITESPACE - [S -> NP VP, NP -> D N, D -> 'the', N -> 'dog', VP -> V NP, V -> 'chased', - NP -> D N, D -> 'the', N -> 'cat'] - - :rtype: list(Production) - """ - - if not isinstance(self._label, str): - raise TypeError( - "Productions can only be generated from trees having node labels that are strings" - ) - - prods = [Production(Nonterminal(self._label), _child_names(self))] - for child in self: - if isinstance(child, Tree): - prods += child.productions() - return prods - - def pos(self): - """ - Return a sequence of pos-tagged words extracted from the tree. - - >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") - >>> t.pos() - [('the', 'D'), ('dog', 'N'), ('chased', 'V'), ('the', 'D'), ('cat', 'N')] - - :return: a list of tuples containing leaves and pre-terminals (part-of-speech tags). - The order reflects the order of the leaves in the tree's hierarchical structure. - :rtype: list(tuple) - """ - pos = [] - for child in self: - if isinstance(child, Tree): - pos.extend(child.pos()) - else: - pos.append((child, self._label)) - return pos - - def leaf_treeposition(self, index): - """ - :return: The tree position of the ``index``-th leaf in this - tree. I.e., if ``tp=self.leaf_treeposition(i)``, then - ``self[tp]==self.leaves()[i]``. - - :raise IndexError: If this tree contains fewer than ``index+1`` - leaves, or if ``index<0``. - """ - if index < 0: - raise IndexError("index must be non-negative") - - stack = [(self, ())] - while stack: - value, treepos = stack.pop() - if not isinstance(value, Tree): - if index == 0: - return treepos - else: - index -= 1 - else: - for i in range(len(value) - 1, -1, -1): - stack.append((value[i], treepos + (i,))) - - raise IndexError("index must be less than or equal to len(self)") - - def treeposition_spanning_leaves(self, start, end): - """ - :return: The tree position of the lowest descendant of this - tree that dominates ``self.leaves()[start:end]``. - :raise ValueError: if ``end <= start`` - """ - if end <= start: - raise ValueError("end must be greater than start") - # Find the tree positions of the start & end leaves, and - # take the longest common subsequence. 
- start_treepos = self.leaf_treeposition(start) - end_treepos = self.leaf_treeposition(end - 1) - # Find the first index where they mismatch: - for i in range(len(start_treepos)): - if i == len(end_treepos) or start_treepos[i] != end_treepos[i]: - return start_treepos[:i] - return start_treepos - - # //////////////////////////////////////////////////////////// - # Transforms - # //////////////////////////////////////////////////////////// - - def chomsky_normal_form( - self, - factor="right", - horzMarkov=None, - vertMarkov=0, - childChar="|", - parentChar="^", - ): - """ - This method can modify a tree in three ways: - - 1. Convert a tree into its Chomsky Normal Form (CNF) - equivalent -- Every subtree has either two non-terminals - or one terminal as its children. This process requires - the creation of more"artificial" non-terminal nodes. - 2. Markov (vertical) smoothing of children in new artificial - nodes - 3. Horizontal (parent) annotation of nodes - - :param factor: Right or left factoring method (default = "right") - :type factor: str = [left|right] - :param horzMarkov: Markov order for sibling smoothing in artificial nodes (None (default) = include all siblings) - :type horzMarkov: int | None - :param vertMarkov: Markov order for parent smoothing (0 (default) = no vertical annotation) - :type vertMarkov: int | None - :param childChar: A string used in construction of the artificial nodes, separating the head of the - original subtree from the child nodes that have yet to be expanded (default = "|") - :type childChar: str - :param parentChar: A string used to separate the node representation from its vertical annotation - :type parentChar: str - """ - from nltk.tree.transforms import chomsky_normal_form - - chomsky_normal_form(self, factor, horzMarkov, vertMarkov, childChar, parentChar) - - def un_chomsky_normal_form( - self, expandUnary=True, childChar="|", parentChar="^", unaryChar="+" - ): - """ - This method modifies the tree in three ways: - - 1. Transforms a tree in Chomsky Normal Form back to its - original structure (branching greater than two) - 2. Removes any parent annotation (if it exists) - 3. (optional) expands unary subtrees (if previously - collapsed with collapseUnary(...) ) - - :param expandUnary: Flag to expand unary or not (default = True) - :type expandUnary: bool - :param childChar: A string separating the head node from its children in an artificial node (default = "|") - :type childChar: str - :param parentChar: A string separating the node label from its parent annotation (default = "^") - :type parentChar: str - :param unaryChar: A string joining two non-terminals in a unary production (default = "+") - :type unaryChar: str - """ - from nltk.tree.transforms import un_chomsky_normal_form - - un_chomsky_normal_form(self, expandUnary, childChar, parentChar, unaryChar) - - def collapse_unary(self, collapsePOS=False, collapseRoot=False, joinChar="+"): - """ - Collapse subtrees with a single child (ie. unary productions) - into a new non-terminal (Tree node) joined by 'joinChar'. - This is useful when working with algorithms that do not allow - unary productions, and completely removing the unary productions - would require loss of useful information. The Tree is modified - directly (since it is passed by reference) and no value is returned. - - :param collapsePOS: 'False' (default) will not collapse the parent of leaf nodes (ie. 
- Part-of-Speech tags) since they are always unary productions - :type collapsePOS: bool - :param collapseRoot: 'False' (default) will not modify the root production - if it is unary. For the Penn WSJ treebank corpus, this corresponds - to the TOP -> productions. - :type collapseRoot: bool - :param joinChar: A string used to connect collapsed node values (default = "+") - :type joinChar: str - """ - from nltk.tree.transforms import collapse_unary - - collapse_unary(self, collapsePOS, collapseRoot, joinChar) - - # //////////////////////////////////////////////////////////// - # Convert, copy - # //////////////////////////////////////////////////////////// - - @classmethod - def convert(cls, tree): - """ - Convert a tree between different subtypes of Tree. ``cls`` determines - which class will be used to encode the new tree. - - :type tree: Tree - :param tree: The tree that should be converted. - :return: The new Tree. - """ - if isinstance(tree, Tree): - children = [cls.convert(child) for child in tree] - return cls(tree._label, children) - else: - return tree - - def __copy__(self): - return self.copy() - - def __deepcopy__(self, memo): - return self.copy(deep=True) - - def copy(self, deep=False): - if not deep: - return type(self)(self._label, self) - else: - return type(self).convert(self) - - def _frozen_class(self): - from nltk.tree.immutable import ImmutableTree - - return ImmutableTree - - def freeze(self, leaf_freezer=None): - frozen_class = self._frozen_class() - if leaf_freezer is None: - newcopy = frozen_class.convert(self) - else: - newcopy = self.copy(deep=True) - for pos in newcopy.treepositions("leaves"): - newcopy[pos] = leaf_freezer(newcopy[pos]) - newcopy = frozen_class.convert(newcopy) - hash(newcopy) # Make sure the leaves are hashable. - return newcopy - - # //////////////////////////////////////////////////////////// - # Parsing - # //////////////////////////////////////////////////////////// - - @classmethod - def fromstring( - cls, - s, - brackets="()", - read_node=None, - read_leaf=None, - node_pattern=None, - leaf_pattern=None, - remove_empty_top_bracketing=False, - ): - """ - Read a bracketed tree string and return the resulting tree. - Trees are represented as nested brackettings, such as:: - - (S (NP (NNP John)) (VP (V runs))) - - :type s: str - :param s: The string to read - - :type brackets: str (length=2) - :param brackets: The bracket characters used to mark the - beginning and end of trees and subtrees. - - :type read_node: function - :type read_leaf: function - :param read_node, read_leaf: If specified, these functions - are applied to the substrings of ``s`` corresponding to - nodes and leaves (respectively) to obtain the values for - those nodes and leaves. They should have the following - signature: - - read_node(str) -> value - - For example, these functions could be used to process nodes - and leaves whose values should be some type other than - string (such as ``FeatStruct``). - Note that by default, node strings and leaf strings are - delimited by whitespace and brackets; to override this - default, use the ``node_pattern`` and ``leaf_pattern`` - arguments. - - :type node_pattern: str - :type leaf_pattern: str - :param node_pattern, leaf_pattern: Regular expression patterns - used to find node and leaf substrings in ``s``. By - default, both nodes patterns are defined to match any - sequence of non-whitespace non-bracket characters. 
- - :type remove_empty_top_bracketing: bool - :param remove_empty_top_bracketing: If the resulting tree has - an empty node label, and is length one, then return its - single child instead. This is useful for treebank trees, - which sometimes contain an extra level of bracketing. - - :return: A tree corresponding to the string representation ``s``. - If this class method is called using a subclass of Tree, - then it will return a tree of that type. - :rtype: Tree - """ - if not isinstance(brackets, str) or len(brackets) != 2: - raise TypeError("brackets must be a length-2 string") - if re.search(r"\s", brackets): - raise TypeError("whitespace brackets not allowed") - # Construct a regexp that will tokenize the string. - open_b, close_b = brackets - open_pattern, close_pattern = (re.escape(open_b), re.escape(close_b)) - if node_pattern is None: - node_pattern = rf"[^\s{open_pattern}{close_pattern}]+" - if leaf_pattern is None: - leaf_pattern = rf"[^\s{open_pattern}{close_pattern}]+" - token_re = re.compile( - r"%s\s*(%s)?|%s|(%s)" - % (open_pattern, node_pattern, close_pattern, leaf_pattern) - ) - # Walk through each token, updating a stack of trees. - stack = [(None, [])] # list of (node, children) tuples - for match in token_re.finditer(s): - token = match.group() - # Beginning of a tree/subtree - if token[0] == open_b: - if len(stack) == 1 and len(stack[0][1]) > 0: - cls._parse_error(s, match, "end-of-string") - label = token[1:].lstrip() - if read_node is not None: - label = read_node(label) - stack.append((label, [])) - # End of a tree/subtree - elif token == close_b: - if len(stack) == 1: - if len(stack[0][1]) == 0: - cls._parse_error(s, match, open_b) - else: - cls._parse_error(s, match, "end-of-string") - label, children = stack.pop() - stack[-1][1].append(cls(label, children)) - # Leaf node - else: - if len(stack) == 1: - cls._parse_error(s, match, open_b) - if read_leaf is not None: - token = read_leaf(token) - stack[-1][1].append(token) - - # check that we got exactly one complete tree. - if len(stack) > 1: - cls._parse_error(s, "end-of-string", close_b) - elif len(stack[0][1]) == 0: - cls._parse_error(s, "end-of-string", open_b) - else: - assert stack[0][0] is None - assert len(stack[0][1]) == 1 - tree = stack[0][1][0] - - # If the tree has an extra level with node='', then get rid of - # it. E.g.: "((S (NP ...) (VP ...)))" - if remove_empty_top_bracketing and tree._label == "" and len(tree) == 1: - tree = tree[0] - # return the tree. - return tree - - @classmethod - def _parse_error(cls, s, match, expecting): - """ - Display a friendly error message when parsing a tree string fails. - :param s: The string we're parsing. - :param match: regexp match of the problem token. - :param expecting: what we expected to see instead. - """ - # Construct a basic error message - if match == "end-of-string": - pos, token = len(s), "end-of-string" - else: - pos, token = match.start(), match.group() - msg = "%s.read(): expected %r but got %r\n%sat index %d." % ( - cls.__name__, - expecting, - token, - " " * 12, - pos, - ) - # Add a display showing the error token itsels: - s = s.replace("\n", " ").replace("\t", " ") - offset = pos - if len(s) > pos + 10: - s = s[: pos + 10] + "..." - if pos > 10: - s = "..." 
+ s[pos - 10 :] - offset = 13 - msg += '\n{}"{}"\n{}^'.format(" " * 16, s, " " * (17 + offset)) - raise ValueError(msg) - - @classmethod - def fromlist(cls, l): - """ - :type l: list - :param l: a tree represented as nested lists - - :return: A tree corresponding to the list representation ``l``. - :rtype: Tree - - Convert nested lists to a NLTK Tree - """ - if type(l) == list and len(l) > 0: - label = repr(l[0]) - if len(l) > 1: - return Tree(label, [cls.fromlist(child) for child in l[1:]]) - else: - return label - - # //////////////////////////////////////////////////////////// - # Visualization & String Representation - # //////////////////////////////////////////////////////////// - - def draw(self): - """ - Open a new window containing a graphical diagram of this tree. - """ - from nltk.draw.tree import draw_trees - - draw_trees(self) - - def pretty_print(self, sentence=None, highlight=(), stream=None, **kwargs): - """ - Pretty-print this tree as ASCII or Unicode art. - For explanation of the arguments, see the documentation for - `nltk.tree.prettyprinter.TreePrettyPrinter`. - """ - from nltk.tree.prettyprinter import TreePrettyPrinter - - print(TreePrettyPrinter(self, sentence, highlight).text(**kwargs), file=stream) - - def __repr__(self): - childstr = ", ".join(repr(c) for c in self) - return "{}({}, [{}])".format( - type(self).__name__, - repr(self._label), - childstr, - ) - - def _repr_svg_(self): - from svgling import draw_tree - - return draw_tree(self)._repr_svg_() - - def __str__(self): - return self.pformat() - - def pprint(self, **kwargs): - """ - Print a string representation of this Tree to 'stream' - """ - - if "stream" in kwargs: - stream = kwargs["stream"] - del kwargs["stream"] - else: - stream = None - print(self.pformat(**kwargs), file=stream) - - def pformat(self, margin=70, indent=0, nodesep="", parens="()", quotes=False): - """ - :return: A pretty-printed string representation of this tree. - :rtype: str - :param margin: The right margin at which to do line-wrapping. - :type margin: int - :param indent: The indentation level at which printing - begins. This number is used to decide how far to indent - subsequent lines. - :type indent: int - :param nodesep: A string that is used to separate the node - from the children. E.g., the default value ``':'`` gives - trees like ``(S: (NP: I) (VP: (V: saw) (NP: it)))``. - """ - - # Try writing it on one line. - s = self._pformat_flat(nodesep, parens, quotes) - if len(s) + indent < margin: - return s - - # If it doesn't fit on one line, then write it on multi-lines. - if isinstance(self._label, str): - s = f"{parens[0]}{self._label}{nodesep}" - else: - s = f"{parens[0]}{repr(self._label)}{nodesep}" - for child in self: - if isinstance(child, Tree): - s += ( - "\n" - + " " * (indent + 2) - + child.pformat(margin, indent + 2, nodesep, parens, quotes) - ) - elif isinstance(child, tuple): - s += "\n" + " " * (indent + 2) + "/".join(child) - elif isinstance(child, str) and not quotes: - s += "\n" + " " * (indent + 2) + "%s" % child - else: - s += "\n" + " " * (indent + 2) + repr(child) - return s + parens[1] - - def pformat_latex_qtree(self): - r""" - Returns a representation of the tree compatible with the - LaTeX qtree package. This consists of the string ``\Tree`` - followed by the tree represented in bracketed notation. 
- - For example, the following result was generated from a parse tree of - the sentence ``The announcement astounded us``:: - - \Tree [.I'' [.N'' [.D The ] [.N' [.N announcement ] ] ] - [.I' [.V'' [.V' [.V astounded ] [.N'' [.N' [.N us ] ] ] ] ] ] ] - - See https://www.ling.upenn.edu/advice/latex.html for the LaTeX - style file for the qtree package. - - :return: A latex qtree representation of this tree. - :rtype: str - """ - reserved_chars = re.compile(r"([#\$%&~_\{\}])") - - pformat = self.pformat(indent=6, nodesep="", parens=("[.", " ]")) - return r"\Tree " + re.sub(reserved_chars, r"\\\1", pformat) - - def _pformat_flat(self, nodesep, parens, quotes): - childstrs = [] - for child in self: - if isinstance(child, Tree): - childstrs.append(child._pformat_flat(nodesep, parens, quotes)) - elif isinstance(child, tuple): - childstrs.append("/".join(child)) - elif isinstance(child, str) and not quotes: - childstrs.append("%s" % child) - else: - childstrs.append(repr(child)) - if isinstance(self._label, str): - return "{}{}{} {}{}".format( - parens[0], - self._label, - nodesep, - " ".join(childstrs), - parens[1], - ) - else: - return "{}{}{} {}{}".format( - parens[0], - repr(self._label), - nodesep, - " ".join(childstrs), - parens[1], - ) - - -def _child_names(tree): - names = [] - for child in tree: - if isinstance(child, Tree): - names.append(Nonterminal(child._label)) - else: - names.append(child) - return names - - -###################################################################### -## Demonstration -###################################################################### - - -def demo(): - """ - A demonstration showing how Trees and Trees can be - used. This demonstration creates a Tree, and loads a - Tree from the Treebank corpus, - and shows the results of calling several of their methods. - """ - - from nltk import ProbabilisticTree, Tree - - # Demonstrate tree parsing. - s = "(S (NP (DT the) (NN cat)) (VP (VBD ate) (NP (DT a) (NN cookie))))" - t = Tree.fromstring(s) - print("Convert bracketed string into tree:") - print(t) - print(t.__repr__()) - - print("Display tree properties:") - print(t.label()) # tree's constituent type - print(t[0]) # tree's first child - print(t[1]) # tree's second child - print(t.height()) - print(t.leaves()) - print(t[1]) - print(t[1, 1]) - print(t[1, 1, 0]) - - # Demonstrate tree modification. - the_cat = t[0] - the_cat.insert(1, Tree.fromstring("(JJ big)")) - print("Tree modification:") - print(t) - t[1, 1, 1] = Tree.fromstring("(NN cake)") - print(t) - print() - - # Tree transforms - print("Collapse unary:") - t.collapse_unary() - print(t) - print("Chomsky normal form:") - t.chomsky_normal_form() - print(t) - print() - - # Demonstrate probabilistic trees. - pt = ProbabilisticTree("x", ["y", "z"], prob=0.5) - print("Probabilistic Tree:") - print(pt) - print() - - # Demonstrate parsing of treebank output format. 
- t = Tree.fromstring(t.pformat()) - print("Convert tree to bracketed string and back again:") - print(t) - print() - - # Demonstrate LaTeX output - print("LaTeX output:") - print(t.pformat_latex_qtree()) - print() - - # Demonstrate Productions - print("Production output:") - print(t.productions()) - print() - - # Demonstrate tree nodes containing objects other than strings - t.set_label(("test", 3)) - print(t) - - -__all__ = [ - "Tree", -] diff --git a/pipeline/nltk/treeprettyprinter.py b/pipeline/nltk/treeprettyprinter.py deleted file mode 100644 index ed4e766b47c123d66b7df326cbfde26a2db99363..0000000000000000000000000000000000000000 --- a/pipeline/nltk/treeprettyprinter.py +++ /dev/null @@ -1,28 +0,0 @@ -# Natural Language Toolkit: ASCII visualization of NLTK trees -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Andreas van Cranenburgh -# Peter Ljunglöf -# URL: -# For license information, see LICENSE.TXT - -""" -Pretty-printing of discontinuous trees. -Adapted from the disco-dop project, by Andreas van Cranenburgh. -https://github.com/andreasvc/disco-dop - -Interesting reference (not used for this code): -T. Eschbach et al., Orth. Hypergraph Drawing, Journal of -Graph Algorithms and Applications, 10(2) 141--157 (2006)149. -https://jgaa.info/accepted/2006/EschbachGuentherBecker2006.10.2.pdf -""" - -from nltk.internals import Deprecated -from nltk.tree.prettyprinter import TreePrettyPrinter as TPP - - -class TreePrettyPrinter(Deprecated, TPP): - """Import `TreePrettyPrinter` using `from nltk.tree import TreePrettyPrinter` instead.""" - - -__all__ = ["TreePrettyPrinter"] diff --git a/pipeline/nltk/treetransforms.py b/pipeline/nltk/treetransforms.py deleted file mode 100644 index 6ebc061f321c701c7851370cd00cacb4499a256c..0000000000000000000000000000000000000000 --- a/pipeline/nltk/treetransforms.py +++ /dev/null @@ -1,126 +0,0 @@ -# Natural Language Toolkit: Tree Transformations -# -# Copyright (C) 2005-2007 Oregon Graduate Institute -# Author: Nathan Bodenstab -# URL: -# For license information, see LICENSE.TXT - -r""" -A collection of methods for tree (grammar) transformations used -in parsing natural language. - -Although many of these methods are technically grammar transformations -(ie. Chomsky Norm Form), when working with treebanks it is much more -natural to visualize these modifications in a tree structure. Hence, -we will do all transformation directly to the tree itself. -Transforming the tree directly also allows us to do parent annotation. -A grammar can then be simply induced from the modified tree. - -The following is a short tutorial on the available transformations. - - 1. Chomsky Normal Form (binarization) - - It is well known that any grammar has a Chomsky Normal Form (CNF) - equivalent grammar where CNF is defined by every production having - either two non-terminals or one terminal on its right hand side. - When we have hierarchically structured data (ie. a treebank), it is - natural to view this in terms of productions where the root of every - subtree is the head (left hand side) of the production and all of - its children are the right hand side constituents. In order to - convert a tree into CNF, we simply need to ensure that every subtree - has either two subtrees as children (binarization), or one leaf node - (non-terminal). In order to binarize a subtree with more than two - children, we must introduce artificial nodes. - - There are two popular methods to convert a tree into CNF: left - factoring and right factoring. 
The following example demonstrates - the difference between them. Example:: - - Original Right-Factored Left-Factored - - A A A - / | \ / \ / \ - B C D ==> B A| OR A| D - / \ / \ - C D B C - - 2. Parent Annotation - - In addition to binarizing the tree, there are two standard - modifications to node labels we can do in the same traversal: parent - annotation and Markov order-N smoothing (or sibling smoothing). - - The purpose of parent annotation is to refine the probabilities of - productions by adding a small amount of context. With this simple - addition, a CYK (inside-outside, dynamic programming chart parse) - can improve from 74% to 79% accuracy. A natural generalization from - parent annotation is to grandparent annotation and beyond. The - tradeoff becomes accuracy gain vs. computational complexity. We - must also keep in mind data sparcity issues. Example:: - - Original Parent Annotation - - A A^ - / | \ / \ - B C D ==> B^ A|^ where ? is the - / \ parent of A - C^ D^ - - - 3. Markov order-N smoothing - - Markov smoothing combats data sparcity issues as well as decreasing - computational requirements by limiting the number of children - included in artificial nodes. In practice, most people use an order - 2 grammar. Example:: - - Original No Smoothing Markov order 1 Markov order 2 etc. - - __A__ A A A - / /|\ \ / \ / \ / \ - B C D E F ==> B A| ==> B A| ==> B A| - / \ / \ / \ - C ... C ... C ... - - - - Annotation decisions can be thought about in the vertical direction - (parent, grandparent, etc) and the horizontal direction (number of - siblings to keep). Parameters to the following functions specify - these values. For more information see: - - Dan Klein and Chris Manning (2003) "Accurate Unlexicalized - Parsing", ACL-03. https://www.aclweb.org/anthology/P03-1054 - - 4. Unary Collapsing - - Collapse unary productions (ie. subtrees with a single child) into a - new non-terminal (Tree node). This is useful when working with - algorithms that do not allow unary productions, yet you do not wish - to lose the parent information. Example:: - - A - | - B ==> A+B - / \ / \ - C D C D - -""" - -from nltk.internals import deprecated -from nltk.tree.transforms import chomsky_normal_form as cnf -from nltk.tree.transforms import collapse_unary as cu -from nltk.tree.transforms import un_chomsky_normal_form as ucnf - -chomsky_normal_form = deprecated( - "Import using `from nltk.tree import chomsky_normal_form` instead." -)(cnf) -un_chomsky_normal_form = deprecated( - "Import using `from nltk.tree import un_chomsky_normal_form` instead." -)(ucnf) -collapse_unary = deprecated( - "Import using `from nltk.tree import collapse_unary` instead." -)(cu) - - -__all__ = ["chomsky_normal_form", "un_chomsky_normal_form", "collapse_unary"] diff --git a/pipeline/nltk/twitter/__init__.py b/pipeline/nltk/twitter/__init__.py deleted file mode 100644 index cd14ffb4703bf38bb349cc19cca2d97b6df29f77..0000000000000000000000000000000000000000 --- a/pipeline/nltk/twitter/__init__.py +++ /dev/null @@ -1,35 +0,0 @@ -# Natural Language Toolkit: Twitter -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Ewan Klein -# URL: -# For license information, see LICENSE.TXT - -""" -NLTK Twitter Package - -This package contains classes for retrieving Tweet documents using the -Twitter API. - -""" -try: - import twython -except ImportError: - import warnings - - warnings.warn( - "The twython library has not been installed. " - "Some functionality from the twitter package will not be available." 
- ) -else: - from nltk.twitter.util import Authenticate, credsfromfile - from nltk.twitter.twitterclient import ( - Streamer, - Query, - Twitter, - TweetViewer, - TweetWriter, - ) - - -from nltk.twitter.common import json2csv diff --git a/pipeline/nltk/twitter/__pycache__/__init__.cpython-39.pyc b/pipeline/nltk/twitter/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index 53f11cbbc18298a5b75b2bc06eadddb8d677bd99..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/twitter/__pycache__/__init__.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/twitter/__pycache__/api.cpython-39.pyc b/pipeline/nltk/twitter/__pycache__/api.cpython-39.pyc deleted file mode 100644 index 2b1b05bebb5a6f1c2f1b79bcf269927eb7920ddc..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/twitter/__pycache__/api.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/twitter/__pycache__/common.cpython-39.pyc b/pipeline/nltk/twitter/__pycache__/common.cpython-39.pyc deleted file mode 100644 index 981f52aa13ea6b0c3c0d2898c23050521a11297a..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/twitter/__pycache__/common.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/twitter/__pycache__/twitter_demo.cpython-39.pyc b/pipeline/nltk/twitter/__pycache__/twitter_demo.cpython-39.pyc deleted file mode 100644 index 1639159514f5fb3903a35c2809ae661717e1eeb7..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/twitter/__pycache__/twitter_demo.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/twitter/__pycache__/twitterclient.cpython-39.pyc b/pipeline/nltk/twitter/__pycache__/twitterclient.cpython-39.pyc deleted file mode 100644 index 94d4f2674ee57b61270c88f8f166c9fcb7bd1c7c..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/twitter/__pycache__/twitterclient.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/twitter/__pycache__/util.cpython-39.pyc b/pipeline/nltk/twitter/__pycache__/util.cpython-39.pyc deleted file mode 100644 index 0c139da30c9e821be9a379b5490c251cac5fc0b4..0000000000000000000000000000000000000000 Binary files a/pipeline/nltk/twitter/__pycache__/util.cpython-39.pyc and /dev/null differ diff --git a/pipeline/nltk/twitter/api.py b/pipeline/nltk/twitter/api.py deleted file mode 100644 index 71248b176340abd0d0d7d51e8ed68700f7948e13..0000000000000000000000000000000000000000 --- a/pipeline/nltk/twitter/api.py +++ /dev/null @@ -1,145 +0,0 @@ -# Natural Language Toolkit: Twitter API -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Ewan Klein -# Lorenzo Rubio -# URL: -# For license information, see LICENSE.TXT - -""" -This module provides an interface for TweetHandlers, and support for timezone -handling. -""" - -import time as _time -from abc import ABCMeta, abstractmethod -from datetime import datetime, timedelta, timezone, tzinfo - - -class LocalTimezoneOffsetWithUTC(tzinfo): - """ - This is not intended to be a general purpose class for dealing with the - local timezone. In particular: - - * it assumes that the date passed has been created using - `datetime(..., tzinfo=Local)`, where `Local` is an instance of - the object `LocalTimezoneOffsetWithUTC`; - * for such an object, it returns the offset with UTC, used for date comparisons. 
- - Reference: https://docs.python.org/3/library/datetime.html - """ - - STDOFFSET = timedelta(seconds=-_time.timezone) - - if _time.daylight: - DSTOFFSET = timedelta(seconds=-_time.altzone) - else: - DSTOFFSET = STDOFFSET - - def utcoffset(self, dt): - """ - Access the relevant time offset. - """ - return self.DSTOFFSET - - -LOCAL = LocalTimezoneOffsetWithUTC() - - -class BasicTweetHandler(metaclass=ABCMeta): - """ - Minimal implementation of `TweetHandler`. - - Counts the number of Tweets and decides when the client should stop - fetching them. - """ - - def __init__(self, limit=20): - self.limit = limit - self.counter = 0 - - """ - A flag to indicate to the client whether to stop fetching data given - some condition (e.g., reaching a date limit). - """ - self.do_stop = False - - """ - Stores the id of the last fetched Tweet to handle pagination. - """ - self.max_id = None - - def do_continue(self): - """ - Returns `False` if the client should stop fetching Tweets. - """ - return self.counter < self.limit and not self.do_stop - - -class TweetHandlerI(BasicTweetHandler): - """ - Interface class whose subclasses should implement a handle method that - Twitter clients can delegate to. - """ - - def __init__(self, limit=20, upper_date_limit=None, lower_date_limit=None): - """ - :param int limit: The number of data items to process in the current\ - round of processing. - - :param tuple upper_date_limit: The date at which to stop collecting\ - new data. This should be entered as a tuple which can serve as the\ - argument to `datetime.datetime`.\ - E.g. `date_limit=(2015, 4, 1, 12, 40)` for 12:30 pm on April 1 2015. - - :param tuple lower_date_limit: The date at which to stop collecting\ - new data. See `upper_data_limit` for formatting. - """ - BasicTweetHandler.__init__(self, limit) - - self.upper_date_limit = None - self.lower_date_limit = None - if upper_date_limit: - self.upper_date_limit = datetime(*upper_date_limit, tzinfo=LOCAL) - if lower_date_limit: - self.lower_date_limit = datetime(*lower_date_limit, tzinfo=LOCAL) - - self.startingup = True - - @abstractmethod - def handle(self, data): - """ - Deal appropriately with data returned by the Twitter API - """ - - @abstractmethod - def on_finish(self): - """ - Actions when the tweet limit has been reached - """ - - def check_date_limit(self, data, verbose=False): - """ - Validate date limits. 
- """ - if self.upper_date_limit or self.lower_date_limit: - date_fmt = "%a %b %d %H:%M:%S +0000 %Y" - tweet_date = datetime.strptime(data["created_at"], date_fmt).replace( - tzinfo=timezone.utc - ) - if (self.upper_date_limit and tweet_date > self.upper_date_limit) or ( - self.lower_date_limit and tweet_date < self.lower_date_limit - ): - if self.upper_date_limit: - message = "earlier" - date_limit = self.upper_date_limit - else: - message = "later" - date_limit = self.lower_date_limit - if verbose: - print( - "Date limit {} is {} than date of current tweet {}".format( - date_limit, message, tweet_date - ) - ) - self.do_stop = True diff --git a/pipeline/nltk/twitter/common.py b/pipeline/nltk/twitter/common.py deleted file mode 100644 index d9428724cfa8cae69e14d899cb73eee5607475d0..0000000000000000000000000000000000000000 --- a/pipeline/nltk/twitter/common.py +++ /dev/null @@ -1,270 +0,0 @@ -# Natural Language Toolkit: Twitter client -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Ewan Klein -# Lorenzo Rubio -# URL: -# For license information, see LICENSE.TXT - -""" -Utility functions for the `twitterclient` module which do not require -the `twython` library to have been installed. -""" -import csv -import gzip -import json - -from nltk.internals import deprecated - -HIER_SEPARATOR = "." - - -def extract_fields(tweet, fields): - """ - Extract field values from a full tweet and return them as a list - - :param json tweet: The tweet in JSON format - :param list fields: The fields to be extracted from the tweet - :rtype: list(str) - """ - out = [] - for field in fields: - try: - _add_field_to_out(tweet, field, out) - except TypeError as e: - raise RuntimeError( - "Fatal error when extracting fields. Cannot find field ", field - ) from e - return out - - -def _add_field_to_out(json, field, out): - if _is_composed_key(field): - key, value = _get_key_value_composed(field) - _add_field_to_out(json[key], value, out) - else: - out += [json[field]] - - -def _is_composed_key(field): - return HIER_SEPARATOR in field - - -def _get_key_value_composed(field): - out = field.split(HIER_SEPARATOR) - # there could be up to 3 levels - key = out[0] - value = HIER_SEPARATOR.join(out[1:]) - return key, value - - -def _get_entity_recursive(json, entity): - if not json: - return None - elif isinstance(json, dict): - for key, value in json.items(): - if key == entity: - return value - # 'entities' and 'extended_entities' are wrappers in Twitter json - # structure that contain other Twitter objects. See: - # https://dev.twitter.com/overview/api/entities-in-twitter-objects - - if key == "entities" or key == "extended_entities": - candidate = _get_entity_recursive(value, entity) - if candidate is not None: - return candidate - return None - elif isinstance(json, list): - for item in json: - candidate = _get_entity_recursive(item, entity) - if candidate is not None: - return candidate - return None - else: - return None - - -def json2csv( - fp, outfile, fields, encoding="utf8", errors="replace", gzip_compress=False -): - """ - Extract selected fields from a file of line-separated JSON tweets and - write to a file in CSV format. - - This utility function allows a file of full tweets to be easily converted - to a CSV file for easier processing. For example, just TweetIDs or - just the text content of the Tweets can be extracted. - - Additionally, the function allows combinations of fields of other Twitter - objects (mainly the users, see below). - - For Twitter entities (e.g. 
hashtags of a Tweet), and for geolocation, see - `json2csv_entities` - - :param str infile: The name of the file containing full tweets - - :param str outfile: The name of the text file where results should be\ - written - - :param list fields: The list of fields to be extracted. Useful examples\ - are 'id_str' for the tweetID and 'text' for the text of the tweet. See\ - for a full list of fields.\ - e. g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count']\ - Additionally, it allows IDs from other Twitter objects, e. g.,\ - ['id', 'text', 'user.id', 'user.followers_count', 'user.friends_count'] - - :param error: Behaviour for encoding errors, see\ - https://docs.python.org/3/library/codecs.html#codec-base-classes - - :param gzip_compress: if `True`, output files are compressed with gzip - """ - (writer, outf) = _outf_writer(outfile, encoding, errors, gzip_compress) - # write the list of fields as header - writer.writerow(fields) - # process the file - for line in fp: - tweet = json.loads(line) - row = extract_fields(tweet, fields) - writer.writerow(row) - outf.close() - - -@deprecated("Use open() and csv.writer() directly instead.") -def outf_writer_compat(outfile, encoding, errors, gzip_compress=False): - """Get a CSV writer with optional compression.""" - return _outf_writer(outfile, encoding, errors, gzip_compress) - - -def _outf_writer(outfile, encoding, errors, gzip_compress=False): - if gzip_compress: - outf = gzip.open(outfile, "wt", newline="", encoding=encoding, errors=errors) - else: - outf = open(outfile, "w", newline="", encoding=encoding, errors=errors) - writer = csv.writer(outf) - return (writer, outf) - - -def json2csv_entities( - tweets_file, - outfile, - main_fields, - entity_type, - entity_fields, - encoding="utf8", - errors="replace", - gzip_compress=False, -): - """ - Extract selected fields from a file of line-separated JSON tweets and - write to a file in CSV format. - - This utility function allows a file of full Tweets to be easily converted - to a CSV file for easier processing of Twitter entities. For example, the - hashtags or media elements of a tweet can be extracted. - - It returns one line per entity of a Tweet, e.g. if a tweet has two hashtags - there will be two lines in the output file, one per hashtag - - :param tweets_file: the file-like object containing full Tweets - - :param str outfile: The path of the text file where results should be\ - written - - :param list main_fields: The list of fields to be extracted from the main\ - object, usually the tweet. Useful examples: 'id_str' for the tweetID. See\ - for a full list of fields. - e. g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count'] - If `entity_type` is expressed with hierarchy, then it is the list of\ - fields of the object that corresponds to the key of the entity_type,\ - (e.g., for entity_type='user.urls', the fields in the main_fields list\ - belong to the user object; for entity_type='place.bounding_box', the\ - files in the main_field list belong to the place object of the tweet). - - :param list entity_type: The name of the entity: 'hashtags', 'media',\ - 'urls' and 'user_mentions' for the tweet object. For a user object,\ - this needs to be expressed with a hierarchy: `'user.urls'`. For the\ - bounding box of the Tweet location, use `'place.bounding_box'`. - - :param list entity_fields: The list of fields to be extracted from the\ - entity. E.g. 
`['text']` (of the Tweet) - - :param error: Behaviour for encoding errors, see\ - https://docs.python.org/3/library/codecs.html#codec-base-classes - - :param gzip_compress: if `True`, output files are compressed with gzip - """ - - (writer, outf) = _outf_writer(outfile, encoding, errors, gzip_compress) - header = get_header_field_list(main_fields, entity_type, entity_fields) - writer.writerow(header) - for line in tweets_file: - tweet = json.loads(line) - if _is_composed_key(entity_type): - key, value = _get_key_value_composed(entity_type) - object_json = _get_entity_recursive(tweet, key) - if not object_json: - # this can happen in the case of "place" - continue - object_fields = extract_fields(object_json, main_fields) - items = _get_entity_recursive(object_json, value) - _write_to_file(object_fields, items, entity_fields, writer) - else: - tweet_fields = extract_fields(tweet, main_fields) - items = _get_entity_recursive(tweet, entity_type) - _write_to_file(tweet_fields, items, entity_fields, writer) - outf.close() - - -def get_header_field_list(main_fields, entity_type, entity_fields): - if _is_composed_key(entity_type): - key, value = _get_key_value_composed(entity_type) - main_entity = key - sub_entity = value - else: - main_entity = None - sub_entity = entity_type - - if main_entity: - output1 = [HIER_SEPARATOR.join([main_entity, x]) for x in main_fields] - else: - output1 = main_fields - output2 = [HIER_SEPARATOR.join([sub_entity, x]) for x in entity_fields] - return output1 + output2 - - -def _write_to_file(object_fields, items, entity_fields, writer): - if not items: - # it could be that the entity is just not present for the tweet - # e.g. tweet hashtag is always present, even as [], however - # tweet media may not be present - return - if isinstance(items, dict): - # this happens e.g. for "place" of a tweet - row = object_fields - # there might be composed keys in de list of required fields - entity_field_values = [x for x in entity_fields if not _is_composed_key(x)] - entity_field_composed = [x for x in entity_fields if _is_composed_key(x)] - for field in entity_field_values: - value = items[field] - if isinstance(value, list): - row += value - else: - row += [value] - # now check required dictionaries - for d in entity_field_composed: - kd, vd = _get_key_value_composed(d) - json_dict = items[kd] - if not isinstance(json_dict, dict): - raise RuntimeError( - """Key {} does not contain a dictionary - in the json file""".format( - kd - ) - ) - row += [json_dict[vd]] - writer.writerow(row) - return - # in general it is a list - for item in items: - row = object_fields + extract_fields(item, entity_fields) - writer.writerow(row) diff --git a/pipeline/nltk/twitter/twitter_demo.py b/pipeline/nltk/twitter/twitter_demo.py deleted file mode 100644 index 554bdfef511190b28504f9ded8dc8a6098e16ed9..0000000000000000000000000000000000000000 --- a/pipeline/nltk/twitter/twitter_demo.py +++ /dev/null @@ -1,306 +0,0 @@ -# Natural Language Toolkit: Twitter client -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Ewan Klein -# Lorenzo Rubio -# URL: -# For license information, see LICENSE.TXT - -""" -Examples to demo the :py:mod:`twitterclient` code. - -These demo functions should all run, with the following caveats: - -* You must have obtained API keys from Twitter, and installed them according to - the instructions in the `twitter HOWTO `_. - -* If you are on a slow network, some of the calls to the Twitter API may - timeout. 
- -* If you are being rate limited while searching, you will receive a 420 - error response. - -* Your terminal window / console must be able to display UTF-8 encoded characters. - -For documentation about the Twitter APIs, see `The Streaming APIs Overview -`_ and `The REST APIs Overview -`_. - -For error codes see Twitter's -`Error Codes and Responses ` -""" - -import datetime -import json -from functools import wraps -from io import StringIO - -from nltk.twitter import ( - Query, - Streamer, - TweetViewer, - TweetWriter, - Twitter, - credsfromfile, -) - -SPACER = "###################################" - - -def verbose(func): - """Decorator for demo functions""" - - @wraps(func) - def with_formatting(*args, **kwargs): - print() - print(SPACER) - print("Using %s" % (func.__name__)) - print(SPACER) - return func(*args, **kwargs) - - return with_formatting - - -def yesterday(): - """ - Get yesterday's datetime as a 5-tuple. - """ - date = datetime.datetime.now() - date -= datetime.timedelta(days=1) - date_tuple = date.timetuple()[:6] - return date_tuple - - -def setup(): - """ - Initialize global variables for the demos. - """ - global USERIDS, FIELDS - - USERIDS = ["759251", "612473", "15108702", "6017542", "2673523800"] - # UserIDs corresponding to\ - # @CNN, @BBCNews, @ReutersLive, @BreakingNews, @AJELive - FIELDS = ["id_str"] - - -@verbose -def twitterclass_demo(): - """ - Use the simplified :class:`Twitter` class to write some tweets to a file. - """ - tw = Twitter() - print("Track from the public stream\n") - tw.tweets(keywords="love, hate", limit=10) # public stream - print(SPACER) - print("Search past Tweets\n") - tw = Twitter() - tw.tweets(keywords="love, hate", stream=False, limit=10) # search past tweets - print(SPACER) - print( - "Follow two accounts in the public stream" - + " -- be prepared to wait a few minutes\n" - ) - tw = Twitter() - tw.tweets(follow=["759251", "6017542"], stream=True, limit=5) # public stream - - -@verbose -def sampletoscreen_demo(limit=20): - """ - Sample from the Streaming API and send output to terminal. - """ - oauth = credsfromfile() - client = Streamer(**oauth) - client.register(TweetViewer(limit=limit)) - client.sample() - - -@verbose -def tracktoscreen_demo(track="taylor swift", limit=10): - """ - Track keywords from the public Streaming API and send output to terminal. - """ - oauth = credsfromfile() - client = Streamer(**oauth) - client.register(TweetViewer(limit=limit)) - client.filter(track=track) - - -@verbose -def search_demo(keywords="nltk"): - """ - Use the REST API to search for past tweets containing a given keyword. - """ - oauth = credsfromfile() - client = Query(**oauth) - for tweet in client.search_tweets(keywords=keywords, limit=10): - print(tweet["text"]) - - -@verbose -def tweets_by_user_demo(user="NLTK_org", count=200): - """ - Use the REST API to search for past tweets by a given user. - """ - oauth = credsfromfile() - client = Query(**oauth) - client.register(TweetWriter()) - client.user_tweets(user, count) - - -@verbose -def lookup_by_userid_demo(): - """ - Use the REST API to convert a userID to a screen name. 
- """ - oauth = credsfromfile() - client = Query(**oauth) - user_info = client.user_info_from_id(USERIDS) - for info in user_info: - name = info["screen_name"] - followers = info["followers_count"] - following = info["friends_count"] - print(f"{name}, followers: {followers}, following: {following}") - - -@verbose -def followtoscreen_demo(limit=10): - """ - Using the Streaming API, select just the tweets from a specified list of - userIDs. - - This is will only give results in a reasonable time if the users in - question produce a high volume of tweets, and may even so show some delay. - """ - oauth = credsfromfile() - client = Streamer(**oauth) - client.register(TweetViewer(limit=limit)) - client.statuses.filter(follow=USERIDS) - - -@verbose -def streamtofile_demo(limit=20): - """ - Write 20 tweets sampled from the public Streaming API to a file. - """ - oauth = credsfromfile() - client = Streamer(**oauth) - client.register(TweetWriter(limit=limit, repeat=False)) - client.statuses.sample() - - -@verbose -def limit_by_time_demo(keywords="nltk"): - """ - Query the REST API for Tweets about NLTK since yesterday and send - the output to terminal. - - This example makes the assumption that there are sufficient Tweets since - yesterday for the date to be an effective cut-off. - """ - date = yesterday() - dt_date = datetime.datetime(*date) - oauth = credsfromfile() - client = Query(**oauth) - client.register(TweetViewer(limit=100, lower_date_limit=date)) - - print(f"Cutoff date: {dt_date}\n") - - for tweet in client.search_tweets(keywords=keywords): - print("{} ".format(tweet["created_at"]), end="") - client.handler.handle(tweet) - - -@verbose -def corpusreader_demo(): - """ - Use `TwitterCorpusReader` tp read a file of tweets, and print out - - * some full tweets in JSON format; - * some raw strings from the tweets (i.e., the value of the `text` field); and - * the result of tokenising the raw strings. - - """ - from nltk.corpus import twitter_samples as tweets - - print() - print("Complete tweet documents") - print(SPACER) - for tweet in tweets.docs("tweets.20150430-223406.json")[:1]: - print(json.dumps(tweet, indent=1, sort_keys=True)) - - print() - print("Raw tweet strings:") - print(SPACER) - for text in tweets.strings("tweets.20150430-223406.json")[:15]: - print(text) - - print() - print("Tokenized tweet strings:") - print(SPACER) - for toks in tweets.tokenized("tweets.20150430-223406.json")[:15]: - print(toks) - - -@verbose -def expand_tweetids_demo(): - """ - Given a file object containing a list of Tweet IDs, fetch the - corresponding full Tweets, if available. - - """ - ids_f = StringIO( - """\ - 588665495492124672 - 588665495487909888 - 588665495508766721 - 588665495513006080 - 588665495517200384 - 588665495487811584 - 588665495525588992 - 588665495487844352 - 588665495492014081 - 588665495512948737""" - ) - oauth = credsfromfile() - client = Query(**oauth) - hydrated = client.expand_tweetids(ids_f) - - for tweet in hydrated: - id_str = tweet["id_str"] - print(f"id: {id_str}") - text = tweet["text"] - if text.startswith("@null"): - text = "[Tweet not available]" - print(text + "\n") - - -ALL = [ - twitterclass_demo, - sampletoscreen_demo, - tracktoscreen_demo, - search_demo, - tweets_by_user_demo, - lookup_by_userid_demo, - followtoscreen_demo, - streamtofile_demo, - limit_by_time_demo, - corpusreader_demo, - expand_tweetids_demo, -] - -""" -Select demo functions to run. E.g. replace the following line with "DEMOS = -ALL[8:]" to execute only the final three demos. 
-""" -DEMOS = ALL[:] - -if __name__ == "__main__": - setup() - - for demo in DEMOS: - demo() - - print("\n" + SPACER) - print("All demos completed") - print(SPACER) diff --git a/pipeline/nltk/twitter/twitterclient.py b/pipeline/nltk/twitter/twitterclient.py deleted file mode 100644 index d556738e0849faf35454166cec8a5949fcca93dc..0000000000000000000000000000000000000000 --- a/pipeline/nltk/twitter/twitterclient.py +++ /dev/null @@ -1,564 +0,0 @@ -# Natural Language Toolkit: Twitter client -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Ewan Klein -# Lorenzo Rubio -# URL: -# For license information, see LICENSE.TXT - - -""" -NLTK Twitter client - -This module offers methods for collecting and processing Tweets. Most of the -functionality depends on access to the Twitter APIs, and this is handled via -the third party Twython library. - -If one of the methods below returns an integer, it is probably a `Twitter -error code `_. For -example, the response of '420' means that you have reached the limit of the -requests you can currently make to the Twitter API. Currently, `rate limits -for the search API `_ are -divided into 15 minute windows. -""" - -import datetime -import gzip -import itertools -import json -import os -import time - -import requests -from twython import Twython, TwythonStreamer -from twython.exceptions import TwythonError, TwythonRateLimitError - -from nltk.twitter.api import BasicTweetHandler, TweetHandlerI -from nltk.twitter.util import credsfromfile, guess_path - - -class Streamer(TwythonStreamer): - """ - Retrieve data from the Twitter Streaming API. - - The streaming API requires - `OAuth 1.0 `_ authentication. - """ - - def __init__(self, app_key, app_secret, oauth_token, oauth_token_secret): - - self.handler = None - self.do_continue = True - TwythonStreamer.__init__( - self, app_key, app_secret, oauth_token, oauth_token_secret - ) - - def register(self, handler): - """ - Register a method for handling Tweets. - - :param TweetHandlerI handler: method for viewing - """ - self.handler = handler - - def on_success(self, data): - """ - :param data: response from Twitter API - """ - if self.do_continue: - if self.handler is not None: - if "text" in data: - self.handler.counter += 1 - self.handler.handle(data) - self.do_continue = self.handler.do_continue() - else: - raise ValueError("No data handler has been registered.") - else: - self.disconnect() - self.handler.on_finish() - - def on_error(self, status_code, data): - """ - :param status_code: The status code returned by the Twitter API - :param data: The response from Twitter API - - """ - print(status_code) - - def sample(self): - """ - Wrapper for 'statuses / sample' API call - """ - while self.do_continue: - - # Stream in an endless loop until limit is reached. 
See twython - # issue 288: https://github.com/ryanmcgrath/twython/issues/288 - # colditzjb commented on 9 Dec 2014 - - try: - self.statuses.sample() - except requests.exceptions.ChunkedEncodingError as e: - if e is not None: - print(f"Error (stream will continue): {e}") - continue - - def filter(self, track="", follow="", lang="en"): - """ - Wrapper for 'statuses / filter' API call - """ - while self.do_continue: - # Stream in an endless loop until limit is reached - - try: - if track == "" and follow == "": - msg = "Please supply a value for 'track', 'follow'" - raise ValueError(msg) - self.statuses.filter(track=track, follow=follow, lang=lang) - except requests.exceptions.ChunkedEncodingError as e: - if e is not None: - print(f"Error (stream will continue): {e}") - continue - - -class Query(Twython): - """ - Retrieve data from the Twitter REST API. - """ - - def __init__(self, app_key, app_secret, oauth_token, oauth_token_secret): - """ - :param app_key: (optional) Your applications key - :param app_secret: (optional) Your applications secret key - :param oauth_token: (optional) When using **OAuth 1**, combined with - oauth_token_secret to make authenticated calls - :param oauth_token_secret: (optional) When using **OAuth 1** combined - with oauth_token to make authenticated calls - """ - self.handler = None - self.do_continue = True - Twython.__init__(self, app_key, app_secret, oauth_token, oauth_token_secret) - - def register(self, handler): - """ - Register a method for handling Tweets. - - :param TweetHandlerI handler: method for viewing or writing Tweets to a file. - """ - self.handler = handler - - def expand_tweetids(self, ids_f, verbose=True): - """ - Given a file object containing a list of Tweet IDs, fetch the - corresponding full Tweets from the Twitter API. - - The API call `statuses/lookup` will fail to retrieve a Tweet if the - user has deleted it. - - This call to the Twitter API is rate-limited. See - for details. - - :param ids_f: input file object consisting of Tweet IDs, one to a line - :return: iterable of Tweet objects in JSON format - """ - ids = [line.strip() for line in ids_f if line] - - if verbose: - print(f"Counted {len(ids)} Tweet IDs in {ids_f}.") - - # The Twitter endpoint takes lists of up to 100 ids, so we chunk the - # ids. - id_chunks = [ids[i : i + 100] for i in range(0, len(ids), 100)] - - chunked_tweets = (self.lookup_status(id=chunk) for chunk in id_chunks) - - return itertools.chain.from_iterable(chunked_tweets) - - def _search_tweets(self, keywords, limit=100, lang="en"): - """ - Assumes that the handler has been informed. Fetches Tweets from - search_tweets generator output and passses them to handler - - :param str keywords: A list of query terms to search for, written as\ - a comma-separated string. - :param int limit: Number of Tweets to process - :param str lang: language - """ - while True: - tweets = self.search_tweets( - keywords=keywords, limit=limit, lang=lang, max_id=self.handler.max_id - ) - for tweet in tweets: - self.handler.handle(tweet) - if not (self.handler.do_continue() and self.handler.repeat): - break - self.handler.on_finish() - - def search_tweets( - self, - keywords, - limit=100, - lang="en", - max_id=None, - retries_after_twython_exception=0, - ): - """ - Call the REST API ``'search/tweets'`` endpoint with some plausible - defaults. See `the Twitter search documentation - `_ for more information - about admissible search parameters. 
- - :param str keywords: A list of query terms to search for, written as\ - a comma-separated string - :param int limit: Number of Tweets to process - :param str lang: language - :param int max_id: id of the last tweet fetched - :param int retries_after_twython_exception: number of retries when\ - searching Tweets before raising an exception - :rtype: python generator - """ - if not self.handler: - # if no handler is provided, `BasicTweetHandler` provides minimum - # functionality for limiting the number of Tweets retrieved - self.handler = BasicTweetHandler(limit=limit) - - count_from_query = 0 - if max_id: - self.handler.max_id = max_id - else: - results = self.search( - q=keywords, count=min(100, limit), lang=lang, result_type="recent" - ) - count = len(results["statuses"]) - if count == 0: - print("No Tweets available through REST API for those keywords") - return - count_from_query = count - self.handler.max_id = results["statuses"][count - 1]["id"] - 1 - - for result in results["statuses"]: - yield result - self.handler.counter += 1 - if self.handler.do_continue() == False: - return - - # Pagination loop: keep fetching Tweets until the desired count is - # reached while dealing with Twitter rate limits. - retries = 0 - while count_from_query < limit: - try: - mcount = min(100, limit - count_from_query) - results = self.search( - q=keywords, - count=mcount, - lang=lang, - max_id=self.handler.max_id, - result_type="recent", - ) - except TwythonRateLimitError as e: - print(f"Waiting for 15 minutes -{e}") - time.sleep(15 * 60) # wait 15 minutes - continue - except TwythonError as e: - print(f"Fatal error in Twython request -{e}") - if retries_after_twython_exception == retries: - raise e - retries += 1 - - count = len(results["statuses"]) - if count == 0: - print("No more Tweets available through rest api") - return - count_from_query += count - # the max_id is also present in the Tweet metadata - # results['search_metadata']['next_results'], but as part of a - # query and difficult to fetch. This is doing the equivalent - # (last tweet id minus one) - self.handler.max_id = results["statuses"][count - 1]["id"] - 1 - - for result in results["statuses"]: - yield result - self.handler.counter += 1 - if self.handler.do_continue() == False: - return - - def user_info_from_id(self, userids): - """ - Convert a list of userIDs into a variety of information about the users. - - See . - - :param list userids: A list of integer strings corresponding to Twitter userIDs - :rtype: list(json) - """ - return [self.show_user(user_id=userid) for userid in userids] - - def user_tweets(self, screen_name, limit, include_rts="false"): - """ - Return a collection of the most recent Tweets posted by the user - - :param str user: The user's screen name; the initial '@' symbol\ - should be omitted - :param int limit: The number of Tweets to recover; 200 is the maximum allowed - :param str include_rts: Whether to include statuses which have been\ - retweeted by the user; possible values are 'true' and 'false' - """ - data = self.get_user_timeline( - screen_name=screen_name, count=limit, include_rts=include_rts - ) - for item in data: - self.handler.handle(item) - - -class Twitter: - """ - Wrapper class with restricted functionality and fewer options. 
- """ - - def __init__(self): - self._oauth = credsfromfile() - self.streamer = Streamer(**self._oauth) - self.query = Query(**self._oauth) - - def tweets( - self, - keywords="", - follow="", - to_screen=True, - stream=True, - limit=100, - date_limit=None, - lang="en", - repeat=False, - gzip_compress=False, - ): - """ - Process some Tweets in a simple manner. - - :param str keywords: Keywords to use for searching or filtering - :param list follow: UserIDs to use for filtering Tweets from the public stream - :param bool to_screen: If `True`, display the tweet texts on the screen,\ - otherwise print to a file - - :param bool stream: If `True`, use the live public stream,\ - otherwise search past public Tweets - - :param int limit: The number of data items to process in the current\ - round of processing. - - :param tuple date_limit: The date at which to stop collecting\ - new data. This should be entered as a tuple which can serve as the\ - argument to `datetime.datetime`.\ - E.g. `date_limit=(2015, 4, 1, 12, 40)` for 12:30 pm on April 1 2015. - Note that, in the case of streaming, this is the maximum date, i.e.\ - a date in the future; if not, it is the minimum date, i.e. a date\ - in the past - - :param str lang: language - - :param bool repeat: A flag to determine whether multiple files should\ - be written. If `True`, the length of each file will be set by the\ - value of `limit`. Use only if `to_screen` is `False`. See also - :py:func:`handle`. - - :param gzip_compress: if `True`, output files are compressed with gzip. - """ - if stream: - upper_date_limit = date_limit - lower_date_limit = None - else: - upper_date_limit = None - lower_date_limit = date_limit - - if to_screen: - handler = TweetViewer( - limit=limit, - upper_date_limit=upper_date_limit, - lower_date_limit=lower_date_limit, - ) - else: - handler = TweetWriter( - limit=limit, - upper_date_limit=upper_date_limit, - lower_date_limit=lower_date_limit, - repeat=repeat, - gzip_compress=gzip_compress, - ) - - if to_screen: - handler = TweetViewer(limit=limit) - else: - if stream: - upper_date_limit = date_limit - lower_date_limit = None - else: - upper_date_limit = None - lower_date_limit = date_limit - - handler = TweetWriter( - limit=limit, - upper_date_limit=upper_date_limit, - lower_date_limit=lower_date_limit, - repeat=repeat, - gzip_compress=gzip_compress, - ) - - if stream: - self.streamer.register(handler) - if keywords == "" and follow == "": - self.streamer.sample() - else: - self.streamer.filter(track=keywords, follow=follow, lang=lang) - else: - self.query.register(handler) - if keywords == "": - raise ValueError("Please supply at least one keyword to search for.") - else: - self.query._search_tweets(keywords, limit=limit, lang=lang) - - -class TweetViewer(TweetHandlerI): - """ - Handle data by sending it to the terminal. - """ - - def handle(self, data): - """ - Direct data to `sys.stdout` - - :return: return ``False`` if processing should cease, otherwise return ``True``. - :rtype: bool - :param data: Tweet object returned by Twitter API - """ - text = data["text"] - print(text) - - self.check_date_limit(data) - if self.do_stop: - return - - def on_finish(self): - print(f"Written {self.counter} Tweets") - - -class TweetWriter(TweetHandlerI): - """ - Handle data by writing it to a file. 
- """ - - def __init__( - self, - limit=2000, - upper_date_limit=None, - lower_date_limit=None, - fprefix="tweets", - subdir="twitter-files", - repeat=False, - gzip_compress=False, - ): - """ - The difference between the upper and lower date limits depends on - whether Tweets are coming in an ascending date order (i.e. when - streaming) or descending date order (i.e. when searching past Tweets). - - :param int limit: number of data items to process in the current\ - round of processing. - - :param tuple upper_date_limit: The date at which to stop collecting new\ - data. This should be entered as a tuple which can serve as the\ - argument to `datetime.datetime`. E.g. `upper_date_limit=(2015, 4, 1, 12,\ - 40)` for 12:30 pm on April 1 2015. - - :param tuple lower_date_limit: The date at which to stop collecting new\ - data. See `upper_data_limit` for formatting. - - :param str fprefix: The prefix to use in creating file names for Tweet\ - collections. - - :param str subdir: The name of the directory where Tweet collection\ - files should be stored. - - :param bool repeat: flag to determine whether multiple files should be\ - written. If `True`, the length of each file will be set by the value\ - of `limit`. See also :py:func:`handle`. - - :param gzip_compress: if `True`, output files are compressed with gzip. - """ - self.fprefix = fprefix - self.subdir = guess_path(subdir) - self.gzip_compress = gzip_compress - self.fname = self.timestamped_file() - self.repeat = repeat - self.output = None - TweetHandlerI.__init__(self, limit, upper_date_limit, lower_date_limit) - - def timestamped_file(self): - """ - :return: timestamped file name - :rtype: str - """ - subdir = self.subdir - fprefix = self.fprefix - if subdir: - if not os.path.exists(subdir): - os.mkdir(subdir) - - fname = os.path.join(subdir, fprefix) - fmt = "%Y%m%d-%H%M%S" - timestamp = datetime.datetime.now().strftime(fmt) - if self.gzip_compress: - suffix = ".gz" - else: - suffix = "" - outfile = f"{fname}.{timestamp}.json{suffix}" - return outfile - - def handle(self, data): - """ - Write Twitter data as line-delimited JSON into one or more files. - - :return: return `False` if processing should cease, otherwise return `True`. - :param data: tweet object returned by Twitter API - """ - if self.startingup: - if self.gzip_compress: - self.output = gzip.open(self.fname, "w") - else: - self.output = open(self.fname, "w") - print(f"Writing to {self.fname}") - - json_data = json.dumps(data) - if self.gzip_compress: - self.output.write((json_data + "\n").encode("utf-8")) - else: - self.output.write(json_data + "\n") - - self.check_date_limit(data) - if self.do_stop: - return - - self.startingup = False - - def on_finish(self): - print(f"Written {self.counter} Tweets") - if self.output: - self.output.close() - - def do_continue(self): - if self.repeat == False: - return TweetHandlerI.do_continue(self) - - if self.do_stop: - # stop for a functional cause (e.g. 
date limit) - return False - - if self.counter == self.limit: - # repeat is True, thus close output file and - # create a new one - self._restart_file() - return True - - def _restart_file(self): - self.on_finish() - self.fname = self.timestamped_file() - self.startingup = True - self.counter = 0 diff --git a/pipeline/nltk/twitter/util.py b/pipeline/nltk/twitter/util.py deleted file mode 100644 index adfa08853867280da85642778c3e9fb89a532574..0000000000000000000000000000000000000000 --- a/pipeline/nltk/twitter/util.py +++ /dev/null @@ -1,147 +0,0 @@ -# Natural Language Toolkit: Twitter client -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Ewan Klein -# Lorenzo Rubio -# URL: -# For license information, see LICENSE.TXT - -""" -Authentication utilities to accompany `twitterclient`. -""" - -import os -import pprint - -from twython import Twython - - -def credsfromfile(creds_file=None, subdir=None, verbose=False): - """ - Convenience function for authentication - """ - return Authenticate().load_creds( - creds_file=creds_file, subdir=subdir, verbose=verbose - ) - - -class Authenticate: - """ - Methods for authenticating with Twitter. - """ - - def __init__(self): - self.creds_file = "credentials.txt" - self.creds_fullpath = None - - self.oauth = {} - try: - self.twitter_dir = os.environ["TWITTER"] - self.creds_subdir = self.twitter_dir - except KeyError: - self.twitter_dir = None - self.creds_subdir = None - - def load_creds(self, creds_file=None, subdir=None, verbose=False): - """ - Read OAuth credentials from a text file. - - File format for OAuth 1:: - - app_key=YOUR_APP_KEY - app_secret=YOUR_APP_SECRET - oauth_token=OAUTH_TOKEN - oauth_token_secret=OAUTH_TOKEN_SECRET - - - File format for OAuth 2:: - - app_key=YOUR_APP_KEY - app_secret=YOUR_APP_SECRET - access_token=ACCESS_TOKEN - - :param str file_name: File containing credentials. ``None`` (default) reads - data from `TWITTER/'credentials.txt'` - """ - if creds_file is not None: - self.creds_file = creds_file - - if subdir is None: - if self.creds_subdir is None: - msg = ( - "Supply a value to the 'subdir' parameter or" - + " set the TWITTER environment variable." - ) - raise ValueError(msg) - else: - self.creds_subdir = subdir - - self.creds_fullpath = os.path.normpath( - os.path.join(self.creds_subdir, self.creds_file) - ) - - if not os.path.isfile(self.creds_fullpath): - raise OSError(f"Cannot find file {self.creds_fullpath}") - - with open(self.creds_fullpath) as infile: - if verbose: - print(f"Reading credentials file {self.creds_fullpath}") - - for line in infile: - if "=" in line: - name, value = line.split("=", 1) - self.oauth[name.strip()] = value.strip() - - self._validate_creds_file(verbose=verbose) - - return self.oauth - - def _validate_creds_file(self, verbose=False): - """Check validity of a credentials file.""" - oauth1 = False - oauth1_keys = ["app_key", "app_secret", "oauth_token", "oauth_token_secret"] - oauth2 = False - oauth2_keys = ["app_key", "app_secret", "access_token"] - if all(k in self.oauth for k in oauth1_keys): - oauth1 = True - elif all(k in self.oauth for k in oauth2_keys): - oauth2 = True - - if not (oauth1 or oauth2): - msg = f"Missing or incorrect entries in {self.creds_file}\n" - msg += pprint.pformat(self.oauth) - raise ValueError(msg) - elif verbose: - print(f'Credentials file "{self.creds_file}" looks good') - - -def add_access_token(creds_file=None): - """ - For OAuth 2, retrieve an access token for an app and append it to a - credentials file. 
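# Sketch of the credential helpers above, assuming the TWITTER environment variable
# points at a directory containing a credentials.txt in the key=value format shown.
oauth = credsfromfile(verbose=True)   # {'app_key': ..., 'app_secret': ..., ...}
client = Query(**oauth)               # the dict can be splatted into the REST or streaming client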
- """ - if creds_file is None: - path = os.path.dirname(__file__) - creds_file = os.path.join(path, "credentials2.txt") - oauth2 = credsfromfile(creds_file=creds_file) - app_key = oauth2["app_key"] - app_secret = oauth2["app_secret"] - - twitter = Twython(app_key, app_secret, oauth_version=2) - access_token = twitter.obtain_access_token() - tok = f"access_token={access_token}\n" - with open(creds_file, "a") as infile: - print(tok, file=infile) - - -def guess_path(pth): - """ - If the path is not absolute, guess that it is a subdirectory of the - user's home directory. - - :param str pth: The pathname of the directory where files of tweets should be written - """ - if os.path.isabs(pth): - return pth - else: - return os.path.expanduser(os.path.join("~", pth)) diff --git a/pipeline/nltk/util.py b/pipeline/nltk/util.py deleted file mode 100644 index 4d2d96fb74f2ec375596ae8761f565351cbedf31..0000000000000000000000000000000000000000 --- a/pipeline/nltk/util.py +++ /dev/null @@ -1,1216 +0,0 @@ -# Natural Language Toolkit: Utility functions -# -# Copyright (C) 2001-2023 NLTK Project -# Author: Steven Bird -# Eric Kafe (acyclic closures) -# URL: -# For license information, see LICENSE.TXT - -import inspect -import locale -import os -import pydoc -import re -import textwrap -import warnings -from collections import defaultdict, deque -from itertools import chain, combinations, islice, tee -from pprint import pprint -from urllib.request import ( - HTTPPasswordMgrWithDefaultRealm, - ProxyBasicAuthHandler, - ProxyDigestAuthHandler, - ProxyHandler, - build_opener, - getproxies, - install_opener, -) - -from nltk.collections import * -from nltk.internals import deprecated, raise_unorderable_types, slice_bounds - -###################################################################### -# Short usage message -###################################################################### - - -@deprecated("Use help(obj) instead.") -def usage(obj): - str(obj) # In case it's lazy, this will load it. - - if not isinstance(obj, type): - obj = obj.__class__ - - print(f"{obj.__name__} supports the following operations:") - for (name, method) in sorted(pydoc.allmethods(obj).items()): - if name.startswith("_"): - continue - if getattr(method, "__deprecated__", False): - continue - - try: - sig = str(inspect.signature(method)) - except ValueError as e: - # builtins sometimes don't support introspection - if "builtin" in str(e): - continue - else: - raise - - args = sig.lstrip("(").rstrip(")").split(", ") - meth = inspect.getattr_static(obj, name) - if isinstance(meth, (classmethod, staticmethod)): - name = f"cls.{name}" - elif args and args[0] == "self": - name = f"self.{name}" - args.pop(0) - print( - textwrap.fill( - f"{name}({', '.join(args)})", - initial_indent=" - ", - subsequent_indent=" " * (len(name) + 5), - ) - ) - - -########################################################################## -# IDLE -########################################################################## - - -def in_idle(): - """ - Return True if this function is run within idle. Tkinter - programs that are run in idle should never call ``Tk.mainloop``; so - this function should be used to gate all calls to ``Tk.mainloop``. - - :warning: This function works by checking ``sys.stdin``. If the - user has modified ``sys.stdin``, then it may return incorrect - results. 
- :rtype: bool - """ - import sys - - return sys.stdin.__class__.__name__ in ("PyShell", "RPCProxy") - - -########################################################################## -# PRETTY PRINTING -########################################################################## - - -def pr(data, start=0, end=None): - """ - Pretty print a sequence of data items - - :param data: the data stream to print - :type data: sequence or iter - :param start: the start position - :type start: int - :param end: the end position - :type end: int - """ - pprint(list(islice(data, start, end))) - - -def print_string(s, width=70): - """ - Pretty print a string, breaking lines on whitespace - - :param s: the string to print, consisting of words and spaces - :type s: str - :param width: the display width - :type width: int - """ - print("\n".join(textwrap.wrap(s, width=width))) - - -def tokenwrap(tokens, separator=" ", width=70): - """ - Pretty print a list of text tokens, breaking lines on whitespace - - :param tokens: the tokens to print - :type tokens: list - :param separator: the string to use to separate tokens - :type separator: str - :param width: the display width (default=70) - :type width: int - """ - return "\n".join(textwrap.wrap(separator.join(tokens), width=width)) - - -########################################################################## -# Indexing -########################################################################## - - -class Index(defaultdict): - def __init__(self, pairs): - defaultdict.__init__(self, list) - for key, value in pairs: - self[key].append(value) - - -###################################################################### -## Regexp display (thanks to David Mertz) -###################################################################### - - -def re_show(regexp, string, left="{", right="}"): - """ - Return a string with markers surrounding the matched substrings. - Search str for substrings matching ``regexp`` and wrap the matches - with braces. This is convenient for learning about regular expressions. - - :param regexp: The regular expression. - :type regexp: str - :param string: The string being matched. - :type string: str - :param left: The left delimiter (printed before the matched substring) - :type left: str - :param right: The right delimiter (printed after the matched substring) - :type right: str - :rtype: str - """ - print(re.compile(regexp, re.M).sub(left + r"\g<0>" + right, string.rstrip())) - - -########################################################################## -# READ FROM FILE OR STRING -########################################################################## - -# recipe from David Mertz -def filestring(f): - if hasattr(f, "read"): - return f.read() - elif isinstance(f, str): - with open(f) as infile: - return infile.read() - else: - raise ValueError("Must be called with a filename or file-like object") - - -########################################################################## -# Breadth-First Search -########################################################################## - - -def breadth_first(tree, children=iter, maxdepth=-1): - """Traverse the nodes of a tree in breadth-first order. - (No check for cycles.) - The first argument should be the tree root; - children should be a function taking as argument a tree node - and returning an iterator of the node's children. 
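# A quick illustration of breadth_first above on a nested-list "tree": with the
# default children=iter, sublists are expanded level by level, and non-iterable
# leaves (the ints) end the descent via the TypeError guard.
tree = [1, [2, [4]], [3]]
print(list(breadth_first(tree)))
# [[1, [2, [4]], [3]], 1, [2, [4]], [3], 2, [4], 3, 4]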
- """ - queue = deque([(tree, 0)]) - - while queue: - node, depth = queue.popleft() - yield node - - if depth != maxdepth: - try: - queue.extend((c, depth + 1) for c in children(node)) - except TypeError: - pass - - -########################################################################## -# Graph Drawing -########################################################################## - - -def edge_closure(tree, children=iter, maxdepth=-1, verbose=False): - """Yield the edges of a graph in breadth-first order, - discarding eventual cycles. - The first argument should be the start node; - children should be a function taking as argument a graph node - and returning an iterator of the node's children. - - >>> from nltk.util import edge_closure - >>> print(list(edge_closure('A', lambda node:{'A':['B','C'], 'B':'C', 'C':'B'}[node]))) - [('A', 'B'), ('A', 'C'), ('B', 'C'), ('C', 'B')] - """ - traversed = set() - edges = set() - queue = deque([(tree, 0)]) - while queue: - node, depth = queue.popleft() - traversed.add(node) - if depth != maxdepth: - try: - for child in children(node): - if child not in traversed: - queue.append((child, depth + 1)) - else: - if verbose: - warnings.warn( - f"Discarded redundant search for {child} at depth {depth + 1}", - stacklevel=2, - ) - edge = (node, child) - if edge not in edges: - yield edge - edges.add(edge) - except TypeError: - pass - - -def edges2dot(edges, shapes=None, attr=None): - """ - :param edges: the set (or list) of edges of a directed graph. - - :return dot_string: a representation of 'edges' as a string in the DOT - graph language, which can be converted to an image by the 'dot' program - from the Graphviz package, or nltk.parse.dependencygraph.dot2img(dot_string). - - :param shapes: dictionary of strings that trigger a specified shape. - :param attr: dictionary with global graph attributes - - >>> import nltk - >>> from nltk.util import edges2dot - >>> print(edges2dot([('A', 'B'), ('A', 'C'), ('B', 'C'), ('C', 'B')])) - digraph G { - "A" -> "B"; - "A" -> "C"; - "B" -> "C"; - "C" -> "B"; - } - - """ - if not shapes: - shapes = dict() - if not attr: - attr = dict() - - dot_string = "digraph G {\n" - - for pair in attr.items(): - dot_string += f"{pair[0]} = {pair[1]};\n" - - for edge in edges: - for shape in shapes.items(): - for node in range(2): - if shape[0] in repr(edge[node]): - dot_string += f'"{edge[node]}" [shape = {shape[1]}];\n' - dot_string += f'"{edge[0]}" -> "{edge[1]}";\n' - - dot_string += "}\n" - return dot_string - - -def unweighted_minimum_spanning_digraph(tree, children=iter, shapes=None, attr=None): - """ - - Build a Minimum Spanning Tree (MST) of an unweighted graph, - by traversing the nodes of a tree in breadth-first order, - discarding eventual cycles. - - Return a representation of this MST as a string in the DOT graph language, - which can be converted to an image by the 'dot' program from the Graphviz - package, or nltk.parse.dependencygraph.dot2img(dot_string). - - The first argument should be the tree root; - children should be a function taking as argument a tree node - and returning an iterator of the node's children. 
- - >>> import nltk - >>> wn=nltk.corpus.wordnet - >>> from nltk.util import unweighted_minimum_spanning_digraph as umsd - >>> print(umsd(wn.synset('bound.a.01'), lambda s:s.also_sees())) - digraph G { - "Synset('bound.a.01')" -> "Synset('unfree.a.02')"; - "Synset('unfree.a.02')" -> "Synset('confined.a.02')"; - "Synset('unfree.a.02')" -> "Synset('dependent.a.01')"; - "Synset('unfree.a.02')" -> "Synset('restricted.a.01')"; - "Synset('restricted.a.01')" -> "Synset('classified.a.02')"; - } - - """ - return edges2dot( - edge_closure( - tree, lambda node: unweighted_minimum_spanning_dict(tree, children)[node] - ), - shapes, - attr, - ) - - -########################################################################## -# Breadth-First / Depth-first Searches with Cycle Detection -########################################################################## - - -def acyclic_breadth_first(tree, children=iter, maxdepth=-1): - """Traverse the nodes of a tree in breadth-first order, - discarding eventual cycles. - - The first argument should be the tree root; - children should be a function taking as argument a tree node - and returning an iterator of the node's children. - """ - traversed = set() - queue = deque([(tree, 0)]) - while queue: - node, depth = queue.popleft() - yield node - traversed.add(node) - if depth != maxdepth: - try: - for child in children(node): - if child not in traversed: - queue.append((child, depth + 1)) - else: - warnings.warn( - "Discarded redundant search for {} at depth {}".format( - child, depth + 1 - ), - stacklevel=2, - ) - except TypeError: - pass - - -def acyclic_depth_first(tree, children=iter, depth=-1, cut_mark=None, traversed=None): - """Traverse the nodes of a tree in depth-first order, - discarding eventual cycles within any branch, - adding cut_mark (when specified) if cycles were truncated. - - The first argument should be the tree root; - children should be a function taking as argument a tree node - and returning an iterator of the node's children. 
- - Catches all cycles: - - >>> import nltk - >>> from nltk.util import acyclic_depth_first as acyclic_tree - >>> wn=nltk.corpus.wordnet - >>> from pprint import pprint - >>> pprint(acyclic_tree(wn.synset('dog.n.01'), lambda s:s.hypernyms(),cut_mark='...')) - [Synset('dog.n.01'), - [Synset('canine.n.02'), - [Synset('carnivore.n.01'), - [Synset('placental.n.01'), - [Synset('mammal.n.01'), - [Synset('vertebrate.n.01'), - [Synset('chordate.n.01'), - [Synset('animal.n.01'), - [Synset('organism.n.01'), - [Synset('living_thing.n.01'), - [Synset('whole.n.02'), - [Synset('object.n.01'), - [Synset('physical_entity.n.01'), - [Synset('entity.n.01')]]]]]]]]]]]]], - [Synset('domestic_animal.n.01'), "Cycle(Synset('animal.n.01'),-3,...)"]] - """ - if traversed is None: - traversed = {tree} - out_tree = [tree] - if depth != 0: - try: - for child in children(tree): - if child not in traversed: - # Recurse with a common "traversed" set for all children: - traversed.add(child) - out_tree += [ - acyclic_depth_first( - child, children, depth - 1, cut_mark, traversed - ) - ] - else: - warnings.warn( - "Discarded redundant search for {} at depth {}".format( - child, depth - 1 - ), - stacklevel=3, - ) - if cut_mark: - out_tree += [f"Cycle({child},{depth - 1},{cut_mark})"] - except TypeError: - pass - elif cut_mark: - out_tree += [cut_mark] - return out_tree - - -def acyclic_branches_depth_first( - tree, children=iter, depth=-1, cut_mark=None, traversed=None -): - """Traverse the nodes of a tree in depth-first order, - discarding eventual cycles within the same branch, - but keep duplicate paths in different branches. - Add cut_mark (when defined) if cycles were truncated. - - The first argument should be the tree root; - children should be a function taking as argument a tree node - and returning an iterator of the node's children. 
- - Catches only only cycles within the same branch, - but keeping cycles from different branches: - - >>> import nltk - >>> from nltk.util import acyclic_branches_depth_first as tree - >>> wn=nltk.corpus.wordnet - >>> from pprint import pprint - >>> pprint(tree(wn.synset('certified.a.01'), lambda s:s.also_sees(), cut_mark='...', depth=4)) - [Synset('certified.a.01'), - [Synset('authorized.a.01'), - [Synset('lawful.a.01'), - [Synset('legal.a.01'), - "Cycle(Synset('lawful.a.01'),0,...)", - [Synset('legitimate.a.01'), '...']], - [Synset('straight.a.06'), - [Synset('honest.a.01'), '...'], - "Cycle(Synset('lawful.a.01'),0,...)"]], - [Synset('legitimate.a.01'), - "Cycle(Synset('authorized.a.01'),1,...)", - [Synset('legal.a.01'), - [Synset('lawful.a.01'), '...'], - "Cycle(Synset('legitimate.a.01'),0,...)"], - [Synset('valid.a.01'), - "Cycle(Synset('legitimate.a.01'),0,...)", - [Synset('reasonable.a.01'), '...']]], - [Synset('official.a.01'), "Cycle(Synset('authorized.a.01'),1,...)"]], - [Synset('documented.a.01')]] - """ - if traversed is None: - traversed = {tree} - out_tree = [tree] - if depth != 0: - try: - for child in children(tree): - if child not in traversed: - # Recurse with a different "traversed" set for each child: - out_tree += [ - acyclic_branches_depth_first( - child, - children, - depth - 1, - cut_mark, - traversed.union({child}), - ) - ] - else: - warnings.warn( - "Discarded redundant search for {} at depth {}".format( - child, depth - 1 - ), - stacklevel=3, - ) - if cut_mark: - out_tree += [f"Cycle({child},{depth - 1},{cut_mark})"] - except TypeError: - pass - elif cut_mark: - out_tree += [cut_mark] - return out_tree - - -def acyclic_dic2tree(node, dic): - """Convert acyclic dictionary 'dic', where the keys are nodes, and the - values are lists of children, to output tree suitable for pprint(), - starting at root 'node', with subtrees as nested lists.""" - return [node] + [acyclic_dic2tree(child, dic) for child in dic[node]] - - -def unweighted_minimum_spanning_dict(tree, children=iter): - """ - Output a dictionary representing a Minimum Spanning Tree (MST) - of an unweighted graph, by traversing the nodes of a tree in - breadth-first order, discarding eventual cycles. - - The first argument should be the tree root; - children should be a function taking as argument a tree node - and returning an iterator of the node's children. 
- - >>> import nltk - >>> from nltk.corpus import wordnet as wn - >>> from nltk.util import unweighted_minimum_spanning_dict as umsd - >>> from pprint import pprint - >>> pprint(umsd(wn.synset('bound.a.01'), lambda s:s.also_sees())) - {Synset('bound.a.01'): [Synset('unfree.a.02')], - Synset('classified.a.02'): [], - Synset('confined.a.02'): [], - Synset('dependent.a.01'): [], - Synset('restricted.a.01'): [Synset('classified.a.02')], - Synset('unfree.a.02'): [Synset('confined.a.02'), - Synset('dependent.a.01'), - Synset('restricted.a.01')]} - - """ - traversed = set() # Empty set of traversed nodes - queue = deque([tree]) # Initialize queue - agenda = {tree} # Set of all nodes ever queued - mstdic = {} # Empty MST dictionary - while queue: - node = queue.popleft() # Node is not yet in the MST dictionary, - mstdic[node] = [] # so add it with an empty list of children - if node not in traversed: # Avoid cycles - traversed.add(node) - for child in children(node): - if child not in agenda: # Queue nodes only once - mstdic[node].append(child) # Add child to the MST - queue.append(child) # Add child to queue - agenda.add(child) - return mstdic - - -def unweighted_minimum_spanning_tree(tree, children=iter): - """ - Output a Minimum Spanning Tree (MST) of an unweighted graph, - by traversing the nodes of a tree in breadth-first order, - discarding eventual cycles. - - The first argument should be the tree root; - children should be a function taking as argument a tree node - and returning an iterator of the node's children. - - >>> import nltk - >>> from nltk.util import unweighted_minimum_spanning_tree as mst - >>> wn=nltk.corpus.wordnet - >>> from pprint import pprint - >>> pprint(mst(wn.synset('bound.a.01'), lambda s:s.also_sees())) - [Synset('bound.a.01'), - [Synset('unfree.a.02'), - [Synset('confined.a.02')], - [Synset('dependent.a.01')], - [Synset('restricted.a.01'), [Synset('classified.a.02')]]]] - """ - return acyclic_dic2tree(tree, unweighted_minimum_spanning_dict(tree, children)) - - -########################################################################## -# Guess Character Encoding -########################################################################## - -# adapted from io.py in the docutils extension module (https://docutils.sourceforge.io/) -# http://www.pyzine.com/Issue008/Section_Articles/article_Encodings.html - - -def guess_encoding(data): - """ - Given a byte string, attempt to decode it. - Tries the standard 'UTF8' and 'latin-1' encodings, - Plus several gathered from locale information. - - The calling program *must* first call:: - - locale.setlocale(locale.LC_ALL, '') - - If successful it returns ``(decoded_unicode, successful_encoding)``. - If unsuccessful it raises a ``UnicodeError``. 
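# A small sketch of guess_encoding above: the caller sets the locale first, then the
# function tries utf-8, locale-derived codecs and latin-1 in that order.
import locale
locale.setlocale(locale.LC_ALL, "")
print(guess_encoding("café".encode("utf-8")))    # ('café', 'utf-8')
print(guess_encoding("café".encode("latin-1")))  # ('café', 'latin-1') on a UTF-8 locale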
- """ - successful_encoding = None - # we make 'utf-8' the first encoding - encodings = ["utf-8"] - # - # next we add anything we can learn from the locale - try: - encodings.append(locale.nl_langinfo(locale.CODESET)) - except AttributeError: - pass - try: - encodings.append(locale.getlocale()[1]) - except (AttributeError, IndexError): - pass - try: - encodings.append(locale.getdefaultlocale()[1]) - except (AttributeError, IndexError): - pass - # - # we try 'latin-1' last - encodings.append("latin-1") - for enc in encodings: - # some of the locale calls - # may have returned None - if not enc: - continue - try: - decoded = str(data, enc) - successful_encoding = enc - - except (UnicodeError, LookupError): - pass - else: - break - if not successful_encoding: - raise UnicodeError( - "Unable to decode input data. " - "Tried the following encodings: %s." - % ", ".join([repr(enc) for enc in encodings if enc]) - ) - else: - return (decoded, successful_encoding) - - -########################################################################## -# Remove repeated elements from a list deterministcally -########################################################################## - - -def unique_list(xs): - seen = set() - # not seen.add(x) here acts to make the code shorter without using if statements, seen.add(x) always returns None. - return [x for x in xs if x not in seen and not seen.add(x)] - - -########################################################################## -# Invert a dictionary -########################################################################## - - -def invert_dict(d): - inverted_dict = defaultdict(list) - for key in d: - if hasattr(d[key], "__iter__"): - for term in d[key]: - inverted_dict[term].append(key) - else: - inverted_dict[d[key]] = key - return inverted_dict - - -########################################################################## -# Utilities for directed graphs: transitive closure, and inversion -# The graph is represented as a dictionary of sets -########################################################################## - - -def transitive_closure(graph, reflexive=False): - """ - Calculate the transitive closure of a directed graph, - optionally the reflexive transitive closure. - - The algorithm is a slight modification of the "Marking Algorithm" of - Ioannidis & Ramakrishnan (1998) "Efficient Transitive Closure Algorithms". - - :param graph: the initial graph, represented as a dictionary of sets - :type graph: dict(set) - :param reflexive: if set, also make the closure reflexive - :type reflexive: bool - :rtype: dict(set) - """ - if reflexive: - base_set = lambda k: {k} - else: - base_set = lambda k: set() - # The graph U_i in the article: - agenda_graph = {k: graph[k].copy() for k in graph} - # The graph M_i in the article: - closure_graph = {k: base_set(k) for k in graph} - for i in graph: - agenda = agenda_graph[i] - closure = closure_graph[i] - while agenda: - j = agenda.pop() - closure.add(j) - closure |= closure_graph.setdefault(j, base_set(j)) - agenda |= agenda_graph.get(j, base_set(j)) - agenda -= closure - return closure_graph - - -def invert_graph(graph): - """ - Inverts a directed graph. 
- - :param graph: the graph, represented as a dictionary of sets - :type graph: dict(set) - :return: the inverted graph - :rtype: dict(set) - """ - inverted = {} - for key in graph: - for value in graph[key]: - inverted.setdefault(value, set()).add(key) - return inverted - - -########################################################################## -# HTML Cleaning -########################################################################## - - -def clean_html(html): - raise NotImplementedError( - "To remove HTML markup, use BeautifulSoup's get_text() function" - ) - - -def clean_url(url): - raise NotImplementedError( - "To remove HTML markup, use BeautifulSoup's get_text() function" - ) - - -########################################################################## -# FLATTEN LISTS -########################################################################## - - -def flatten(*args): - """ - Flatten a list. - - >>> from nltk.util import flatten - >>> flatten(1, 2, ['b', 'a' , ['c', 'd']], 3) - [1, 2, 'b', 'a', 'c', 'd', 3] - - :param args: items and lists to be combined into a single list - :rtype: list - """ - - x = [] - for l in args: - if not isinstance(l, (list, tuple)): - l = [l] - for item in l: - if isinstance(item, (list, tuple)): - x.extend(flatten(item)) - else: - x.append(item) - return x - - -########################################################################## -# Ngram iteration -########################################################################## - - -def pad_sequence( - sequence, - n, - pad_left=False, - pad_right=False, - left_pad_symbol=None, - right_pad_symbol=None, -): - """ - Returns a padded sequence of items before ngram extraction. - - >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='', right_pad_symbol='')) - ['', 1, 2, 3, 4, 5, ''] - >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='')) - ['', 1, 2, 3, 4, 5] - >>> list(pad_sequence([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='')) - [1, 2, 3, 4, 5, ''] - - :param sequence: the source data to be padded - :type sequence: sequence or iter - :param n: the degree of the ngrams - :type n: int - :param pad_left: whether the ngrams should be left-padded - :type pad_left: bool - :param pad_right: whether the ngrams should be right-padded - :type pad_right: bool - :param left_pad_symbol: the symbol to use for left padding (default is None) - :type left_pad_symbol: any - :param right_pad_symbol: the symbol to use for right padding (default is None) - :type right_pad_symbol: any - :rtype: sequence or iter - """ - sequence = iter(sequence) - if pad_left: - sequence = chain((left_pad_symbol,) * (n - 1), sequence) - if pad_right: - sequence = chain(sequence, (right_pad_symbol,) * (n - 1)) - return sequence - - -# add a flag to pad the sequence so we get peripheral ngrams? - - -def ngrams(sequence, n, **kwargs): - """ - Return the ngrams generated from a sequence of items, as an iterator. - For example: - - >>> from nltk.util import ngrams - >>> list(ngrams([1,2,3,4,5], 3)) - [(1, 2, 3), (2, 3, 4), (3, 4, 5)] - - Wrap with list for a list version of this function. Set pad_left - or pad_right to true in order to get additional ngrams: - - >>> list(ngrams([1,2,3,4,5], 2, pad_right=True)) - [(1, 2), (2, 3), (3, 4), (4, 5), (5, None)] - >>> list(ngrams([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='
    ')) - [(1, 2), (2, 3), (3, 4), (4, 5), (5, '
    ')] - >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='')) - [('', 1), (1, 2), (2, 3), (3, 4), (4, 5)] - >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='', right_pad_symbol='')) - [('', 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, '')] - - - :param sequence: the source data to be converted into ngrams - :type sequence: sequence or iter - :param n: the degree of the ngrams - :type n: int - :param pad_left: whether the ngrams should be left-padded - :type pad_left: bool - :param pad_right: whether the ngrams should be right-padded - :type pad_right: bool - :param left_pad_symbol: the symbol to use for left padding (default is None) - :type left_pad_symbol: any - :param right_pad_symbol: the symbol to use for right padding (default is None) - :type right_pad_symbol: any - :rtype: sequence or iter - """ - sequence = pad_sequence(sequence, n, **kwargs) - - # Creates the sliding window, of n no. of items. - # `iterables` is a tuple of iterables where each iterable is a window of n items. - iterables = tee(sequence, n) - - for i, sub_iterable in enumerate(iterables): # For each window, - for _ in range(i): # iterate through every order of ngrams - next(sub_iterable, None) # generate the ngrams within the window. - return zip(*iterables) # Unpack and flattens the iterables. - - -def bigrams(sequence, **kwargs): - """ - Return the bigrams generated from a sequence of items, as an iterator. - For example: - - >>> from nltk.util import bigrams - >>> list(bigrams([1,2,3,4,5])) - [(1, 2), (2, 3), (3, 4), (4, 5)] - - Use bigrams for a list version of this function. - - :param sequence: the source data to be converted into bigrams - :type sequence: sequence or iter - :rtype: iter(tuple) - """ - - yield from ngrams(sequence, 2, **kwargs) - - -def trigrams(sequence, **kwargs): - """ - Return the trigrams generated from a sequence of items, as an iterator. - For example: - - >>> from nltk.util import trigrams - >>> list(trigrams([1,2,3,4,5])) - [(1, 2, 3), (2, 3, 4), (3, 4, 5)] - - Use trigrams for a list version of this function. - - :param sequence: the source data to be converted into trigrams - :type sequence: sequence or iter - :rtype: iter(tuple) - """ - - yield from ngrams(sequence, 3, **kwargs) - - -def everygrams( - sequence, min_len=1, max_len=-1, pad_left=False, pad_right=False, **kwargs -): - """ - Returns all possible ngrams generated from a sequence of items, as an iterator. - - >>> sent = 'a b c'.split() - - New version outputs for everygrams. - >>> list(everygrams(sent)) - [('a',), ('a', 'b'), ('a', 'b', 'c'), ('b',), ('b', 'c'), ('c',)] - - Old version outputs for everygrams. - >>> sorted(everygrams(sent), key=len) - [('a',), ('b',), ('c',), ('a', 'b'), ('b', 'c'), ('a', 'b', 'c')] - - >>> list(everygrams(sent, max_len=2)) - [('a',), ('a', 'b'), ('b',), ('b', 'c'), ('c',)] - - :param sequence: the source data to be converted into ngrams. If max_len is - not provided, this sequence will be loaded into memory - :type sequence: sequence or iter - :param min_len: minimum length of the ngrams, aka. n-gram order/degree of ngram - :type min_len: int - :param max_len: maximum length of the ngrams (set to length of sequence by default) - :type max_len: int - :param pad_left: whether the ngrams should be left-padded - :type pad_left: bool - :param pad_right: whether the ngrams should be right-padded - :type pad_right: bool - :rtype: iter(tuple) - """ - - # Get max_len for padding. 
- if max_len == -1: - try: - max_len = len(sequence) - except TypeError: - sequence = list(sequence) - max_len = len(sequence) - - # Pad if indicated using max_len. - sequence = pad_sequence(sequence, max_len, pad_left, pad_right, **kwargs) - - # Sliding window to store grams. - history = list(islice(sequence, max_len)) - - # Yield ngrams from sequence. - while history: - for ngram_len in range(min_len, len(history) + 1): - yield tuple(history[:ngram_len]) - - # Append element to history if sequence has more items. - try: - history.append(next(sequence)) - except StopIteration: - pass - - del history[0] - - -def skipgrams(sequence, n, k, **kwargs): - """ - Returns all possible skipgrams generated from a sequence of items, as an iterator. - Skipgrams are ngrams that allows tokens to be skipped. - Refer to http://homepages.inf.ed.ac.uk/ballison/pdf/lrec_skipgrams.pdf - - >>> sent = "Insurgents killed in ongoing fighting".split() - >>> list(skipgrams(sent, 2, 2)) - [('Insurgents', 'killed'), ('Insurgents', 'in'), ('Insurgents', 'ongoing'), ('killed', 'in'), ('killed', 'ongoing'), ('killed', 'fighting'), ('in', 'ongoing'), ('in', 'fighting'), ('ongoing', 'fighting')] - >>> list(skipgrams(sent, 3, 2)) - [('Insurgents', 'killed', 'in'), ('Insurgents', 'killed', 'ongoing'), ('Insurgents', 'killed', 'fighting'), ('Insurgents', 'in', 'ongoing'), ('Insurgents', 'in', 'fighting'), ('Insurgents', 'ongoing', 'fighting'), ('killed', 'in', 'ongoing'), ('killed', 'in', 'fighting'), ('killed', 'ongoing', 'fighting'), ('in', 'ongoing', 'fighting')] - - :param sequence: the source data to be converted into trigrams - :type sequence: sequence or iter - :param n: the degree of the ngrams - :type n: int - :param k: the skip distance - :type k: int - :rtype: iter(tuple) - """ - - # Pads the sequence as desired by **kwargs. - if "pad_left" in kwargs or "pad_right" in kwargs: - sequence = pad_sequence(sequence, n, **kwargs) - - # Note when iterating through the ngrams, the pad_right here is not - # the **kwargs padding, it's for the algorithm to detect the SENTINEL - # object on the right pad to stop inner loop. - SENTINEL = object() - for ngram in ngrams(sequence, n + k, pad_right=True, right_pad_symbol=SENTINEL): - head = ngram[:1] - tail = ngram[1:] - for skip_tail in combinations(tail, n - 1): - if skip_tail[-1] is SENTINEL: - continue - yield head + skip_tail - - -###################################################################### -# Binary Search in a File -###################################################################### - -# inherited from pywordnet, by Oliver Steele -def binary_search_file(file, key, cache=None, cacheDepth=-1): - """ - Return the line from the file with first word key. - Searches through a sorted file using the binary search algorithm. - - :type file: file - :param file: the file to be searched through. - :type key: str - :param key: the identifier we are searching for. 
- """ - - key = key + " " - keylen = len(key) - start = 0 - currentDepth = 0 - - if hasattr(file, "name"): - end = os.stat(file.name).st_size - 1 - else: - file.seek(0, 2) - end = file.tell() - 1 - file.seek(0) - - if cache is None: - cache = {} - - while start < end: - lastState = start, end - middle = (start + end) // 2 - - if cache.get(middle): - offset, line = cache[middle] - - else: - line = "" - while True: - file.seek(max(0, middle - 1)) - if middle > 0: - file.discard_line() - offset = file.tell() - line = file.readline() - if line != "": - break - # at EOF; try to find start of the last line - middle = (start + middle) // 2 - if middle == end - 1: - return None - if currentDepth < cacheDepth: - cache[middle] = (offset, line) - - if offset > end: - assert end != middle - 1, "infinite loop" - end = middle - 1 - elif line[:keylen] == key: - return line - elif line > key: - assert end != middle - 1, "infinite loop" - end = middle - 1 - elif line < key: - start = offset + len(line) - 1 - - currentDepth += 1 - thisState = start, end - - if lastState == thisState: - # Detects the condition where we're searching past the end - # of the file, which is otherwise difficult to detect - return None - - return None - - -###################################################################### -# Proxy configuration -###################################################################### - - -def set_proxy(proxy, user=None, password=""): - """ - Set the HTTP proxy for Python to download through. - - If ``proxy`` is None then tries to set proxy from environment or system - settings. - - :param proxy: The HTTP proxy server to use. For example: - 'http://proxy.example.com:3128/' - :param user: The username to authenticate with. Use None to disable - authentication. - :param password: The password to authenticate with. - """ - if proxy is None: - # Try and find the system proxy settings - try: - proxy = getproxies()["http"] - except KeyError as e: - raise ValueError("Could not detect default proxy settings") from e - - # Set up the proxy handler - proxy_handler = ProxyHandler({"https": proxy, "http": proxy}) - opener = build_opener(proxy_handler) - - if user is not None: - # Set up basic proxy authentication if provided - password_manager = HTTPPasswordMgrWithDefaultRealm() - password_manager.add_password(realm=None, uri=proxy, user=user, passwd=password) - opener.add_handler(ProxyBasicAuthHandler(password_manager)) - opener.add_handler(ProxyDigestAuthHandler(password_manager)) - - # Override the existing url opener - install_opener(opener) - - -###################################################################### -# ElementTree pretty printing from https://www.effbot.org/zone/element-lib.htm -###################################################################### - - -def elementtree_indent(elem, level=0): - """ - Recursive function to indent an ElementTree._ElementInterface - used for pretty printing. Run indent on elem and then output - in the normal way. - - :param elem: element to be indented. will be modified. 
- :type elem: ElementTree._ElementInterface - :param level: level of indentation for this element - :type level: nonnegative integer - :rtype: ElementTree._ElementInterface - :return: Contents of elem indented to reflect its structure - """ - - i = "\n" + level * " " - if len(elem): - if not elem.text or not elem.text.strip(): - elem.text = i + " " - for elem in elem: - elementtree_indent(elem, level + 1) - if not elem.tail or not elem.tail.strip(): - elem.tail = i - else: - if level and (not elem.tail or not elem.tail.strip()): - elem.tail = i - - -###################################################################### -# Mathematical approximations -###################################################################### - - -def choose(n, k): - """ - This function is a fast way to calculate binomial coefficients, commonly - known as nCk, i.e. the number of combinations of n things taken k at a time. - (https://en.wikipedia.org/wiki/Binomial_coefficient). - - This is the *scipy.special.comb()* with long integer computation but this - approximation is faster, see https://github.com/nltk/nltk/issues/1181 - - >>> choose(4, 2) - 6 - >>> choose(6, 2) - 15 - - :param n: The number of things. - :type n: int - :param r: The number of times a thing is taken. - :type r: int - """ - if 0 <= k <= n: - ntok, ktok = 1, 1 - for t in range(1, min(k, n - k) + 1): - ntok *= n - ktok *= t - n -= 1 - return ntok // ktok - else: - return 0 - - -###################################################################### -# Iteration utilities -###################################################################### - - -def pairwise(iterable): - """s -> (s0,s1), (s1,s2), (s2, s3), ...""" - a, b = tee(iterable) - next(b, None) - return zip(a, b) - - -###################################################################### -# Parallelization. -###################################################################### - - -def parallelize_preprocess(func, iterator, processes, progress_bar=False): - from joblib import Parallel, delayed - from tqdm import tqdm - - iterator = tqdm(iterator) if progress_bar else iterator - if processes <= 1: - return map(func, iterator) - return Parallel(n_jobs=processes)(delayed(func)(line) for line in iterator) diff --git a/pipeline/nltk/wsd.py b/pipeline/nltk/wsd.py deleted file mode 100644 index 8e29ce1e44b302d751a55d9512363f364a7c3f47..0000000000000000000000000000000000000000 --- a/pipeline/nltk/wsd.py +++ /dev/null @@ -1,51 +0,0 @@ -# Natural Language Toolkit: Word Sense Disambiguation Algorithms -# -# Authors: Liling Tan , -# Dmitrijs Milajevs -# -# Copyright (C) 2001-2023 NLTK Project -# URL: -# For license information, see LICENSE.TXT - -from nltk.corpus import wordnet - - -def lesk(context_sentence, ambiguous_word, pos=None, synsets=None): - """Return a synset for an ambiguous word in a context. - - :param iter context_sentence: The context sentence where the ambiguous word - occurs, passed as an iterable of words. - :param str ambiguous_word: The ambiguous word that requires WSD. - :param str pos: A specified Part-of-Speech (POS). - :param iter synsets: Possible synsets of the ambiguous word. - :return: ``lesk_sense`` The Synset() object with the highest signature overlaps. - - This function is an implementation of the original Lesk algorithm (1986) [1]. - - Usage example:: - - >>> lesk(['I', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money', '.'], 'bank', 'n') - Synset('savings_bank.n.02') - - [1] Lesk, Michael. 
"Automatic sense disambiguation using machine - readable dictionaries: how to tell a pine cone from an ice cream - cone." Proceedings of the 5th Annual International Conference on - Systems Documentation. ACM, 1986. - https://dl.acm.org/citation.cfm?id=318728 - """ - - context = set(context_sentence) - if synsets is None: - synsets = wordnet.synsets(ambiguous_word) - - if pos: - synsets = [ss for ss in synsets if str(ss.pos()) == pos] - - if not synsets: - return None - - _, sense = max( - (len(context.intersection(ss.definition().split())), ss) for ss in synsets - ) - - return sense diff --git a/prompts/claim_generate.yaml b/pipeline/prompts/claim_generate.yaml similarity index 100% rename from prompts/claim_generate.yaml rename to pipeline/prompts/claim_generate.yaml diff --git a/prompts/query_generate.yaml b/pipeline/prompts/query_generate.yaml similarity index 100% rename from prompts/query_generate.yaml rename to pipeline/prompts/query_generate.yaml diff --git a/prompts/verification.yaml b/pipeline/prompts/verification.yaml similarity index 100% rename from prompts/verification.yaml rename to pipeline/prompts/verification.yaml diff --git a/pipeline/query_generate.py b/pipeline/query_generate.py index dd9854009015e091f72cd78f975d029a9d336888..06ffeba93a1dab3acca2bf8bed9cd19dfa9bac3b 100644 --- a/pipeline/query_generate.py +++ b/pipeline/query_generate.py @@ -5,21 +5,22 @@ import asyncio from nltk.corpus import wordnet class QueryGenerator: - def __init__(self, prompt_path, chat, type): + def __init__(self, prompt_path, chat): self.type = type with open(prompt_path,"r",encoding='utf-8') as file: - self.prompt = yaml.load(file, yaml.FullLoader)[type] + self.prompt = yaml.load(file, yaml.FullLoader) self.chat = chat - def objects_extract(self, claim_list): - user_prompt = self.prompt["object"]["user"].format(claims=claim_list) - message = [[ - {"role": "system", "content": self.prompt["object"]["system"]}, - {"role": "user", "content": user_prompt} - ],] - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - response = loop.run_until_complete(self.chat.get_response(messages=message)) + def objects_extract(self, claim_list, use_attribue=False, response=None): + if use_attribue: + user_prompt = self.prompt[self.type]["object"]["user"].format(claims=claim_list) + message = [[ + {"role": "system", "content": self.prompt["object"]["system"]}, + {"role": "user", "content": user_prompt} + ],] + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + response = loop.run_until_complete(self.chat.get_response(messages=message)) try: response = json.loads(response[0]) @@ -68,15 +69,17 @@ class QueryGenerator: objects = ".".join([object for object in backup]) return objects - def filter(self, res, object_list): - attribute_ques_list = json.loads(res[0]) + def filter(self, res, object_list, use_attribue=False): + if use_attribue: + attribute_ques_list = json.loads(res[0]) scenetext_ques_list = json.loads(res[1]) fact_ques_list = json.loads(res[2]) objects = set(()) for idx, key in enumerate(fact_ques_list): if fact_ques_list[key][0] != "none": object_list[idx] = "none" # 将对应的object赋值为0 - attribute_ques_list[key] = ["none"] + if use_attribue: + attribute_ques_list[key] = ["none"] scenetext_ques_list[key] = ["none"] else: for object in object_list[key]: @@ -84,20 +87,44 @@ class QueryGenerator: objects.add(object) objects = self.remove_hypernyms(objects) - return attribute_ques_list, scenetext_ques_list, fact_ques_list, objects + if use_attribue: + return attribute_ques_list, 
scenetext_ques_list, fact_ques_list, objects + else: + return scenetext_ques_list, fact_ques_list, objects + + def get_response(self, claim_list, type, use_attribute=False): + self.type = type + if use_attribute: + object_list, objects = self.objects_extract(claim_list=claim_list, use_attribue=True) + self.message_list = [ + [{"role": "system", "content": self.prompt[type]["attribute"]["system"]}, {"role": "user", "content": self.prompt[type]["attribute"]["user"].format(objects=objects,claims=claim_list)}], + [{"role": "system", "content": self.prompt[type]["scene-text"]["system"]}, {"role": "user", "content": self.prompt[type]["scene-text"]["user"].format(claims=claim_list)}], + [{"role": "system", "content": self.prompt[type]["fact"]["system"]}, {"role": "user", "content": self.prompt[type]["fact"]["user"].format(claims=claim_list)}] + ] + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + res = loop.run_until_complete(self.chat.get_response(messages=self.message_list)) + if self.type == "image-to-text": + attribute_ques_list, scenetext_ques_list, fact_ques_list, objects = self.filter(res, object_list) + else: + attribute_ques_list, scenetext_ques_list, fact_ques_list = json.loads(res[0]), json.loads(res[1]), json.loads(res[2]) + + return objects, attribute_ques_list, scenetext_ques_list, fact_ques_list + else: + self.message_list = [ + [{"role": "system", "content": self.prompt[type]["object"]["system"]},{"role": "user", "content": self.prompt[type]["object"]["user"].format(claims=claim_list)}], + [{"role": "system", "content": self.prompt[type]["scene-text"]["system"]}, {"role": "user", "content": self.prompt[type]["scene-text"]["user"].format(claims=claim_list)}], + [{"role": "system", "content": self.prompt[type]["fact"]["system"]}, {"role": "user", "content": self.prompt[type]["fact"]["user"].format(claims=claim_list)}] + ] + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + res = loop.run_until_complete(self.chat.get_response(messages=self.message_list)) + object_list, objects = self.objects_extract(claim_list=claim_list, response=res) + if self.type == "image-to-text": + scenetext_ques_list, fact_ques_list, objects = self.filter(res, object_list) + else: + scenetext_ques_list, fact_ques_list = json.loads(res[1]), json.loads(res[2]) + + return objects, scenetext_ques_list, fact_ques_list - def get_response(self, claim_list): - object_list, objects = self.objects_extract(claim_list=claim_list) - self.message_list = [ - [{"role": "system", "content": self.prompt["attribute"]["system"]}, {"role": "user", "content": self.prompt["attribute"]["user"].format(objects=objects,claims=claim_list)}], - [{"role": "system", "content": self.prompt["scene-text"]["system"]}, {"role": "user", "content": self.prompt["scene-text"]["user"].format(claims=claim_list)}], - [{"role": "system", "content": self.prompt["fact"]["system"]}, {"role": "user", "content": self.prompt["fact"]["user"].format(claims=claim_list)}] - ] - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - res = loop.run_until_complete(self.chat.get_response(messages=self.message_list)) - # res = asyncio.run(self.chat.async_get_response(messages=self.message_list)) - if self.type == "image-to-text": - attribute_ques_list, scenetext_ques_list, fact_ques_list, objects = self.filter(res, object_list) - return objects, attribute_ques_list, scenetext_ques_list, fact_ques_list diff --git a/pipeline/run_pipeline.py b/pipeline/run_pipeline.py index 
84500bd645614dce8c6e176792858cf12b484dbd..1d12a586e96faf9af8f51a2fb8d9628516060e7b 100644 --- a/pipeline/run_pipeline.py +++ b/pipeline/run_pipeline.py @@ -1,5 +1,4 @@ -# import sys -# sys.path.append("/home/wcx/wcx/EasyDetect/pipeline") +import time from pipeline.claim_generate import * from pipeline.query_generate import * from pipeline.tool_execute import * @@ -7,28 +6,47 @@ from pipeline.judge import * from pipeline.openai_wrapper import * class Pipeline: - def __init__(self): + def __init__(self, type, api_key, base_url=None): # only one instance is created globally; could this interfere with the prompt results? - self.syncchat = SyncChat(model="gpt-4-1106-preview", api_key="sk-jD8DeGdJKrdOxpiQ5bD4845bB53346C3A0E9Ed479bE08676", base_url="https://oneapi.xty.app/v1") - self.asyncchat = AsyncChat(model="gpt-4-1106-preview", api_key="sk-jD8DeGdJKrdOxpiQ5bD4845bB53346C3A0E9Ed479bE08676", base_url="https://oneapi.xty.app/v1") - self.visionchat = VisionChat(model="gpt-4-vision-preview", api_key="sk-jD8DeGdJKrdOxpiQ5bD4845bB53346C3A0E9Ed479bE08676", base_url="https://oneapi.xty.app/v1") + self.syncchat = SyncChat(model="gpt-4-1106-preview", api_key=api_key, base_url=base_url) + self.asyncchat = AsyncChat(model="gpt-4-1106-preview", api_key=api_key, base_url=base_url) + self.visionchat = VisionChat(model="gpt-4-vision-preview", api_key=api_key, base_url=base_url) - self.claim_generator = ClaimGenerator(prompt_path="/home/wcx/wcx/EasyDetect/prompts/claim_generate.yaml",chat=self.syncchat) - self.query_generator = QueryGenerator(prompt_path="/home/wcx/wcx/EasyDetect/prompts/query_generate.yaml",chat=self.asyncchat, type="image-to-text") + self.claim_generator = ClaimGenerator(prompt_path="pipeline/prompts/claim_generate.yaml",chat=self.syncchat) + self.query_generator = QueryGenerator(prompt_path="pipeline/prompts/query_generate.yaml",chat=self.asyncchat) self.tool = Tool() - self.judger = Judger(prompt_path="/home/wcx/wcx/EasyDetect/prompts/verification.yaml", chat=self.visionchat, type="image-to-text") + self.judger = Judger(prompt_path="pipeline/prompts/verification.yaml", chat=self.visionchat) - def run(self, text, image_path): + def run(self, text, image_path, type, use_attribue=False): + time1 = time.time() response, claim_list = self.claim_generator.get_response(text=text) - objects, attribute_ques_list, scenetext_ques_list, fact_ques_list = self.query_generator.get_response(claim_list=claim_list) + time2 = time.time() + if use_attribue: + objects, attribute_ques_list, scenetext_ques_list, fact_ques_list = self.query_generator.get_response(claim_list=claim_list) + else: + objects, scenetext_ques_list, fact_ques_list = self.query_generator.get_response(claim_list=claim_list,type=type) + attribute_ques_list = None + time3 = time.time() + print(objects) + print(attribute_ques_list) + print(scenetext_ques_list) + print(fact_ques_list) + """ + The cache files need to be cleaned up periodically + """ object_res, attribue_res, text_res, fact_res = self.tool.execute(image_path=image_path, - new_path="/newdisk3/wcx/MLLM/image-to-text/cache", + new_path="pipeline/cache_files/", objects=objects, attribute_list=attribute_ques_list, scenetext_list=scenetext_ques_list, fact_list=fact_ques_list) - - # response = self.judger.get_response(object_res, attribue_res, text_res, fact_res, claim_list, image_path) - return object_res["phrases"] + time4 = time.time() + response = self.judger.get_response(type, object_res, attribue_res, text_res, fact_res, claim_list, image_path) + time5 = time.time() + print("claim generate time:" + str(time2-time1)) + print("query generate time:" + str(time3-time2)) +
print("tool execute time:" + str(time4-time3)) + print("judge time:" + str(time5-time4)) + return response,claim_list diff --git a/pipeline/tool/__pycache__/object_detetction_model.cpython-39.pyc b/pipeline/tool/__pycache__/object_detetction_model.cpython-39.pyc index e6d70ff61e5fc37af25bda9d93018853d62ab0e1..e69d4ed067e3b16a5e2a900412a13b2c8b974a39 100644 Binary files a/pipeline/tool/__pycache__/object_detetction_model.cpython-39.pyc and b/pipeline/tool/__pycache__/object_detetction_model.cpython-39.pyc differ diff --git a/pipeline/tool/__pycache__/scene_text_model.cpython-39.pyc b/pipeline/tool/__pycache__/scene_text_model.cpython-39.pyc index e78371e77dad917a60407778bbe1354a9ff92567..3737e8e77a2b417c22fe95db20cb3955287385e7 100644 Binary files a/pipeline/tool/__pycache__/scene_text_model.cpython-39.pyc and b/pipeline/tool/__pycache__/scene_text_model.cpython-39.pyc differ diff --git a/pipeline/tool/object_detetction_model.py b/pipeline/tool/object_detetction_model.py index f907810689077e8207c252576902b7a7137e9c01..0f246e07e0b33f481f804d22fb7af0c058765507 100644 --- a/pipeline/tool/object_detetction_model.py +++ b/pipeline/tool/object_detetction_model.py @@ -26,13 +26,14 @@ AREA_THRESHOLD = 0.001 # used to filter out too small object. IOU_THRESHOLD = 0.95 # used to filter the same instance. greater than threshold means the same instance class GroundingDINO: - def __init__(self, config): - self.config = config - self.BOX_TRESHOLD = self.config["detector"]["BOX_TRESHOLD"] - self.TEXT_TRESHOLD = self.config["detector"]["TEXT_TRESHOLD"] + def __init__(self): + self.BOX_TRESHOLD = 0.35 + self.TEXT_TRESHOLD = 0.25 self.text_rec = MAERec() # load only one time - self.model = load_model(self.config["detector"]["config"], self.config["detector"]["model"], device='cuda:0') + self.model = load_model("pipeline/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py", + "models/groundingdino_swint_ogc.pth", ) + #device='cuda:0') def execute(self, image_path, content, new_path, use_text_rec): diff --git a/pipeline/tool/scene_text_model.py b/pipeline/tool/scene_text_model.py index ea28c5c139ef6125521f07f9544fcfa3b78992d1..951372f9f610cc4678d87bc83e3b884f4ec8646c 100644 --- a/pipeline/tool/scene_text_model.py +++ b/pipeline/tool/scene_text_model.py @@ -7,11 +7,11 @@ from pipeline.mmocr.mmocr.apis.inferencers import MMOCRInferencer class MAERec: def __init__(self): self.mmocr_inferencer = MMOCRInferencer( - "/home/wcx/wcx/GroundingDINO/LVLM/mmocr/configs/textdet/dbnetpp/dbnetpp_resnet50-oclip_fpnc_1200e_icdar2015.py", - "/newdisk3/wcx/ocr_model/dbnetpp.pth", - "/home/wcx/wcx/GroundingDINO/LVLM/mmocr/configs/textrecog/maerec/maerec_b_union14m.py", - "/newdisk3/wcx/ocr_model/maerec_b.pth", - device="cuda:0") + "pipeline/mmocr/configs/textdet/dbnetpp/dbnetpp_resnet50-oclip_fpnc_1200e_icdar2015.py", + "models/dbnetpp.pth", + "pipeline/mmocr/configs/textrecog/maerec/maerec_b_union14m.py", + "models/maerec_b.pth",) + #device="cuda:0") def execute(self, image_path, use_detector=False): """Run MMOCR and SAM diff --git a/pipeline/tool_execute.py b/pipeline/tool_execute.py index 34f6cf32aec91f30f986ca8be0e41f8616a24c8a..08127c3cad6b549e60c218fc32439206aa46b414 100644 --- a/pipeline/tool_execute.py +++ b/pipeline/tool_execute.py @@ -44,8 +44,7 @@ def get_openai_reply(image_path, text): class Tool: def __init__(self): - config = yaml.load(open("/home/wcx/wcx/GroundingDINO/LVLM/config/config.yaml", "r"), Loader=yaml.FullLoader) - self.detector = GroundingDINO(config=config) + self.detector = GroundingDINO() 
self.search = GoogleSerperAPIWrapper() def execute(self, image_path, new_path, objects, attribute_list, scenetext_list, fact_list): diff --git a/requirements.txt b/requirements.txt index 6369bdf26800ad2e8fe56d745a454f89cb8a2fbe..be94ed2291387797fc36d9ab9da48478128129ea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,279 +1,34 @@ -# This file may be used to create an environment using: -# $ conda create --name --file -# platform: linux-64 -_libgcc_mutex=0.1=main -_openmp_mutex=5.1=1_gnu -absl-py=2.0.0=pypi_0 -accelerate=0.24.0=pypi_0 -addict=2.4.0=pypi_0 -aiofiles=23.2.1=pypi_0 -aiohttp=3.8.6=pypi_0 -aiosignal=1.3.1=pypi_0 -albumentations=1.3.1=pypi_0 -aliyun-python-sdk-core=2.14.0=pypi_0 -aliyun-python-sdk-kms=2.16.2=pypi_0 -altair=5.1.2=pypi_0 -annotated-types=0.6.0=pypi_0 -antlr4-python3-runtime=4.9.3=pypi_0 -anyio=3.7.1=pypi_0 -appdirs=1.4.4=pypi_0 -asttokens=2.4.1=pypi_0 -async-timeout=4.0.3=pypi_0 -asynctest=0.13.0=pypi_0 -attrs=23.1.0=pypi_0 -backcall=0.2.0=pyhd3eb1b0_0 -blinker=1.6.3=pypi_0 -blis=0.7.11=pypi_0 -braceexpand=0.1.7=pypi_0 -ca-certificates=2023.08.22=h06a4308_0 -cachetools=5.3.2=pypi_0 -catalogue=2.0.10=pypi_0 -cchardet=2.1.7=pypi_0 -certifi=2023.7.22=pypi_0 -cffi=1.16.0=pypi_0 -chardet=5.2.0=pypi_0 -charset-normalizer=3.3.1=pypi_0 -click=8.1.7=pypi_0 -cloudpathlib=0.16.0=pypi_0 -codecov=2.1.13=pypi_0 -colorama=0.4.6=pypi_0 -comm=0.1.2=py39h06a4308_0 -confection=0.1.3=pypi_0 -contourpy=1.1.1=pypi_0 -coverage=7.3.2=pypi_0 -crcmod=1.7=pypi_0 -cryptography=41.0.7=pypi_0 -cycler=0.12.1=pypi_0 -cymem=2.0.8=pypi_0 -debugpy=1.6.7=py39h6a678d5_0 -decorator=5.1.1=pyhd3eb1b0_0 -decord=0.6.0=pypi_0 -distro=1.8.0=pypi_0 -docker-pycreds=0.4.0=pypi_0 -einops=0.7.0=pypi_0 -en-core-web-lg=3.7.0=pypi_0 -en-core-web-md=3.7.0=pypi_0 -en-core-web-sm=3.7.0=pypi_0 -exceptiongroup=1.1.3=pypi_0 -executing=2.0.0=pypi_0 -fastapi=0.104.0=pypi_0 -ffmpy=0.3.1=pypi_0 -filelock=3.12.4=pypi_0 -flake8=6.1.0=pypi_0 -flash-attn=2.3.3=pypi_0 -flask=3.0.0=pypi_0 -fonttools=4.43.1=pypi_0 -frozenlist=1.4.0=pypi_0 -fsspec=2023.10.0=pypi_0 -gitdb=4.0.11=pypi_0 -gitpython=3.1.40=pypi_0 -google-ai-generativelanguage=0.4.0=pypi_0 -google-api-core=2.15.0=pypi_0 -google-auth=2.23.3=pypi_0 -google-auth-oauthlib=1.1.0=pypi_0 -google-generativeai=0.3.1=pypi_0 -googleapis-common-protos=1.62.0=pypi_0 -gradio=3.50.2=pypi_0 -gradio-client=0.6.1=pypi_0 -groundingdino=0.1.0=dev_0 -grpcio=1.60.0=pypi_0 -grpcio-status=1.60.0=pypi_0 -h11=0.14.0=pypi_0 -h5py=3.10.0=pypi_0 -httpcore=0.18.0=pypi_0 -httpx=0.25.0=pypi_0 -huggingface-hub=0.17.3=pypi_0 -icecream=2.1.3=pypi_0 -idna=3.4=pypi_0 -imageio=2.33.0=pypi_0 -imgaug=0.4.0=pypi_0 -importlib-metadata=6.9.0=pypi_0 -importlib-resources=6.1.0=pypi_0 -importlib_metadata=6.0.0=hd3eb1b0_0 -iniconfig=2.0.0=pypi_0 -interrogate=1.5.0=pypi_0 -iopath=0.1.10=pypi_0 -ipykernel=6.25.0=py39h2f386ee_0 -ipython=8.15.0=py39h06a4308_0 -isort=5.12.0=pypi_0 -itsdangerous=2.1.2=pypi_0 -jedi=0.18.1=py39h06a4308_1 -jinja2=3.1.2=pypi_0 -jmespath=0.10.0=pypi_0 -joblib=1.3.2=pypi_0 -jsonschema=4.19.1=pypi_0 -jsonschema-specifications=2023.7.1=pypi_0 -jupyter_client=8.5.0=py39h06a4308_0 -jupyter_core=5.5.0=py39h06a4308_0 -kiwisolver=1.4.5=pypi_0 -kwarray=0.6.16=pypi_0 -langcodes=3.3.0=pypi_0 -lanms-neo=1.0.2=pypi_0 -lazy-loader=0.3=pypi_0 -ld_impl_linux-64=2.38=h1181459_1 -libffi=3.4.4=h6a678d5_0 -libgcc-ng=11.2.0=h1234567_1 -libgomp=11.2.0=h1234567_1 -libprotobuf=3.20.3=he621ea3_0 -libsodium=1.0.18=h7b6447c_0 -libstdcxx-ng=11.2.0=h1234567_1 -lmdb=1.4.1=pypi_0 -markdown=3.5=pypi_0 
-markdown-it-py=3.0.0=pypi_0 -markdown2=2.4.10=pypi_0 -markupsafe=2.1.3=pypi_0 -matplotlib=3.8.0=pypi_0 -matplotlib-inline=0.1.6=py39h06a4308_0 -mccabe=0.7.0=pypi_0 -mdurl=0.1.2=pypi_0 -mmcv=2.0.0=pypi_0 -mmdet=3.0.0=pypi_0 -mmengine=0.10.1=pypi_0 -mmocr=1.0.0=dev_0 -model-index=0.1.11=pypi_0 -multidict=6.0.4=pypi_0 -munch=4.0.0=pypi_0 -murmurhash=1.0.10=pypi_0 -ncurses=6.4=h6a678d5_0 -nest-asyncio=1.5.6=py39h06a4308_0 -networkx=3.2.1=pypi_0 -ninja=1.11.1.1=pypi_0 -nltk=3.8.1=pypi_0 -numpy=1.26.1=pypi_0 -oauthlib=3.2.2=pypi_0 -omegaconf=2.3.0=pypi_0 -openai=1.3.3=pypi_0 -opencv-python=4.8.1.78=pypi_0 -opendatalab=0.0.10=pypi_0 -openmim=0.3.9=pypi_0 -openssl=3.0.12=h7f8727e_0 -openxlab=0.0.29=pypi_0 -ordered-set=4.1.0=pypi_0 -orjson=3.9.10=pypi_0 -oss2=2.17.0=pypi_0 -packaging=23.2=pypi_0 -pandas=2.1.2=pypi_0 -parameterized=0.9.0=pypi_0 -parso=0.8.3=pyhd3eb1b0_0 -peft=0.5.0=pypi_0 -pexpect=4.8.0=pyhd3eb1b0_3 -pickleshare=0.7.5=pyhd3eb1b0_1003 -pillow=10.1.0=pypi_0 -pip=23.3=py39h06a4308_0 -platformdirs=3.11.0=pypi_0 -pluggy=1.3.0=pypi_0 -portalocker=2.8.2=pypi_0 -preshed=3.0.9=pypi_0 -progressbar2=4.2.0=pypi_0 -prompt-toolkit=3.0.36=py39h06a4308_0 -proto-plus=1.23.0=pypi_0 -protobuf=4.23.4=pypi_0 -psutil=5.9.6=pypi_0 -ptyprocess=0.7.0=pyhd3eb1b0_2 -pure_eval=0.2.2=pyhd3eb1b0_0 -py=1.11.0=pypi_0 -pyasn1=0.5.0=pypi_0 -pyasn1-modules=0.3.0=pypi_0 -pyclipper=1.3.0.post5=pypi_0 -pycocotools=2.0.7=pypi_0 -pycodestyle=2.11.1=pypi_0 -pycparser=2.21=pypi_0 -pycryptodome=3.19.0=pypi_0 -pydantic=1.10.11=pypi_0 -pydantic-core=2.10.1=pypi_0 -pydub=0.25.1=pypi_0 -pyflakes=3.1.0=pypi_0 -pygments=2.16.1=pypi_0 -pyparsing=3.1.1=pypi_0 -pytest=7.4.3=pypi_0 -pytest-cov=4.1.0=pypi_0 -pytest-runner=6.0.0=pypi_0 -python=3.9.18=h955ad1f_0 -python-dateutil=2.8.2=pyhd3eb1b0_0 -python-multipart=0.0.6=pypi_0 -python-utils=3.8.1=pypi_0 -pytz=2023.3.post1=pypi_0 -pyyaml=6.0.1=pypi_0 -pyzmq=25.1.0=py39h6a678d5_0 -qudida=0.0.4=pypi_0 -rapidfuzz=3.5.2=pypi_0 -readline=8.2=h5eee18b_0 -referencing=0.30.2=pypi_0 -regex=2023.10.3=pypi_0 -requests=2.28.2=pypi_0 -requests-oauthlib=1.3.1=pypi_0 -retrying=1.3.4=pypi_0 -rich=13.4.2=pypi_0 -rpds-py=0.10.6=pypi_0 -rsa=4.9=pypi_0 -ruamel-yaml=0.18.2=pypi_0 -ruamel-yaml-clib=0.2.8=pypi_0 -safetensors=0.4.0=pypi_0 -scikit-image=0.22.0=pypi_0 -scikit-learn=1.3.2=pypi_0 -scipy=1.11.4=pypi_0 -sconf=0.2.5=pypi_0 -semantic-version=2.10.0=pypi_0 -sentencepiece=0.1.99=pypi_0 -sentry-sdk=1.37.1=pypi_0 -setproctitle=1.3.3=pypi_0 -setuptools=60.2.0=pypi_0 -shapely=2.0.2=pypi_0 -shortuuid=1.0.11=pypi_0 -six=1.16.0=pyhd3eb1b0_1 -smart-open=6.4.0=pypi_0 -smmap=5.0.1=pypi_0 -sniffio=1.3.0=pypi_0 -spacy=3.7.2=pypi_0 -spacy-legacy=3.0.12=pypi_0 -spacy-loggers=1.0.5=pypi_0 -sqlite=3.41.2=h5eee18b_0 -srsly=2.4.8=pypi_0 -stack_data=0.2.0=pyhd3eb1b0_0 -starlette=0.27.0=pypi_0 -supervision=0.6.0=pypi_0 -tabulate=0.9.0=pypi_0 -tensorboard=2.15.0=pypi_0 -tensorboard-data-server=0.7.2=pypi_0 -tensorboardx=2.6.2.2=pypi_0 -termcolor=2.4.0=pypi_0 -terminaltables=3.1.10=pypi_0 -thinc=8.2.1=pypi_0 -threadpoolctl=3.2.0=pypi_0 -tifffile=2023.9.26=pypi_0 -timm=0.9.8=pypi_0 -tk=8.6.12=h1ccaba5_0 -tokenizers=0.13.3=pypi_0 -toml=0.10.2=pypi_0 -tomli=2.0.1=pypi_0 -toolz=0.12.0=pypi_0 -torch=1.13.0+cu116=pypi_0 -torchaudio=0.13.0+cu116=pypi_0 -torchvision=0.14.0+cu116=pypi_0 -tornado=6.3.3=py39h5eee18b_0 -tqdm=4.65.2=pypi_0 -traitlets=5.7.1=py39h06a4308_0 -transformers=4.29.0=pypi_0 -typer=0.9.0=pypi_0 -typing-extensions=4.9.0=pypi_0 -tzdata=2023.3=pypi_0 -ubelt=1.3.4=pypi_0 -urllib3=1.26.18=pypi_0 -uvicorn=0.23.2=pypi_0 
-visual-genome=1.1.1=pypi_0 -wandb=0.16.0=pypi_0 -wasabi=1.1.2=pypi_0 -wcwidth=0.2.5=pyhd3eb1b0_0 -weasel=0.3.3=pypi_0 -webdataset=0.2.48=pypi_0 -websockets=11.0.3=pypi_0 -werkzeug=3.0.1=pypi_0 -wget=3.2=pypi_0 -wheel=0.41.2=py39h06a4308_0 -xdoctest=1.1.2=pypi_0 -xz=5.4.2=h5eee18b_0 -yapf=0.40.2=pypi_0 -yarl=1.9.2=pypi_0 -zeromq=4.3.4=h2531618_0 -zipp=3.17.0=pypi_0 -zlib=1.2.13=h5eee18b_0 +numpy +timm +openmim==0.3.9 +torch==1.13.0 +torchvision==0.14.0 +opencv-python==4.8.1.78 +pyclipper +imgaug +lmdb +matplotlib +gradio_client==0.2.7 +numpy +pyclipper +pycocotools +rapidfuzz>=2.0.0 +scikit-image==0.22.0 +asynctest +codecov +flake8 +interrogate +isort +# Note: used for kwarray.group_items, this may be ported to mmcv in the future. +kwarray +lanms-neo==1.0.2 +parameterized +pytest +pytest-cov +pytest-runner +ubelt +xdoctest >= 0.10.0 +yapf +openai==1.3.3 +nltk==3.8.1 +pillow==10.1.0
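
For reviewers, here is a minimal usage sketch of the refactored Pipeline API introduced by this patch. The API key, base URL, example text, and image path are placeholders, not values taken from the repository:

from pipeline.run_pipeline import Pipeline

# Placeholder credentials and inputs; substitute real values.
pipeline = Pipeline(type="image-to-text", api_key="sk-your-key-here", base_url=None)

text = "A man in a red shirt holds a sign that reads OPEN."  # I2T response or T2I prompt
image_path = "pipeline/cache_files/example.jpg"              # placeholder image path

# run() now returns the detection verdict and the extracted claim list.
response, claim_list = pipeline.run(text=text, image_path=image_path, type="image-to-text")
print(claim_list)
print(response)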
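The query generator in this patch builds one message list per question type (object/attribute, scene-text, fact) and dispatches them through AsyncChat on a freshly created event loop. Below is a self-contained sketch of that fan-out pattern; DummyAsyncChat is a stand-in for illustration, not the repository's AsyncChat:

import asyncio

class DummyAsyncChat:
    """Stand-in async client: returns one reply per message list, concurrently."""
    async def _one(self, messages):
        await asyncio.sleep(0.1)  # simulate the latency of a single chat call
        return "reply to: " + messages[-1]["content"]

    async def get_response(self, messages):
        # Fan out every message list at once and keep the replies in input order.
        return await asyncio.gather(*(self._one(m) for m in messages))

message_list = [
    [{"role": "system", "content": "object prompt"},     {"role": "user", "content": "claims ..."}],
    [{"role": "system", "content": "scene-text prompt"}, {"role": "user", "content": "claims ..."}],
    [{"role": "system", "content": "fact prompt"},       {"role": "user", "content": "claims ..."}],
]

chat = DummyAsyncChat()

# Same driving pattern as query_generate.py: create a fresh loop per call and
# run the whole batch to completion before continuing synchronously.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
res = loop.run_until_complete(chat.get_response(messages=message_list))
print(res)  # three replies, one per question type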
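The per-stage timing added to run() uses explicit time.time() bookkeeping followed by prints. An equivalent, slightly more reusable sketch with a context manager is shown here; this helper is illustrative only and not part of the patch:

import time
from contextlib import contextmanager

@contextmanager
def stage_timer(name):
    """Print how long the wrapped stage took, mirroring the patch's per-stage prints."""
    start = time.time()
    try:
        yield
    finally:
        print(name + " time:" + str(time.time() - start))

# Usage inside a run()-style method:
with stage_timer("claim generate"):
    time.sleep(0.05)  # stand-in for self.claim_generator.get_response(...)
with stage_timer("query generate"):
    time.sleep(0.05)  # stand-in for self.query_generator.get_response(...)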