Spaces:
Sleeping
Sleeping
| # Natural Language Toolkit: Dispersion Plots | |
| # | |
| # Copyright (C) 2001-2023 NLTK Project | |
| # Author: Steven Bird <[email protected]> | |
| # URL: <https://www.nltk.org/> | |
| # For license information, see LICENSE.TXT | |
| """ | |
| A utility for displaying lexical dispersion. | |
| """ | |
def dispersion_plot(text, words, ignore_case=False, title="Lexical Dispersion Plot"):
    """
    Generate a lexical dispersion plot.

    Every occurrence of a target word in ``text`` is drawn as a vertical
    tick at the token's offset, on a horizontal row dedicated to that word.
    Rows are stacked so that the first entry of ``words`` is the top row,
    and each row is labelled with the word whose occurrences it shows.

    :param text: The source text
    :type text: list(str) or iter(str)
    :param words: The target words
    :type words: list of str
    :param ignore_case: flag to set if case should be ignored when searching text
    :type ignore_case: bool
    :param title: the title placed above the plot
    :type title: str
    :return: a matplotlib Axes object that may still be modified before plotting
    :rtype: Axes
    :raises ImportError: if matplotlib is not installed
    """
    try:
        import matplotlib.pyplot as plt
    except ImportError as e:
        raise ImportError(
            "The plot function requires matplotlib to be installed. "
            "See https://matplotlib.org/"
        ) from e

    # Row labels ordered from the bottom row (y == 0) to the top row, so the
    # first target word ends up at the top.  This single list drives both the
    # word -> row mapping and the tick labels, which keeps every row labelled
    # with the word that is actually plotted on it.
    labels = list(words)[::-1]
    word2y = {
        label.casefold() if ignore_case else label: y
        for y, label in enumerate(labels)
    }

    xs, ys = [], []
    for x, token in enumerate(text):
        token = token.casefold() if ignore_case else token
        y = word2y.get(token)
        if y is not None:
            xs.append(x)
            ys.append(y)

    _, ax = plt.subplots()
    ax.plot(xs, ys, "|")
    ax.set_yticks(list(range(len(labels))), labels, color="C0")
    # Leave one unit of padding below the bottom row and above the top row.
    ax.set_ylim(-1, len(labels))
    ax.set_title(title)
    ax.set_xlabel("Word Offset")
    return ax
if __name__ == "__main__":
    # Demo: show where four character names occur across the tokens of
    # "austen-sense.txt" from the NLTK Gutenberg corpus.
    import matplotlib.pyplot as plt
    from nltk.corpus import gutenberg

    target_words = ["Elinor", "Marianne", "Edward", "Willoughby"]
    corpus_tokens = gutenberg.words("austen-sense.txt")
    dispersion_plot(corpus_tokens, target_words)
    plt.show()