Spaces:

sunnychenxiwang
/

EasyDetect

Sleeping

File size: 6,396 Bytes

d916065

# Natural Language Toolkit: Interface to Megam Classifier
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Edward Loper <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""

A set of functions used to interface with the external megam_ maxent

optimization package. Before megam can be used, you should tell NLTK where it

can find the megam binary, using the ``config_megam()`` function. Typical

usage:



    >>> from nltk.classify import megam

    >>> megam.config_megam() # pass path to megam if not found in PATH # doctest: +SKIP

    [Found megam: ...]



Use with MaxentClassifier. Example below, see MaxentClassifier documentation

for details.



    nltk.classify.MaxentClassifier.train(corpus, 'megam')



.. _megam: https://www.umiacs.umd.edu/~hal/megam/index.html

"""
import subprocess

from nltk.internals import find_binary

try:
    import numpy
except ImportError:
    numpy = None

######################################################################
# { Configuration
######################################################################

# Location of the ``megam`` binary. Stays ``None`` until ``config_megam()``
# stores the result of its search here; ``call_megam()`` uses it as the
# program to execute.
_megam_bin = None


def config_megam(bin=None):
    """
    Configure NLTK's interface to the ``megam`` maxent optimization
    package.

    The binary that is found is remembered in the module-level
    ``_megam_bin`` variable.

    :param bin: The full path to the ``megam`` binary.  If not specified,
        then nltk will search the system for a ``megam`` binary; and if
        one is not found, it will raise a ``LookupError`` exception.
    :type bin: str
    """
    global _megam_bin

    # Everything besides the program name and the user-supplied path that
    # steers the search performed by ``find_binary``.
    search_options = {
        "env_vars": ["MEGAM"],
        "binary_names": ["megam.opt", "megam", "megam_686", "megam_i686.opt"],
        "url": "https://www.umiacs.umd.edu/~hal/megam/index.html",
    }
    _megam_bin = find_binary("megam", bin, **search_options)


######################################################################
# { Megam Interface Functions
######################################################################


def write_megam_file(train_toks, encoding, stream, bernoulli=True, explicit=True):
    """
    Generate an input file for ``megam`` based on the given corpus of
    classified tokens.

    One line is written per training instance.  Each line starts with
    the index of the instance's label (or, when ``encoding`` has a
    ``cost()`` method, the colon-separated cost of every label) and is
    followed by the encoded features.

    :type train_toks: list(tuple(dict, str))
    :param train_toks: Training data, represented as a list of
        pairs, the first member of which is a feature dictionary,
        and the second of which is a classification label.

    :type encoding: MaxentFeatureEncodingI
    :param encoding: A feature encoding, used to convert featuresets
        into feature vectors. May optionally implement a cost() method
        in order to assign different costs to different class predictions.

    :type stream: stream
    :param stream: The stream to which the megam input file should be
        written.

    :param bernoulli: If true, then use the 'bernoulli' format.  I.e.,
        all joint features have binary values, and are listed iff they
        are true.  Otherwise, list feature values explicitly.  If
        ``bernoulli=False``, then you must call ``megam`` with the
        ``-fvals`` option.

    :param explicit: If true, then use the 'explicit' format.  I.e.,
        list the features that would fire for any of the possible
        labels, for each token.  If ``explicit=True``, then you must
        call ``megam`` with the ``-explicit`` option.
    """
    all_labels = encoding.labels()
    label_index = {name: index for index, name in enumerate(all_labels)}

    for featureset, label in train_toks:
        # Line prefix: per-label costs in the weighted multiclass case,
        # otherwise the numeric index of the instance's label.
        if hasattr(encoding, "cost"):
            costs = [
                str(encoding.cost(featureset, label, candidate))
                for candidate in all_labels
            ]
            prefix = ":".join(costs)
        else:
            prefix = f"{label_index[label]}"
        stream.write(prefix)

        if explicit:
            # Explicit format: one "#"-introduced group per possible label,
            # holding the features that would fire for that label.
            for candidate in all_labels:
                stream.write(" #")
                _write_megam_features(
                    encoding.encode(featureset, candidate), stream, bernoulli
                )
        else:
            # Implicit format: only the features that fire for the
            # instance's actual label.
            _write_megam_features(encoding.encode(featureset, label), stream, bernoulli)

        # Terminate the instance.
        stream.write("\n")


def parse_megam_weights(s, features_count, explicit=True):
    """
    Given the stdout output generated by ``megam`` when training a
    model, return a ``numpy`` array containing the corresponding weight
    vector.  This function does not currently handle bias features.

    Every non-blank line of ``s`` must consist of a feature id followed
    by its weight, separated by whitespace.  Feature ids that do not
    appear in ``s`` keep a weight of zero.

    :param s: The text written by ``megam``.
    :param features_count: The length of the returned weight vector.
    :param explicit: Must be true; the non-explicit format is not
        supported yet.
    :raise ValueError: If ``numpy`` is not installed.
    """
    if numpy is None:
        raise ValueError("This function requires that numpy be installed")
    assert explicit, "non-explicit not supported yet"

    rows = s.strip().split("\n")
    weight_vector = numpy.zeros(features_count, "d")
    for row in rows:
        if not row.strip():
            # Blank line: carries no weight.
            continue
        index_text, value_text = row.split()
        weight_vector[int(index_text)] = float(value_text)
    return weight_vector


def _write_megam_features(vector, stream, bernoulli):
    """
    Write one encoded feature vector to ``stream`` in ``megam`` format.

    :param vector: The encoded features, as a sequence of
        ``(feature id, feature value)`` pairs.
    :param stream: The stream the features are written to.
    :param bernoulli: If true, write only the ids of the features whose
        value is 1 and skip the features whose value is 0.  Otherwise
        write every feature as an ``id value`` pair.
    :raise ValueError: If ``vector`` is empty, or if ``bernoulli`` is
        true and some feature value is neither 0 nor 1.
    """
    if not vector:
        raise ValueError("MEGAM classifier requires the use of an always-on feature.")
    for fid, fval in vector:
        if not bernoulli:
            stream.write(f" {fid} {fval}")
        elif fval == 1:
            stream.write(f" {fid}")
        elif fval != 0:
            # The original message was built from two adjacent literals with
            # no separating space and read "then allfeatures must be binary."
            raise ValueError("If bernoulli=True, then all features must be binary.")


def call_megam(args):
    """
    Call the ``megam`` binary with the given arguments.

    If the location of the binary has not been configured yet,
    ``config_megam()`` is called first to search for it.

    :param args: The command-line arguments to pass to ``megam``, as a
        list (or other non-string sequence) of strings.
    :return: Everything ``megam`` wrote to its standard output, decoded
        as UTF-8.
    :rtype: str
    :raise TypeError: If ``args`` is a single string rather than a
        sequence of strings.
    :raise OSError: If ``megam`` exits with a non-zero return code.
    """
    if isinstance(args, str):
        raise TypeError("args should be a list of strings")
    if _megam_bin is None:
        config_megam()

    # Only stdout is captured.  stderr is left attached to the parent
    # process, so whatever megam reports there is shown as it is produced;
    # there is consequently no captured error text to print on failure.
    completed = subprocess.run([_megam_bin, *args], stdout=subprocess.PIPE)

    # Check the return code.
    if completed.returncode != 0:
        raise OSError("megam command failed!")

    # The pipe is opened in binary mode, so the output is always bytes.
    return completed.stdout.decode("utf-8")