# Natural Language Toolkit: Interface to Megam Classifier
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Edward Loper <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A set of functions used to interface with the external megam_ maxent
optimization package. Before megam can be used, you should tell NLTK where it
can find the megam binary, using the ``config_megam()`` function. Typical
usage:
>>> from nltk.classify import megam
>>> megam.config_megam() # pass path to megam if not found in PATH # doctest: +SKIP
[Found megam: ...]
Use with MaxentClassifier. Example below, see MaxentClassifier documentation
for details.
nltk.classify.MaxentClassifier.train(corpus, 'megam')
.. _megam: https://www.umiacs.umd.edu/~hal/megam/index.html
"""
import subprocess
from nltk.internals import find_binary
try:
import numpy
except ImportError:
numpy = None
######################################################################
# { Configuration
######################################################################
_megam_bin = None
def config_megam(bin=None):
"""
Configure NLTK's interface to the ``megam`` maxent optimization
package.
:param bin: The full path to the ``megam`` binary. If not specified,
then nltk will search the system for a ``megam`` binary; and if
one is not found, it will raise a ``LookupError`` exception.
:type bin: str
"""
global _megam_bin
_megam_bin = find_binary(
"megam",
bin,
env_vars=["MEGAM"],
binary_names=["megam.opt", "megam", "megam_686", "megam_i686.opt"],
url="https://www.umiacs.umd.edu/~hal/megam/index.html",
)
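

# Illustrative usage only (the install path below is hypothetical, not part of
# the original module):
#
#     >>> config_megam()                         # search $PATH / $MEGAM       # doctest: +SKIP
#     >>> config_megam("/usr/local/bin/megam")   # or give the path explicitly  # doctest: +SKIP
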
######################################################################
# { Megam Interface Functions
######################################################################
def write_megam_file(train_toks, encoding, stream, bernoulli=True, explicit=True):
"""
Generate an input file for ``megam`` based on the given corpus of
classified tokens.
:type train_toks: list(tuple(dict, str))
:param train_toks: Training data, represented as a list of
pairs, the first member of which is a feature dictionary,
and the second of which is a classification label.
:type encoding: MaxentFeatureEncodingI
:param encoding: A feature encoding, used to convert featuresets
into feature vectors. May optionally implement a cost() method
in order to assign different costs to different class predictions.
:type stream: stream
:param stream: The stream to which the megam input file should be
written.
:param bernoulli: If true, then use the 'bernoulli' format. I.e.,
all joint features have binary values, and are listed iff they
are true. Otherwise, list feature values explicitly. If
``bernoulli=False``, then you must call ``megam`` with the
``-fvals`` option.
:param explicit: If true, then use the 'explicit' format. I.e.,
list the features that would fire for any of the possible
labels, for each token. If ``explicit=True``, then you must
call ``megam`` with the ``-explicit`` option.
"""
# Look up the set of labels.
labels = encoding.labels()
labelnum = {label: i for (i, label) in enumerate(labels)}
# Write the file, which contains one line per instance.
for featureset, label in train_toks:
        # First, the class (label) number, or, in the weighted multiclass
        # case, the cost of each label.
if hasattr(encoding, "cost"):
stream.write(
":".join(str(encoding.cost(featureset, label, l)) for l in labels)
)
else:
stream.write("%d" % labelnum[label])
# For implicit file formats, just list the features that fire
# for this instance's actual label.
if not explicit:
_write_megam_features(encoding.encode(featureset, label), stream, bernoulli)
# For explicit formats, list the features that would fire for
# any of the possible labels.
else:
for l in labels:
stream.write(" #")
_write_megam_features(encoding.encode(featureset, l), stream, bernoulli)
# End of the instance.
stream.write("\n")
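

# The sketch below is illustrative only (not part of the original module): a
# minimal stand-in encoding with just the two methods write_megam_file uses
# (labels() and encode()), plus a small driver that prints the resulting megam
# input lines in the default explicit/bernoulli format.
class _ToyEncoding:
    """Hypothetical stand-in for a MaxentFeatureEncodingI implementation."""

    def __init__(self, labels, joint_ids):
        self._labels = labels
        self._joint_ids = joint_ids  # maps (feature_name, label) -> joint feature id

    def labels(self):
        return self._labels

    def encode(self, featureset, label):
        # Always include an always-on feature, as _write_megam_features requires.
        vector = [(self._joint_ids[("always_on", label)], 1)]
        for fname, fval in featureset.items():
            if fval and (fname, label) in self._joint_ids:
                vector.append((self._joint_ids[(fname, label)], 1))
        return vector


def _demo_write_megam_file():
    import io

    labels = ["pos", "neg"]
    names = ["always_on", "contains(good)"]
    joint_ids = {
        (f, l): i for i, (f, l) in enumerate((f, l) for l in labels for f in names)
    }
    encoding = _ToyEncoding(labels, joint_ids)
    train_toks = [
        ({"contains(good)": True}, "pos"),
        ({"contains(good)": False}, "neg"),
    ]
    out = io.StringIO()
    write_megam_file(train_toks, encoding, out)
    print(out.getvalue())  # prints "0 # 0 1 # 2 3" and "1 # 0 # 2"
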
def parse_megam_weights(s, features_count, explicit=True):
"""
Given the stdout output generated by ``megam`` when training a
model, return a ``numpy`` array containing the corresponding weight
vector. This function does not currently handle bias features.
"""
if numpy is None:
raise ValueError("This function requires that numpy be installed")
assert explicit, "non-explicit not supported yet"
lines = s.strip().split("\n")
weights = numpy.zeros(features_count, "d")
for line in lines:
if line.strip():
fid, weight = line.split()
weights[int(fid)] = float(weight)
return weights
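

# Illustrative example only (not part of the original module).  megam's
# explicit weight dump is assumed to contain one "feature_id weight" pair per
# line, which is what this parser expects:
#
#     >>> parse_megam_weights("0 0.1387\n1 -2.502\n2 0.75\n", 3)  # doctest: +SKIP
#     array([ 0.1387, -2.502 ,  0.75  ])
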
def _write_megam_features(vector, stream, bernoulli):
    # Write one feature vector of (fid, fval) pairs in megam's format: bare
    # feature ids for the bernoulli format, "fid fval" pairs otherwise.
    if not vector:
        raise ValueError(
            "MEGAM classifier requires the use of an always-on feature."
        )
for (fid, fval) in vector:
if bernoulli:
if fval == 1:
stream.write(" %s" % fid)
elif fval != 0:
                raise ValueError(
                    "If bernoulli=True, then all features must be binary."
                )
else:
stream.write(f" {fid} {fval}")
def call_megam(args):
"""
Call the ``megam`` binary with the given arguments.
"""
if isinstance(args, str):
raise TypeError("args should be a list of strings")
if _megam_bin is None:
config_megam()
# Call megam via a subprocess
cmd = [_megam_bin] + args
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (stdout, stderr) = p.communicate()
    # Check the return code; report megam's stderr if the command failed.
    if p.returncode != 0:
        print()
        print(stderr.decode("utf-8") if isinstance(stderr, bytes) else stderr)
raise OSError("megam command failed!")
if isinstance(stdout, str):
return stdout
else:
return stdout.decode("utf-8")
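

# A hypothetical end-to-end sketch (not part of the original module) tying the
# helpers above together roughly the way MaxentClassifier.train(..., 'megam')
# does: write a training file, run megam, and parse the weight vector.  The
# exact megam command-line options are assumptions; consult the megam docs.
def _demo_train_weights(train_toks, encoding):
    import os
    import tempfile

    fd, path = tempfile.mkstemp(prefix="nltk-megam-")
    try:
        with os.fdopen(fd, "w") as stream:
            write_megam_file(train_toks, encoding, stream)
        stdout = call_megam(["-nobias", "-explicit", "multiclass", path])
        # encoding.length() is assumed to give the number of joint features.
        return parse_megam_weights(stdout, encoding.length())
    finally:
        os.remove(path)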