|
"""Generate a model (textacy.representations.Vectorizer). |
|
|
|
vectorizer = Vectorizer( |
|
tf_type="linear", idf_type="smooth", norm="l2", |
|
min_df=3, max_df=0.95) |
|
doc_term_matrix = vectorizer.fit_transform(tokenized_docs) |
|
doc_term_matrix |
|
|
|
tokenized_docs = [insert_spaces(elm).split() for elm in textzh] |
|
""" |
|
from typing import Dict, Iterable, List, Optional, Union |
|
|
|
from textacy.representations import Vectorizer |
|
from logzero import logger |
|
|
|
|
|
|
|
def gen_model(
    tokenized_docs: Iterable[Iterable[str]],
    tf_type: str = "linear",
    idf_type: Optional[str] = "smooth",
    dl_type: Optional[str] = None,
    norm: Optional[str] = "l2",
    min_df: Union[int, float] = 1,
    max_df: Union[int, float] = 1.0,
    max_n_terms: Optional[int] = None,
    vocabulary_terms: Optional[Union[Dict[str, int], Iterable[str]]] = None
) -> Vectorizer:
    """Generate a model (textacy.representations.Vectorizer).

    Validates that every token in ``tokenized_docs`` is a ``str``, builds a
    ``Vectorizer`` from the given weighting options and calls
    ``fit_transform`` on the documents.

    Args:
        tokenized_docs: Tokenized docs, i.e. an iterable of iterables of
            term strings. The input is materialized into lists before
            validation, so one-shot iterables (e.g. generators) are not
            exhausted by the type check before reaching the vectorizer.

        The remaining arguments are forwarded unchanged to
        ``textacy.representations.Vectorizer``; the descriptions below are
        taken from its documentation.

        tf_type: Type of term frequency (tf) to use for weights' local
            component:

            - "linear": tf (tfs are already linear, so left as-is)
            - "sqrt": tf => sqrt(tf)
            - "log": tf => log(tf) + 1
            - "binary": tf => 1

        idf_type: Type of inverse document frequency (idf) to use for
            weights' global component:

            - "standard": idf = log(n_docs / df) + 1.0
            - "smooth": idf = log((n_docs + 1) / (df + 1)) + 1.0, i.e. 1 is
              added to all document frequencies, as if a single document
              containing every unique term was added to the corpus.
            - "bm25": idf = log((n_docs - df + 0.5) / (df + 0.5)), which is
              a form commonly used in information retrieval that allows for
              very common terms to receive negative weights.
            - None: no global weighting is applied to local term weights.

        dl_type: Type of document-length scaling to use for weights'
            normalization component:

            - "linear": dl (dls are already linear, so left as-is)
            - "sqrt": dl => sqrt(dl)
            - "log": dl => log(dl)
            - None: no normalization is applied to the weights

        norm: If "l1" or "l2", normalize weights by the L1 or L2 norms,
            respectively, of row-wise vectors; otherwise, don't.
        min_df: Minimum number of documents in which a term must appear for
            it to be included in the vocabulary and as a column in a
            transformed doc-term matrix. If float, value is the fractional
            proportion of the total number of docs, which must be in
            [0.0, 1.0]; if int, value is the absolute number.
        max_df: Maximum number of documents in which a term may appear for
            it to be included in the vocabulary and as a column in a
            transformed doc-term matrix. If float, value is the fractional
            proportion of the total number of docs, which must be in
            [0.0, 1.0]; if int, value is the absolute number.
        max_n_terms: If specified, only include terms whose document
            frequency is within the top ``max_n_terms``.
        vocabulary_terms: Mapping of unique term string to unique term id,
            or an iterable of term strings that gets converted into such a
            mapping. Note that, if specified, vectorized outputs will
            include *only* these terms.

    Weighting presets mentioned in the textacy documentation (``apply_idf``
    and ``apply_dl`` are not parameters of this function; they are quoted
    as-is and presumably belong to an older textacy API -- verify against
    the installed version):

        "lucene-style tfidf": Adds a doc-length normalization to the usual
        local and global components.
        Params: tf_type="linear", apply_idf=True, idf_type="smooth",
        apply_dl=True, dl_type="sqrt"

        "lucene-style bm25": Uses a smoothed idf instead of the classic
        bm25 variant to prevent weights on terms from going negative.
        Params: tf_type="bm25", apply_idf=True, idf_type="smooth",
        apply_dl=True, dl_type="linear"

    Attributes:
        doc_term_matrix: The result of ``fit_transform`` is stored on the
            function object itself (``gen_model.doc_term_matrix``) and is
            overwritten on every call.

    Returns:
        The ``Vectorizer`` after ``fit_transform`` has been called on the
        documents.

    Raises:
        AssertionError: If any token in ``tokenized_docs`` is not a ``str``.
        Exception: Any other error raised while iterating over
            ``tokenized_docs`` is logged and re-raised unchanged.
    """
    # Materialize the input while validating it: the type check has to walk
    # every token, and a one-shot iterable would otherwise be empty by the
    # time it is handed to ``fit_transform``.
    docs: List[List[str]] = []
    try:
        for doc in tokenized_docs:
            tokens = list(doc)
            # Explicit raise instead of an ``assert`` statement, so the
            # check is not stripped when Python runs with ``-O``.
            if any(not isinstance(token, str) for token in tokens):
                raise AssertionError
            docs.append(tokens)
    except AssertionError:
        raise AssertionError(" tokenized_docs is not of the typing Iterable[Iterable[str]] ") from None
    except Exception as exc:
        logger.error(exc)
        raise

    vectorizer = Vectorizer(
        tf_type=tf_type,
        idf_type=idf_type,
        dl_type=dl_type,
        norm=norm,
        min_df=min_df,
        max_df=max_df,
        max_n_terms=max_n_terms,
        vocabulary_terms=vocabulary_terms
    )
    doc_term_matrix = vectorizer.fit_transform(docs)

    # Kept for backward compatibility: callers read the matrix from the
    # function attribute after the call.
    gen_model.doc_term_matrix = doc_term_matrix

    return vectorizer
|
|