import gensim
from topik.tokenizers._registry import register


def _simple_document(text, min_length=1, stopwords=None):
"""A text tokenizer that simply lowercases, matches alphabetic
characters and removes stopwords. For use on individual text documents.
Parameters
----------
text : str
A single document's text to be tokenized
min_length : int
Minimum length of any single word
stopwords: None or iterable of str
Collection of words to ignore as tokens
Examples
--------
>>> text = "frank FRANK the frank dog cat"
>>> tokenized_text = _simple_document(text)
>>> tokenized_text == ["frank", "frank", "frank", "dog", "cat"]
True
"""
if not stopwords:
from gensim.parsing.preprocessing import STOPWORDS as stopwords
return [word for word in gensim.utils.tokenize(text, lower=True)
if word not in stopwords and len(word) >= min_length]
@register
def simple(raw_corpus, min_length=1, stopwords=None):
"""A text tokenizer that simply lowercases, matches alphabetic
characters and removes stopwords.
Parameters
----------
raw_corpus : iterable of tuple of (doc_id(str/int), doc_text(str))
body of documents to examine
min_length : int
Minimum length of any single word
stopwords: None or iterable of str
Collection of words to ignore as tokens
Examples
--------
>>> sample_corpus = [("doc1", "frank FRANK the frank dog cat"),
... ("doc2", "frank a dog of the llama")]
>>> tokenized_corpora = simple(sample_corpus)
>>> next(tokenized_corpora) == ("doc1",
... ["frank", "frank", "frank", "dog", "cat"])
True
"""
for doc_id, doc_text in raw_corpus:
yield(doc_id, _simple_document(doc_text, min_length=min_length, stopwords=stopwords))
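

# A minimal usage sketch, not part of the module API above: the demo corpus and
# the min_length/stopwords values are arbitrary choices for illustration.
if __name__ == "__main__":
    demo_corpus = [("doc1", "frank FRANK the frank dog cat"),
                   ("doc2", "frank a dog of the llama")]
    # Keep only words of three or more characters and treat "llama" as a stopword;
    # `simple` lazily yields (doc_id, token_list) pairs.
    for doc_id, tokens in simple(demo_corpus, min_length=3, stopwords={"llama"}):
        print(doc_id, tokens)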