Source code for topik.tokenizers.simple

import gensim

# decorator that registers tokenizer functions with topik's tokenizer registry
from topik.tokenizers._registry import register


def _simple_document(text, min_length=1, stopwords=None):
    """A text tokenizer that simply lowercases, matches alphabetic
    characters and removes stopwords.  For use on individual text documents.

    Parameters
    ----------
    text : str
        A single document's text to be tokenized
    min_length : int
        Minimum length of any single word
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> text = "frank FRANK the frank dog cat"
    >>> tokenized_text = _simple_document(text)
    >>> tokenized_text == ["frank", "frank", "frank", "dog", "cat"]
    True
    """
    if stopwords is None:
        # default to gensim's built-in English stopword list
        from gensim.parsing.preprocessing import STOPWORDS as stopwords
    return [word for word in gensim.utils.tokenize(text, lower=True)
            if word not in stopwords and len(word) >= min_length]

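# Illustrative sketch (added for clarity, not part of the original module):
# _simple_document also accepts a custom stopword collection and a minimum
# token length, which compose as in this hypothetical doctest:
#
#     >>> _simple_document("frank FRANK the frank dog cat",
#     ...                  min_length=4, stopwords={"cat"})
#     ['frank', 'frank', 'frank']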

@register
def simple(raw_corpus, min_length=1, stopwords=None):
    """A text tokenizer that simply lowercases, matches alphabetic
    characters and removes stopwords.

    Parameters
    ----------
    raw_corpus : iterable of tuple of (doc_id(str/int), doc_text(str))
        body of documents to examine
    min_length : int
        Minimum length of any single word
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> sample_corpus = [("doc1", "frank FRANK the frank dog cat"),
    ...                  ("doc2", "frank a dog of the llama")]
    >>> tokenized_corpora = simple(sample_corpus)
    >>> next(tokenized_corpora) == ("doc1",
    ...     ["frank", "frank", "frank", "dog", "cat"])
    True
    """
    for doc_id, doc_text in raw_corpus:
        yield doc_id, _simple_document(doc_text,
                                       min_length=min_length,
                                       stopwords=stopwords)
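

if __name__ == "__main__":
    # Minimal usage sketch (added illustration, not part of the original
    # module). It assumes `register` returns the decorated function
    # unchanged, as registry decorators typically do, so `simple` remains
    # directly callable here.
    corpus = [("doc1", "frank FRANK the frank dog cat"),
              ("doc2", "frank a dog of the llama")]
    for doc_id, tokens in simple(corpus, min_length=2):
        print(doc_id, tokens)
    # Expected output, given gensim's default English stopword list:
    #   doc1 ['frank', 'frank', 'frank', 'dog', 'cat']
    #   doc2 ['frank', 'dog', 'llama']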