Source code for topik.tokenizers.entities

import logging

from textblob import TextBlob

from topik.tokenizers.simple import _simple_document

from topik.tokenizers._registry import register

# sample corpus used only in the doctests below
sample_corpus = [
    ("doc1", u"Frank the Swank-Tank walked his sassy unicorn, Brony,"
             u" to prancercise class daily.  Prancercise was "
             u"a tremendously popular pastime of sassy "
             u"unicorns and retirees alike."),
    ("doc2", u"Prancercise is a form of both art and fitniss, "
             u"originally invented by sassy unicorns. It has "
             u"recently been popularized by such retired "
             u"celebrities as Frank The Swank-Tank.")]

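# Note: the helpers below all rely on TextBlob(...).noun_phrases, which needs
# the NLTK corpora bundled with TextBlob (typically 'brown' and 'punkt').  If
# those corpora are missing, noun-phrase extraction raises a
# MissingCorpusError; they can usually be fetched with
# ``python -m textblob.download_corpora``.
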
def _collect_entities(raw_corpus, freq_min=2, freq_max=10000):
    """Return noun phrases from collection of documents.

    Parameters
    ----------
    raw_corpus: Corpus-base derived object or iterable collection of raw text
    freq_min: int
        Minimum frequency of a noun phrase occurrences in order to retrieve it. Default is 2.
    freq_max: int
        Maximum frequency of a noun phrase occurrences in order to retrieve it. Default is 10000.

    Examples
    --------
    >>> ents = _collect_entities(sample_corpus)
    >>> ents == {'swank-tank', 'prancercise', 'sassy unicorns', 'frank'}
    True
    """

    np_counts_total = {}
    docs_examined = 0
    for doc_id, doc_text in raw_corpus:
        if docs_examined > 0 and docs_examined % 1000 == 0:
            # Periodically log progress, reporting the most frequent phrase seen so far
            sorted_phrases = sorted(np_counts_total.items(),
                                    key=lambda item: -item[1])
            np_counts_total = dict(sorted_phrases)
            logging.info("at document #%i, considering %i phrases: %s...",
                         docs_examined, len(np_counts_total), sorted_phrases[0])

        for np in TextBlob(doc_text).noun_phrases:
            np_counts_total[np] = np_counts_total.get(np, 0) + 1
        docs_examined += 1

    # Keep only the noun phrases whose frequency falls within [freq_min, freq_max]
    np_counts = {}
    for np, count in np_counts_total.items():
        if freq_max >= count >= freq_min:
            np_counts[np] = count

    return set(np_counts)

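
# A minimal, equivalent sketch (not part of the original module) of what
# _collect_entities computes, written with collections.Counter: count every
# noun phrase TextBlob finds in the corpus, then keep the phrases whose count
# falls inside [freq_min, freq_max].  The helper name is illustrative only.
def _collect_entities_counter_sketch(raw_corpus, freq_min=2, freq_max=10000):
    from collections import Counter
    counts = Counter(np
                     for _doc_id, doc_text in raw_corpus
                     for np in TextBlob(doc_text).noun_phrases)
    return {np for np, count in counts.items() if freq_min <= count <= freq_max}
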

def _tokenize_entities_document(text, entities, min_length=1, stopwords=None):
    """
    A text tokenizer that passes through only the terms ('entities')
    explicitly contained in the entities argument.

    Parameters
    ----------
    text : str
        A single text document to be tokenized
    entities : iterable of str
        Collection of noun phrases, obtained from the _collect_entities function
    min_length : int
        Minimum length of any single word
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> ents = _collect_entities(sample_corpus)
    >>> text = sample_corpus[0][1]
    >>> tokenized_text = _tokenize_entities_document(text, ents)
    >>> tokenized_text == [
    ...     u'frank', u'swank_tank', u'prancercise', u'sassy_unicorns']
    True
    """
    result = []
    for np in TextBlob(text).noun_phrases:
        if np in entities:
            # filter out stop words
            tmp = "_".join(_simple_document(np, min_length=min_length, stopwords=stopwords))
            # if we end up with nothing, don't append an empty string
            if tmp:
                result.append(tmp)
    return result


def _tokenize_mixed_document(text, entities, min_length=1, stopwords=None):
    """
    A text tokenizer that retrieves entities ('noun phrases') first, then
    falls back to simple word tokens for the rest of the text.

    Parameters
    ----------
    text : str
        A single text document to be tokenized
    entities : iterable of str
        Collection of noun phrases, obtained from the _collect_entities function
    min_length : int
        Minimum length of any single word
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> ents = _collect_entities(sample_corpus)
    >>> text = sample_corpus[0][1]
    >>> tokenized_text = _tokenize_mixed_document(text, ents)
    >>> tokenized_text == [u'frank', u'swank_tank', u'sassy', u'unicorn',
    ... u'brony', u'prancercise', u'class', u'prancercise', u'popular',
    ... u'pastime', u'sassy_unicorns']
    True
    """
    result = []
    for np in TextBlob(text).noun_phrases:
        if ' ' in np and np not in entities:
            # multi-word phrase that was not retained as an entity: break it into simple word tokens
            result.extend(_simple_document(np, min_length=min_length, stopwords=stopwords))
        else:
            # filter out stop words
            tmp = "_".join(_simple_document(np, min_length=min_length, stopwords=stopwords))
            # if we end up with nothing, don't append an empty string
            if tmp:
                result.append(tmp)
    return result


@register
def entities(corpus, min_length=1, freq_min=2, freq_max=10000, stopwords=None):
    """
    A tokenizer that extracts noun phrases from a corpus, then tokenizes all
    documents using those extracted phrases.

    Parameters
    ----------
    corpus : iterable of (str, str) tuples
        A collection of (doc_id, text) pairs to be tokenized
    min_length : int
        Minimum length of any single word
    freq_min : int
        Minimum occurrence of phrase in order to be considered
    freq_max : int
        Maximum occurrence of phrase, beyond which it is ignored
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> tokenized_corpora = entities(sample_corpus)
    >>> next(tokenized_corpora) == ('doc1',
    ...     [u'frank', u'swank_tank', u'prancercise', u'sassy_unicorns'])
    True
    """
    entities = _collect_entities(corpus, freq_min=freq_min, freq_max=freq_max)
    for doc_id, doc_text in corpus:
        yield doc_id, _tokenize_entities_document(doc_text, entities,
                                                  min_length=min_length,
                                                  stopwords=stopwords)


@register
def mixed(corpus, min_length=1, freq_min=2, freq_max=10000, stopwords=None):
    """
    A text tokenizer that retrieves entities ('noun phrases') first, then
    falls back to simple word tokens for the rest of the text.

    Parameters
    ----------
    corpus : iterable of (str, str) tuples
        A collection of (doc_id, text) pairs to be tokenized
    min_length : int
        Minimum length of any single word
    freq_min : int
        Minimum occurrence of phrase in order to be considered
    freq_max : int
        Maximum occurrence of phrase, beyond which it is ignored
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> tokenized_corpora = mixed(sample_corpus)
    >>> next(tokenized_corpora) == ('doc1',
    ...     [u'frank', u'swank_tank', u'sassy', u'unicorn', u'brony',
    ...      u'prancercise', u'class', u'prancercise', u'popular',
    ...      u'pastime', u'sassy_unicorns'])
    True
    """
    entities = _collect_entities(corpus, freq_min=freq_min, freq_max=freq_max)
    for doc_id, doc_text in corpus:
        yield doc_id, _tokenize_mixed_document(doc_text, entities,
                                               min_length=min_length,
                                               stopwords=stopwords)