Source code for topik.tokenizers

from __future__ import absolute_import, print_function

import itertools
import logging
import re

# imports used only for doctests
from topik.tests import test_data_path


logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)


def tokenize_simple(text, min_length=1, stopwords=None):
    """A text tokenizer that simply lowercases, matches alphabetic
    characters and removes stopwords.

    Parameters
    ----------
    text : str
        A single document's text to be tokenized
    min_length : int
        Minimum length of any single word
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> from topik.readers import read_input
    >>> id_documents = read_input(
    ...     '{}/test_data_json_stream.json'.format(test_data_path),
    ...     content_field="abstract")
    >>> id, doc_text = next(iter(id_documents))
    >>> doc_text
    u'Transition metal oxides are being considered as the next generation \
materials in field such as electronics and advanced catalysts; between \
them is Tantalum (V) Oxide; however, there are few reports for the \
synthesis of this material at the nanometer size which could have \
unusual properties. Hence, in this work we present the synthesis of \
Ta2O5 nanorods by sol gel method using DNA as structure directing \
agent, the size of the nanorods was of the order of 40 to 100 nm in \
diameter and several microns in length; this easy method can be useful \
in the preparation of nanomaterials for electronics, biomedical \
applications as well as catalysts.'
    >>> tokens = tokenize_simple(doc_text)
    >>> tokens
    [u'transition', u'metal', u'oxides', u'considered', \
u'generation', u'materials', u'field', u'electronics', \
u'advanced', u'catalysts', u'tantalum', u'v', u'oxide', \
u'reports', u'synthesis', u'material', u'nanometer', u'size', \
u'unusual', u'properties', u'work', u'present', u'synthesis', \
u'ta', u'o', u'nanorods', u'sol', u'gel', u'method', u'dna', \
u'structure', u'directing', u'agent', u'size', u'nanorods', \
u'order', u'nm', u'diameter', u'microns', u'length', u'easy', \
u'method', u'useful', u'preparation', u'nanomaterials', u'electronics', \
u'biomedical', u'applications', u'catalysts']
    """
    import gensim
    if not stopwords:
        from gensim.parsing.preprocessing import STOPWORDS as stopwords
    return [word for word in gensim.utils.tokenize(text, lower=True)
            if word not in stopwords and len(word) >= min_length]
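
# A minimal usage sketch for tokenize_simple with an explicit stopword set
# (hypothetical input text; passing stopwords avoids depending on gensim's
# default STOPWORDS list, so the result is deterministic):
#
#     tokenize_simple("The quick brown Fox jumps!", min_length=3, stopwords={"the"})
#     # -> ['quick', 'brown', 'fox', 'jumps']
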
def collect_bigrams_and_trigrams(collection, top_n=10000, min_length=1,
                                 min_bigram_freq=50, min_trigram_freq=20,
                                 stopwords=None):
    """Collects bigrams and trigrams from a collection of documents, for input
    to the collocation tokenizer.

    Bigrams are pairs of words that recur in the collection; trigrams are
    triplets of words.

    Parameters
    ----------
    collection : iterable of str
        Body of documents to examine
    top_n : int
        Limit results to this many entries
    min_length : int
        Minimum length of any single word
    min_bigram_freq : int
        Threshold of when to consider a pair of words as a recognized bigram
    min_trigram_freq : int
        Threshold of when to consider a triplet of words as a recognized trigram
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> from topik.readers import read_input
    >>> raw_data = read_input(
    ...     '{}/test_data_json_stream.json'.format(test_data_path),
    ...     content_field="abstract")
    >>> bigrams, trigrams = collect_bigrams_and_trigrams(raw_data, min_bigram_freq=5, min_trigram_freq=3)
    >>> bigrams.pattern
    u'(free standing|ac electrodeposition|centered cubic|spatial resolution|vapor deposition\
|wear resistance|plastic deformation|electrical conductivity|field magnets|v o|\
transmission electron|x ray|et al|ray diffraction|electron microscopy|room \
temperature|diffraction xrd|electron microscope|results indicate|scanning \
electron|m s|doped zno|microscopy tem|polymer matrix|size distribution|mechanical \
properties|grain size|diameters nm|high spatial|particle size|high resolution|ni \
al|diameter nm|range nm|high field|high strength|c c)'
    >>> trigrams.pattern
    u'(differential scanning calorimetry|face centered cubic|ray microanalysis analytical|\
physical vapor deposition|transmission electron microscopy|x ray diffraction|microanalysis \
analytical electron|chemical vapor deposition|high aspect ratio|analytical electron \
microscope|ray diffraction xrd|x ray microanalysis|high spatial resolution|high \
field magnets|atomic force microscopy|electron microscopy tem|narrow size distribution\
|scanning electron microscopy|building high field|silicon oxide nanowires|particle size \
nm)'
    """
    from nltk.collocations import TrigramCollocationFinder
    from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures

    # generator of documents: turn each element into its list of words
    documents = (tokenize_simple(text, min_length=min_length, stopwords=stopwords)
                 for text in collection.get_generator_without_id())
    # generator: concatenate (chain) all words into a single sequence, lazily
    words = itertools.chain.from_iterable(documents)
    tcf = TrigramCollocationFinder.from_words(iter(words))

    tcf.apply_freq_filter(min_trigram_freq)
    trigrams = [' '.join(w) for w in tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)]
    logging.info("%i trigrams found: %s..." % (len(trigrams), trigrams[:20]))

    bcf = tcf.bigram_finder()
    bcf.apply_freq_filter(min_bigram_freq)
    bigrams = [' '.join(w) for w in bcf.nbest(BigramAssocMeasures.pmi, top_n)]
    logging.info("%i bigrams found: %s..." % (len(bigrams), bigrams[:20]))

    bigrams_patterns = re.compile('(%s)' % '|'.join(bigrams), re.UNICODE)
    trigrams_patterns = re.compile('(%s)' % '|'.join(trigrams), re.UNICODE)
    return bigrams_patterns, trigrams_patterns
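
# An illustrative sketch of the underlying NLTK collocation step on a tiny,
# hand-made word sequence (toy data, not from the topik test set); the real
# function builds this sequence by chaining tokenize_simple over the collection:
#
#     from nltk.collocations import TrigramCollocationFinder
#     from nltk.metrics import TrigramAssocMeasures
#     words = ['x', 'ray', 'diffraction', 'data', 'x', 'ray', 'diffraction']
#     tcf = TrigramCollocationFinder.from_words(words)
#     tcf.apply_freq_filter(2)                     # keep trigrams seen at least twice
#     tcf.nbest(TrigramAssocMeasures.chi_sq, 10)   # -> [('x', 'ray', 'diffraction')]
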
def tokenize_collocation(text, patterns, min_length=1, stopwords=None):
    """A text tokenizer that includes collocations (bigrams and trigrams).

    A collocation is a sequence of words or terms that co-occur more often
    than would be expected by chance.  This function breaks a raw document
    up into tokens based on a pre-established collection of bigrams and
    trigrams.  This collection is derived from a body of many documents, and
    must be obtained in a prior step using the collect_bigrams_and_trigrams
    function, which uses nltk.collocations.TrigramCollocationFinder to find
    the trigrams and bigrams.

    Parameters
    ----------
    text : str
        A single document's text to be tokenized
    patterns : tuple of compiled regex objects used to find n-grams
        Obtained from the collect_bigrams_and_trigrams function
    min_length : int
        Minimum length of any single word
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> from topik.readers import read_input
    >>> id_documents = read_input('{}/test_data_json_stream.json'.format(test_data_path), content_field="abstract")
    >>> patterns = collect_bigrams_and_trigrams(id_documents, min_bigram_freq=2, min_trigram_freq=2)
    >>> id, doc_text = next(iter(id_documents))
    >>> tokenized_text = tokenize_collocation(doc_text, patterns)
    >>> tokenized_text
    [u'transition_metal', u'oxides', u'considered', u'generation', \
u'materials', u'field', u'electronics', u'advanced', u'catalysts', \
u'tantalum', u'v_oxide', u'reports', u'synthesis_material', \
u'nanometer_size', u'unusual', u'properties', u'work_present', \
u'synthesis', u'ta', u'o', u'nanorods', u'sol', u'gel', u'method', \
u'dna', u'structure', u'directing', u'agent', u'size', u'nanorods', \
u'order', u'nm_diameter', u'microns', u'length', u'easy', u'method', \
u'useful', u'preparation', u'nanomaterials', u'electronics', u'biomedical', \
u'applications', u'catalysts']
    """
    text = ' '.join(tokenize_simple(text, min_length=min_length, stopwords=stopwords))
    for pattern in patterns:
        text = re.sub(pattern, lambda match: match.group(0).replace(' ', '_'), text)
    return text.split()
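
# A small sketch of the underscore-joining behaviour with a hand-built pattern
# tuple (hypothetical input; a real patterns tuple comes from
# collect_bigrams_and_trigrams):
#
#     patterns = (re.compile('(x ray)', re.UNICODE),)
#     tokenize_collocation('The X ray data.', patterns, stopwords={'the'})
#     # -> ['x_ray', 'data']
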
def collect_entities(collection, freq_min=2, freq_max=10000):
    """Return noun phrases from a collection of documents.

    Parameters
    ----------
    collection : Corpus-base derived object or iterable
        Collection of raw text
    freq_min : int
        Minimum number of occurrences of a noun phrase for it to be
        retrieved. Default is 2.
    freq_max : int
        Maximum number of occurrences of a noun phrase for it to be
        retrieved. Default is 10000.
    """
    from textblob import TextBlob

    np_counts_total = {}
    docs_examined = 0
    for doc in collection.get_generator_without_id():
        if docs_examined > 0 and docs_examined % 1000 == 0:
            sorted_phrases = sorted(np_counts_total.items(),
                                    key=lambda item: -item[1])
            np_counts_total = dict(sorted_phrases)
            logging.info("at document #%i, considering %i phrases: %s..." %
                         (docs_examined, len(np_counts_total), sorted_phrases[0]))
        for np in TextBlob(doc).noun_phrases:
            np_counts_total[np] = np_counts_total.get(np, 0) + 1
        docs_examined += 1

    # Remove noun phrases that occur more often than 'freq_max' or less often
    # than 'freq_min' in the collection
    np_counts = {}
    for np, count in np_counts_total.items():
        if freq_max >= count >= freq_min:
            np_counts[np] = count

    return set(np_counts)
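
# collect_entities only needs an object exposing get_generator_without_id();
# a minimal in-memory stand-in (hypothetical class, not part of topik) is
# enough to experiment with.  The exact noun phrases returned depend on
# TextBlob's extractor:
#
#     class TinyCollection(object):
#         def __init__(self, docs):
#             self.docs = docs
#         def get_generator_without_id(self):
#             return iter(self.docs)
#
#     docs = ["Tantalum oxide nanorods were grown by a sol gel method.",
#             "Tantalum oxide nanorods show unusual optical properties."]
#     entities = collect_entities(TinyCollection(docs), freq_min=2)
#     # 'entities' is a set of noun phrases seen between freq_min and freq_max times
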
def tokenize_entities(text, entities, min_length=1, stopwords=None):
    """A tokenizer that extracts noun phrases from text.

    Requires that you first establish entities using the collect_entities
    function.

    Parameters
    ----------
    text : str
        A single document's text to be tokenized
    entities : iterable of str
        Collection of noun phrases, obtained from the collect_entities function
    min_length : int
        Minimum length of any single word
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> from topik.readers import read_input
    >>> id_documents = read_input('{}/test_data_json_stream.json'.format(test_data_path), "abstract")
    >>> entities = collect_entities(id_documents)
    >>> len(entities)
    220
    >>> i = iter(id_documents)
    >>> _, doc_text = next(i)
    >>> doc_text
    u'Transition metal oxides are being considered as the next generation \
materials in field such as electronics and advanced catalysts; between \
them is Tantalum (V) Oxide; however, there are few reports for the \
synthesis of this material at the nanometer size which could have \
unusual properties. Hence, in this work we present the synthesis of \
Ta2O5 nanorods by sol gel method using DNA as structure directing \
agent, the size of the nanorods was of the order of 40 to 100 nm in \
diameter and several microns in length; this easy method can be useful \
in the preparation of nanomaterials for electronics, biomedical \
applications as well as catalysts.'
    >>> tokenized_text = tokenize_entities(doc_text, entities)
    >>> tokenized_text
    [u'transition']
    """
    from textblob import TextBlob
    result = []
    for np in TextBlob(text).noun_phrases:
        if np in entities:
            # filter out stop words
            tmp = "_".join(tokenize_simple(np, min_length=min_length, stopwords=stopwords))
            # if we end up with nothing, don't append an empty string
            if tmp:
                result.append(tmp)
    return result
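
# A short sketch with a hand-made entity set (hypothetical phrase; whether it
# is emitted depends on TextBlob extracting the same noun phrase from the text):
#
#     entities = {u'tantalum oxide nanorods'}
#     tokenize_entities("Tantalum oxide nanorods were grown in solution.", entities)
#     # -> [u'tantalum_oxide_nanorods'] if TextBlob extracts that noun phrase
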
def tokenize_mixed(text, entities, min_length=1, stopwords=None):
    """A text tokenizer that retrieves entities ('noun phrases') first and
    simple words for the rest of the text.

    Parameters
    ----------
    text : str
        A single document's text to be tokenized
    entities : iterable of str
        Collection of noun phrases, obtained from the collect_entities function
    min_length : int
        Minimum length of any single word
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> from topik.readers import read_input
    >>> raw_data = read_input('{}/test_data_json_stream.json'.format(test_data_path), content_field="abstract")
    >>> entities = collect_entities(raw_data)
    >>> id, text = next(iter(raw_data))
    >>> tokenized_text = tokenize_mixed(text, entities, min_length=3)
    >>> tokenized_text
    [u'transition', u'metal', u'oxides', u'generation', u'materials', u'tantalum', \
u'oxide', u'nanometer', u'size', u'unusual', u'properties', u'sol', u'gel', \
u'method', u'dna', u'easy', u'method', u'biomedical', u'applications']
    """
    from textblob import TextBlob
    result = []
    for np in TextBlob(text).noun_phrases:
        if ' ' in np and np not in entities:
            # break apart the noun phrase; it does not occur often enough in
            # the collection of text to be considered a recognized entity
            result.extend(tokenize_simple(np, min_length=min_length, stopwords=stopwords))
        else:
            # filter out stop words
            tmp = "_".join(tokenize_simple(np, min_length=min_length, stopwords=stopwords))
            # if we end up with nothing, don't append an empty string
            if tmp:
                result.append(tmp)
    return result
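
# A sketch contrasting tokenize_mixed with tokenize_entities on the same
# hypothetical entity set: noun phrases found in 'entities' are joined with
# underscores, while other multi-word noun phrases are broken into simple
# word tokens.  The exact output depends on the noun phrases TextBlob extracts:
#
#     entities = {u'sol gel method'}
#     tokenize_mixed("Nanorods were grown by a sol gel method.", entities, min_length=3)
#     # -> something like [u'nanorods', u'sol_gel_method']
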
# Add additional methods here as necessary to expose them to outside consumers.
tokenizer_methods = {"simple": tokenize_simple,
                     "collocation": tokenize_collocation,
                     "entities": tokenize_entities,
                     "mixed": tokenize_mixed}
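
# A dispatch sketch using the registry above; note that the "entities" and
# "mixed" tokenizers also require an entity set, and "collocation" requires
# the patterns returned by collect_bigrams_and_trigrams (example text is
# hypothetical):
#
#     tokenize = tokenizer_methods["simple"]
#     tokenize("Transition metal oxides for advanced catalysts.", min_length=3)
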