from __future__ import absolute_import, print_function
import itertools
import logging
import re
# imports used only for doctests
from topik.tests import test_data_path
# Module-level side effect: importing this module configures the root logger
# to emit timestamped "<time> : <level> : <message>" records at INFO level
# and above.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
def tokenize_simple(text, min_length=1, stopwords=None):
    """A text tokenizer that simply lowercases, matches alphabetic
    characters and removes stopwords.

    Parameters
    ----------
    text : str
        A single document's text to be tokenized
    min_length : int
        Minimum length of any single word
    stopwords : None or iterable of str
        Collection of words to ignore as tokens.  When None or empty,
        gensim's built-in STOPWORDS collection is used instead.

    Returns
    -------
    list of str
        Lowercased tokens, in document order, that are not stopwords and
        are at least ``min_length`` characters long.

    Examples
    --------
    >>> from topik.readers import read_input
    >>> id_documents = read_input(
    ...     '{}/test_data_json_stream.json'.format(test_data_path),
    ...     content_field="abstract")
    >>> id, doc_text = next(iter(id_documents))
    >>> doc_text
    u'Transition metal oxides are being considered as the next generation \
materials in field such as electronics and advanced catalysts; between\
 them is Tantalum (V) Oxide; however, there are few reports for the \
synthesis of this material at the nanometer size which could have \
unusual properties. Hence, in this work we present the synthesis of \
Ta2O5 nanorods by sol gel method using DNA as structure directing \
agent, the size of the nanorods was of the order of 40 to 100 nm in \
diameter and several microns in length; this easy method can be useful\
 in the preparation of nanomaterials for electronics, biomedical \
applications as well as catalysts.'
    >>> tokens = tokenize_simple(doc_text)
    >>> tokens
    [u'transition', u'metal', u'oxides', u'considered', \
u'generation', u'materials', u'field', u'electronics', \
u'advanced', u'catalysts', u'tantalum', u'v', u'oxide', \
u'reports', u'synthesis', u'material', u'nanometer', u'size', \
u'unusual', u'properties', u'work', u'present', u'synthesis', \
u'ta', u'o', u'nanorods', u'sol', u'gel', u'method', u'dna', \
u'structure', u'directing', u'agent', u'size', u'nanorods', \
u'order', u'nm', u'diameter', u'microns', u'length', u'easy', \
u'method', u'useful', u'preparation', u'nanomaterials', u'electronics', \
u'biomedical', u'applications', u'catalysts']
    """
    import gensim

    if not stopwords:
        # None or an empty collection: fall back to gensim's default list.
        from gensim.parsing.preprocessing import STOPWORDS as stopwords
    elif not isinstance(stopwords, (set, frozenset)):
        # Any iterable is accepted; materialise it once so that membership
        # tests are O(1) and a one-shot iterator is not exhausted by the
        # first lookup.
        stopwords = frozenset(stopwords)

    return [word for word in gensim.utils.tokenize(text, lower=True)
            if word not in stopwords and len(word) >= min_length]
def collect_bigrams_and_trigrams(collection, top_n=10000, min_length=1, min_bigram_freq=50,
                                 min_trigram_freq=20, stopwords=None):
    """Collect bigrams and trigrams from a collection of documents.

    The result is the input to the collocation tokenizer.  Bigrams are pairs
    of words that recur in the collection; trigrams are triplets.

    Parameters
    ----------
    collection : corpus object or iterable of str
        Body of documents to examine.  Either an object exposing
        ``get_generator_without_id()`` or a plain iterable of document texts.
    top_n : int
        limit results to this many entries
    min_length : int
        Minimum length of any single word
    min_bigram_freq : int
        threshold of when to consider a pair of words as a recognized bigram
    min_trigram_freq : int
        threshold of when to consider a triplet of words as a recognized trigram
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Returns
    -------
    tuple of compiled regex
        ``(bigrams_pattern, trigrams_pattern)``; each is a single capturing
        group that alternates over the space-separated n-grams found.

    Examples
    --------
    >>> from topik.readers import read_input
    >>> raw_data = read_input(
    ...     '{}/test_data_json_stream.json'.format(test_data_path),
    ...     content_field="abstract")
    >>> bigrams, trigrams = collect_bigrams_and_trigrams(raw_data, min_bigram_freq=5, min_trigram_freq=3)
    >>> bigrams.pattern
    u'(free standing|ac electrodeposition|centered cubic|spatial resolution|vapor deposition\
|wear resistance|plastic deformation|electrical conductivity|field magnets|v o|\
transmission electron|x ray|et al|ray diffraction|electron microscopy|room \
temperature|diffraction xrd|electron microscope|results indicate|scanning \
electron|m s|doped zno|microscopy tem|polymer matrix|size distribution|mechanical \
properties|grain size|diameters nm|high spatial|particle size|high resolution|ni \
al|diameter nm|range nm|high field|high strength|c c)'
    >>> trigrams.pattern
    u'(differential scanning calorimetry|face centered cubic|ray microanalysis analytical|\
physical vapor deposition|transmission electron microscopy|x ray diffraction|microanalysis \
analytical electron|chemical vapor deposition|high aspect ratio|analytical electron \
microscope|ray diffraction xrd|x ray microanalysis|high spatial resolution|high \
field magnets|atomic force microscopy|electron microscopy tem|narrow size distribution\
|scanning electron microscopy|building high field|silicon oxide nanowires|particle size \
nm)'
    """
    from nltk.collocations import TrigramCollocationFinder
    from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures

    # Corpus objects hand out their raw texts via get_generator_without_id();
    # anything else is treated as a plain iterable of document strings.
    get_texts = getattr(collection, 'get_generator_without_id', None)
    texts = get_texts() if callable(get_texts) else collection

    # The same stopwords are reused for every document: materialise them once
    # so a one-shot iterator is not exhausted while tokenizing the first text.
    if stopwords is not None and not isinstance(stopwords, (set, frozenset)):
        stopwords = frozenset(stopwords)

    # generator of documents, turn each element to its list of words
    documents = (tokenize_simple(text, min_length=min_length, stopwords=stopwords)
                 for text in texts)
    # generator, concatenate (chain) all words into a single sequence, lazily
    words = itertools.chain.from_iterable(documents)

    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_freq_filter(min_trigram_freq)
    trigrams = [' '.join(w) for w in tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)]
    logging.info("%i trigrams found: %s...", len(trigrams), trigrams[:20])

    # The bigram finder is derived from the trigram finder, so the word
    # stream only has to be consumed once.
    bcf = tcf.bigram_finder()
    bcf.apply_freq_filter(min_bigram_freq)
    bigrams = [' '.join(w) for w in bcf.nbest(BigramAssocMeasures.pmi, top_n)]
    logging.info("%i bigrams found: %s...", len(bigrams), bigrams[:20])

    bigrams_patterns = re.compile('(%s)' % '|'.join(bigrams), re.UNICODE)
    trigrams_patterns = re.compile('(%s)' % '|'.join(trigrams), re.UNICODE)
    return bigrams_patterns, trigrams_patterns
def tokenize_collocation(text, patterns, min_length=1, stopwords=None):
    """A text tokenizer that includes collocations (bigrams and trigrams).

    A collocation is a sequence of words or terms that co-occur more often
    than would be expected by chance.  This function breaks a raw document
    up into tokens based on a pre-established collection of bigrams and
    trigrams.  This collection is derived from a body of many documents, and
    must be obtained in a prior step using the collect_bigrams_and_trigrams
    function.

    The text is first reduced to simple tokens and re-joined with single
    spaces; every match of each pattern then has its spaces replaced by
    underscores so that the n-gram survives the final whitespace split as
    one token.  Patterns are applied one after another in the order given,
    so words already joined by an earlier pattern are no longer
    space-separated when later patterns run.

    Parameters
    ----------
    text : str
        A single document's text to be tokenized
    patterns : tuple of compiled regex object to find n-grams
        Obtained from collect_bigrams_and_trigrams function
    min_length : int
        Minimum length of any single word
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Returns
    -------
    list of str
        Tokens of the document, with recognized n-grams joined by ``_``.

    Examples
    --------
    >>> from topik.readers import read_input
    >>> id_documents = read_input('{}/test_data_json_stream.json'.format(test_data_path), content_field="abstract")
    >>> patterns = collect_bigrams_and_trigrams(id_documents, min_bigram_freq=2, min_trigram_freq=2)
    >>> id, doc_text = next(iter(id_documents))
    >>> tokenized_text = tokenize_collocation(doc_text, patterns)
    >>> tokenized_text
    [u'transition_metal', u'oxides', u'considered', u'generation', \
u'materials', u'field', u'electronics', u'advanced', u'catalysts', \
u'tantalum', u'v_oxide', u'reports', u'synthesis_material', \
u'nanometer_size', u'unusual', u'properties', u'work_present', \
u'synthesis', u'ta', u'o', u'nanorods', u'sol', u'gel', u'method', \
u'dna', u'structure', u'directing', u'agent', u'size', u'nanorods', \
u'order', u'nm_diameter', u'microns', u'length', u'easy', u'method', \
u'useful', u'preparation', u'nanomaterials', u'electronics', u'biomedical', \
u'applications', u'catalysts']
    """
    def _glue(match):
        # Turn the matched n-gram into a single whitespace-free token.
        return match.group(0).replace(' ', '_')

    simple_tokens = tokenize_simple(text, min_length=min_length, stopwords=stopwords)
    joined = ' '.join(simple_tokens)
    for ngram_regex in patterns:
        joined = re.sub(ngram_regex, _glue, joined)
    return joined.split()
def collect_entities(collection, freq_min=2, freq_max=10000):
    """Return noun phrases from collection of documents.

    Noun phrases are extracted from every document with TextBlob and
    counted across the whole collection; only phrases whose total count
    lies within ``[freq_min, freq_max]`` are returned.

    Parameters
    ----------
    collection : Corpus-base derived object or iterable collection of raw text
        Either an object exposing ``get_generator_without_id()`` or a plain
        iterable of document texts.
    freq_min : int
        Minimum frequency of a noun phrase occurrences in order to retrieve it. Default is 2.
    freq_max : int
        Maximum frequency of a noun phrase occurrences in order to retrieve it. Default is 10000.

    Returns
    -------
    set of str
        The noun phrases that satisfy the frequency bounds.
    """
    from collections import Counter
    from textblob import TextBlob

    # Corpus objects hand out their raw texts via get_generator_without_id();
    # anything else is treated as a plain iterable of document strings.
    get_texts = getattr(collection, 'get_generator_without_id', None)
    texts = get_texts() if callable(get_texts) else collection

    phrase_counts = Counter()
    for docs_examined, doc in enumerate(texts):
        # Progress report every 1000 documents (before counting the current one).
        if docs_examined > 0 and docs_examined % 1000 == 0:
            # most_common(1) is empty when no phrase has been seen yet;
            # indexing it unconditionally would raise IndexError.
            most_frequent = phrase_counts.most_common(1)
            logging.info("at document #%i, considering %i phrases: %s...",
                         docs_examined, len(phrase_counts),
                         most_frequent[0] if most_frequent else None)
        phrase_counts.update(TextBlob(doc).noun_phrases)

    # Keep only noun phrases whose frequency lies between 'freq_min' and
    # 'freq_max', inclusive.
    return {phrase for phrase, count in phrase_counts.items()
            if freq_min <= count <= freq_max}
def tokenize_entities(text, entities, min_length=1, stopwords=None):
    """A tokenizer that extracts noun phrases from text.

    Requires that you first establish entities using the collect_entities
    function.  Every noun phrase TextBlob finds in ``text`` that is also a
    member of ``entities`` becomes one token: its words are run through
    tokenize_simple and joined with underscores.  Phrases left with no
    words after that filtering are dropped.

    Parameters
    ----------
    text : str
        A single document's text to be tokenized
    entities : iterable of str
        Collection of noun phrases, obtained from collect_entities function
    min_length : int
        Minimum length of any single word
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Returns
    -------
    list of str
        One underscore-joined token per recognized noun phrase.

    Examples
    --------
    >>> from topik.readers import read_input
    >>> id_documents = read_input('{}/test_data_json_stream.json'.format(test_data_path), "abstract")
    >>> entities = collect_entities(id_documents)
    >>> len(entities)
    220
    >>> i = iter(id_documents)
    >>> _, doc_text = next(i)
    >>> doc_text
    u'Transition metal oxides are being considered as the next generation \
materials in field such as electronics and advanced catalysts; between\
 them is Tantalum (V) Oxide; however, there are few reports for the \
synthesis of this material at the nanometer size which could have \
unusual properties. Hence, in this work we present the synthesis of \
Ta2O5 nanorods by sol gel method using DNA as structure directing \
agent, the size of the nanorods was of the order of 40 to 100 nm in \
diameter and several microns in length; this easy method can be useful\
 in the preparation of nanomaterials for electronics, biomedical \
applications as well as catalysts.'
    >>> tokenized_text = tokenize_entities(doc_text, entities)
    >>> tokenized_text
    [u'transition']
    """
    from textblob import TextBlob

    # Lazily keep only the noun phrases that were collected as entities ...
    known_phrases = (phrase for phrase in TextBlob(text).noun_phrases
                     if phrase in entities)
    # ... strip stop words / short words from each and glue the rest together ...
    joined_tokens = ("_".join(tokenize_simple(phrase, min_length=min_length,
                                              stopwords=stopwords))
                     for phrase in known_phrases)
    # ... and never emit an empty string for a phrase that was filtered away.
    return [token for token in joined_tokens if token]
def tokenize_mixed(text, entities, min_length=1, stopwords=None):
    """A text tokenizer that mixes entity tokens ('noun phrases') and simple words.

    Only the noun phrases TextBlob finds in ``text`` contribute tokens.  A
    multi-word noun phrase that is not in ``entities`` is broken apart into
    its simple word tokens; every other noun phrase (a single word, or a
    phrase present in ``entities``) is emitted as one token with its words
    joined by underscores.

    Parameters
    ----------
    text : str
        A single document's text to be tokenized
    entities : iterable of str
        Collection of noun phrases, obtained from collect_entities function
    min_length : int
        Minimum length of any single word
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Returns
    -------
    list of str
        Tokens in the order their noun phrases occur.

    Examples
    --------
    >>> from topik.readers import read_input
    >>> raw_data = read_input('{}/test_data_json_stream.json'.format(test_data_path), content_field="abstract")
    >>> entities = collect_entities(raw_data)
    >>> id, text = next(iter(raw_data))
    >>> tokenized_text = tokenize_mixed(text, entities, min_length=3)
    >>> tokenized_text
    [u'transition', u'metal', u'oxides', u'generation', u'materials', u'tantalum', \
u'oxide', u'nanometer', u'size', u'unusual', u'properties', u'sol', u'gel', \
u'method', u'dna', u'easy', u'method', u'biomedical', u'applications']
    """
    from textblob import TextBlob

    tokens = []
    for phrase in TextBlob(text).noun_phrases:
        # Stop-word / length filtering is the same whichever way the phrase
        # ends up being emitted.
        words = tokenize_simple(phrase, min_length=min_length, stopwords=stopwords)
        if ' ' not in phrase or phrase in entities:
            # Single word or recognized entity: keep it together as one token,
            # unless filtering left nothing to join.
            joined = "_".join(words)
            if joined:
                tokens.append(joined)
        else:
            # The phrase does not occur often enough in the collection to be
            # considered an entity, so its words are emitted individually.
            tokens.extend(words)
    return tokens
# Registry mapping each tokenizer's public name to its implementation.
# Add additional methods here as necessary to expose them to outside consumers.
tokenizer_methods = {
    "simple": tokenize_simple,
    "collocation": tokenize_collocation,
    "entities": tokenize_entities,
    "mixed": tokenize_mixed,
}