# Source code for topik.tokenizers.tests.test_ngrams

from topik.tokenizers.ngrams import _collect_bigrams_and_trigrams, \
    _collocation_document, ngrams

# Two-document toy corpus of (id, text) tuples shared by the tests in this
# module.  Each text is passed through str(), as in the original fixture.
# NOTE: "fitniss" is spelled exactly as the expected token lists in the tests
# spell it; do not correct one without the other.
sample_data = [
    (
        "doc1",
        str(
            u"Frank the Swank-Tank walked his sassy unicorn, Brony, "
            u"to prancercise class daily.  "
            u"Prancercise was a tremendously popular pastime of sassy "
            u"unicorns and retirees alike."
        ),
    ),
    (
        "doc2",
        str(
            u"Prancercise is a form of both art and fitniss, "
            u"originally invented by sassy unicorns. "
            u"It has recently been popularized by such retired "
            u"celebrities as Frank The Swank-Tank."
        ),
    ),
]


# Calls ngrams on the sample corpus at import time, using the same frequency
# bounds that test_ngrams passes.
# NOTE(review): `x` is not referenced anywhere in the part of the file visible
# here -- confirm whether this module-level call is still needed.
x = ngrams(sample_data, freq_bounds=[(2, 100), (2, 100)])


def test__collect_bigrams_and_trigrams():
    """Check the n-gram patterns collected from ``sample_data``.

    Calls ``_collect_bigrams_and_trigrams`` with ``min_freqs=[2, 2]`` and
    compares the ``.pattern`` attribute of the first two returned items
    against the expected alternation strings.
    """
    bigrams_and_trigrams = _collect_bigrams_and_trigrams(
        sample_data, min_freqs=[2, 2])

    # Index 0 carries the two-word alternation, index 1 the three-word one.
    assert bigrams_and_trigrams[0].pattern == (
        u'(frank swank|swank tank|sassy unicorns)')
    assert bigrams_and_trigrams[1].pattern == u'(frank swank tank)'


def test__collocation_document():
    """Check ``_collocation_document`` on both sample texts.

    The patterns come from ``_collect_bigrams_and_trigrams`` with
    ``min_freqs=[2, 2]``; each text in ``sample_data`` is then passed to
    ``_collocation_document`` and the result is compared with the expected
    token list, in which the matched pairs appear joined by an underscore
    (``frank_swank``, ``sassy_unicorns``).
    """
    bigrams_and_trigrams = _collect_bigrams_and_trigrams(
        sample_data, min_freqs=[2, 2])

    expected_doc1 = [
        u'frank_swank', u'tank', u'walked', u'sassy', u'unicorn', u'brony',
        u'prancercise', u'class', u'daily', u'prancercise', u'tremendously',
        u'popular', u'pastime', u'sassy_unicorns', u'retirees', u'alike',
    ]
    expected_doc2 = [
        u'prancercise', u'form', u'art', u'fitniss', u'originally',
        u'invented', u'sassy_unicorns', u'recently', u'popularized',
        u'retired', u'celebrities', u'frank_swank', u'tank',
    ]

    assert _collocation_document(
        sample_data[0][1], bigrams_and_trigrams) == expected_doc1
    assert _collocation_document(
        sample_data[1][1], bigrams_and_trigrams) == expected_doc2


def test_ngrams():
    """Check the first two items produced by ``ngrams`` for ``sample_data``.

    ``ngrams`` is called with ``freq_bounds=[(2, 100), (2, 100)]`` and its
    result is consumed with ``next()``; each item is expected to be an
    ``(id, tokens)`` tuple for ``doc1`` and then ``doc2``.
    """
    freq_bounds = [(2, 100), (2, 100)]
    tokenized_corpora = ngrams(sample_data, freq_bounds=freq_bounds)

    # NOTE(review): presumably guards against ngrams mutating the caller's
    # list; as written it only checks the list length right after the call --
    # confirm the intent.
    assert len(freq_bounds) == 2

    expected_doc1 = (
        'doc1',
        [
            u'frank_swank', u'tank', u'walked', u'sassy', u'unicorn',
            u'brony', u'prancercise', u'class', u'daily', u'prancercise',
            u'tremendously', u'popular', u'pastime', u'sassy_unicorns',
            u'retirees', u'alike',
        ],
    )
    expected_doc2 = (
        'doc2',
        [
            u'prancercise', u'form', u'art', u'fitniss', u'originally',
            u'invented', u'sassy_unicorns', u'recently', u'popularized',
            u'retired', u'celebrities', u'frank_swank', u'tank',
        ],
    )

    assert next(tokenized_corpora) == expected_doc1
    assert next(tokenized_corpora) == expected_doc2