# Source code for topik.models.lda

from __future__ import absolute_import

import os
import gensim
import pandas as pd

from topik.intermediaries.digested_document_collection import DigestedDocumentCollection
from topik.intermediaries.raw_data import load_persisted_corpus
from .model_base import TopicModelBase, register_model

# Doctest imports
from topik.readers import read_input
from topik.tests import test_data_path


@register_model
class LDA(TopicModelBase):
    """A high-level interface for an LDA (Latent Dirichlet Allocation) model.

    The model is either trained from ``corpus_input`` or restored from disk
    when both ``load_filename`` and ``binary_filename`` are given.


    Parameters
    ----------
    corpus_input : CorpusBase-derived object
        object fulfilling basic Corpus interface (preprocessed, tokenized text).
        see topik.intermediaries.tokenized_corpus for more info.
    ntopics : int
        Number of topics to model
    load_filename : None or str
        If not None, this (JSON) file is read to determine parameters of the model persisted to disk.
    binary_filename : None or str
        If not None, this file is loaded by Gensim to bring a disk-persisted model back into memory.
    **kwargs
        Extra keyword arguments forwarded unchanged to ``gensim.models.LdaModel``
        when training from ``corpus_input``.


    Raises
    ------
    ValueError
        If neither ``corpus_input`` nor both of ``load_filename`` and
        ``binary_filename`` are provided.


    Attributes
    ----------
    corpus : CorpusBase-derived object, tokenized
    model : Gensim LdaModel instance


    Examples
    --------
    >>> raw_data = read_input('{}/test_data_json_stream.json'.format(test_data_path), "abstract")
    >>> processed_data = raw_data.tokenize()  # tokenize returns a DigestedDocumentCollection
    >>> model = LDA(processed_data, ntopics=3)

    """
    def __init__(self, corpus_input=None, ntopics=10, load_filename=None, binary_filename=None, **kwargs):
        if corpus_input is not None:
            # the minimum_probability=0 argument is necessary in order for
            # gensim to return the full document-topic-distribution matrix.  If
            # this argument is omitted and left to the gensim default of 0.01,
            # then all document-topic weights below that threshold will be
            # missing (NaN in the resulting matrix), violating the subsequent
            # LDAvis assumption that all rows (documents) in the
            # document-topic-distribution matrix sum to 1.
            self._model = gensim.models.LdaModel(list(corpus_input), num_topics=ntopics,
                                                 id2word=corpus_input.get_id2word_dict(),
                                                 minimum_probability=0, **kwargs)
            self._corpus = corpus_input
        elif load_filename is not None and binary_filename is not None:
            self._model = gensim.models.LdaModel.load(binary_filename)
            self._corpus = DigestedDocumentCollection(load_persisted_corpus(load_filename))
        else:
            # Fail here rather than leaving _model/_corpus unset, which would
            # only surface later as an AttributeError in an unrelated method.
            raise ValueError("LDA requires either corpus_input, or both "
                             "load_filename and binary_filename.")

    def save(self, filename):
        """Persist the gensim model and record how to reload it.

        The gensim model is written to the file named by
        ``get_model_name_with_parameters()``; that name and ``filename`` are
        then handed to the base class ``save``.

        Parameters
        ----------
        filename : str
            Passed to the base class ``save`` and stored as ``load_filename``.

        Returns
        -------
        Whatever the base class ``save`` returns.
        """
        # Compute the name once so the file written and the file recorded in
        # the metadata are guaranteed to be the same.
        binary_filename = self.get_model_name_with_parameters()
        self._model.save(binary_filename)
        saved_data = {"load_filename": filename, "binary_filename": binary_filename}
        return super(LDA, self).save(filename, saved_data)

    def get_top_words(self, topn):
        """Return the ``topn`` highest-weighted words of every topic.

        Parameters
        ----------
        topn : int
            Number of words requested from gensim per topic.

        Returns
        -------
        list
            One entry per topic (in topic-number order), each being the
            result of gensim's ``show_topic(topicno, topn)``.
        """
        return [self._model.show_topic(topicno, topn)
                for topicno in range(self._model.num_topics)]

    def get_model_name_with_parameters(self):
        """Return a name built from the topic count and the corpus filter string."""
        return "LDA_{}_topics{}".format(self._model.num_topics, self._corpus.filter_string)

    def _get_topic_term_dists(self):
        """Build the term-by-topic weight table.

        Returns
        -------
        pandas.DataFrame
            One ``topic<N>dist`` column per topic, indexed by ``term_id``
            (looked up from the corpus dictionary's ``token2id`` mapping).
        """
        per_topic = []
        for topic_no in range(self._model.num_topics):
            dist_col = 'topic' + str(topic_no) + 'dist'
            # NOTE(review): the column order assumes show_topic yields
            # (weight, token) pairs -- confirm against the gensim version in use.
            topic_df = pd.DataFrame.from_records(self._model.show_topic(topic_no, None),
                                                 columns=[dist_col, 'token'],
                                                 index='token')
            per_topic.append(topic_df[dist_col])
        # Each Series is one topic row keyed by token; transpose to tokens x topics.
        term_topic_df = pd.DataFrame(per_topic).T
        # Assigning a token-indexed Series aligns term ids with the token index.
        term_topic_df['term_id'] = pd.Series(dict(self._corpus._dict.token2id))
        term_topic_df = term_topic_df.set_index('term_id')
        return term_topic_df

    def _get_doc_topic_dists(self):
        """Build the document-by-topic weight table.

        Returns
        -------
        pandas.DataFrame
            One row per document (index named ``doc_id``) and one
            ``topic<N>dist`` column per model topic.  A topic that gensim
            does not report for a document is left as NaN.
        """
        doc_ids = []
        bow_corpus = []
        for doc_id, doc_tokens in self._corpus._corpus:
            doc_ids.append(doc_id)
            bow_corpus.append(self._corpus._dict.doc2bow(doc_tokens))

        num_topics = self._model.num_topics
        columns = ['topic' + str(i) + 'dist' for i in range(num_topics)]

        rows = []
        if bow_corpus:
            for doc in self._model[bow_corpus]:
                # Place each weight by its topic id rather than by position, so
                # a topic omitted by gensim cannot shift later weights into the
                # wrong column.
                row = [float('nan')] * num_topics
                for topic_id, weight in doc:
                    row[topic_id] = weight
                rows.append(row)

        doc_topic_df = pd.DataFrame(rows, index=doc_ids, columns=columns)
        doc_topic_df.index.name = 'doc_id'
        return doc_topic_df