# Source code for topik.models.model_base

from abc import ABCMeta, abstractmethod
from collections import Counter
import logging

import pandas as pd
from six import with_metaclass

# doctest-only imports
from topik.readers import read_input
from topik.tests import test_data_path
from topik.intermediaries.persistence import Persistor

registered_models = {}

def register_model(cls):
    """Decorator function to register new model with global registry of models"""
    global registered_models
    if cls.__name__ not in registered_models:
        registered_models[cls.__name__] = cls
    return cls

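# A minimal sketch of how the registry is meant to be used: decorating a
# TopicModelBase subclass stores it in ``registered_models`` under its class
# name, so load_model can rebuild it later.  ``MyModel`` is a hypothetical
# name for illustration, not part of topik:
#
#     @register_model
#     class MyModel(TopicModelBase):
#         ...
#
#     registered_models["MyModel"] is MyModel  # -> True
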
class TopicModelBase(with_metaclass(ABCMeta)):
    """Abstract base class for topic models.

    Ensures consistent interface across models, for base result display capabilities.

    Attributes
    ----------
    _corpus : topik.intermediaries.digested_document_collection.DigestedDocumentCollection-derived object
        The input data for this model
    _persistor : topik.intermediaries.persistor.Persistor object
        The object responsible for persisting the state of this model to disk.
        Persistor saves metadata that instructs load_model how to load the actual data.
    """
    _corpus = None

    @abstractmethod
    def get_top_words(self, topn):
        """Abstract method.  Implementations should collect the top n words per topic,
        translating indices/ids to words.

        Returns
        -------
        list of lists of tuples:
            * outer list: topics
            * inner lists: length-topn collections of (weight, word) tuples
        """
        raise NotImplementedError
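
    # For example (illustrative numbers only), a two-topic model queried with
    # topn=2 might return:
    #
    #     [[(0.005, "nm"), (0.004, "high")],          # topic 0
    #      [(0.005, "properties"), (0.004, "size")]]  # topic 1
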
    @abstractmethod
    def save(self, filename, saved_data):
        """Abstract method.  Persist the model metadata and data to disk.

        Implementations should both save their important data to disk with some known keyword
        (perhaps as filename or server address details), and pass a dictionary to saved_data.
        The contents of this dictionary will be passed to the class's constructor as **kwargs.

        Be sure to either call super(YourClass, self).save(filename, saved_data) or otherwise
        duplicate the base level of functionality here.

        Parameters
        ----------
        filename : str
            The filename of the JSON file to be saved, containing model and corpus metadata
            that allow for reconstruction
        saved_data : dict
            Dictionary of metadata that will be fed to the class __init__ method at load time.
            This should include such things as the number of topics modeled, binary filenames,
            and any other relevant model parameters needed to recreate your current model.
        """
        self._persistor.store_model(self.get_model_name_with_parameters(),
                                    {"class": self.__class__.__name__,
                                     "saved_data": saved_data})
        self._corpus.save(filename)
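
    # A sketch of the contract described above, assuming a hypothetical
    # subclass whose trained state lives in a single binary file; the
    # ``_write_binary`` helper and ``ntopics`` attribute are illustrative,
    # not topik API:
    #
    #     def save(self, filename, saved_data=None):
    #         binary_filename = filename + ".model"
    #         self._write_binary(binary_filename)  # hypothetical helper
    #         saved_data = {"binary_filename": binary_filename,
    #                       "ntopics": self.ntopics}
    #         super(MyModel, self).save(filename, saved_data)
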
    @abstractmethod
    def get_model_name_with_parameters(self):
        """Abstract method.  Primarily an internal function, used to name configurations
        in persisted metadata for later retrieval."""
        raise NotImplementedError
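
    # Implementations typically encode the model class and its parameters in
    # the returned string, e.g. something like "LDA_3_topics" (a hypothetical
    # format, for illustration), so that multiple configurations can coexist
    # in one metadata file.
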
    def _get_term_data(self):
        vocab = self._get_vocab()
        tf = self._get_term_frequency()
        ttd = self._get_topic_term_dists()
        # Assemble one dataframe with a row per term: topic-term weights,
        # plus term frequency and the term string itself.
        term_data_df = ttd
        term_data_df['term_frequency'] = tf
        term_data_df['term'] = vocab
        return term_data_df

    def _get_vocab(self):
        return pd.Series(dict(self._corpus._dict.items()))

    def _get_term_frequency(self):
        # Corpus-wide term frequencies, accumulated across all documents.
        tf = Counter()
        for doc in self._corpus:
            tf.update(dict(doc))
        # TODO update term documents in intermediate store
        return pd.Series(dict(tf))

    def _get_doc_data(self):
        doc_data_df = self._get_doc_topic_dists()
        doc_data_df['doc_length'] = self._get_doc_lengths()
        return doc_data_df

    def _get_doc_lengths(self):
        id_index, doc_lengths = zip(*[(id, len(doc))
                                      for id, doc in list(self._corpus._corpus)])
        return pd.Series(doc_lengths, index=id_index)

    @abstractmethod
    def _get_topic_term_dists(self):
        raise NotImplementedError

    @abstractmethod
    def _get_doc_topic_dists(self):
        raise NotImplementedError

    def to_py_lda_vis(self):
        doc_data_df = self._get_doc_data()
        term_data_df = self._get_term_data()

        model_lda_vis_data = {'vocab': term_data_df['term'],
                              'term_frequency': term_data_df['term_frequency'],
                              'topic_term_dists': term_data_df.iloc[:, :-2].T,
                              'doc_topic_dists': doc_data_df.iloc[:, :-1],
                              'doc_lengths': doc_data_df['doc_length']}
        return model_lda_vis_data
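
    # The returned dict's keys match the keyword arguments of
    # pyLDAvis.prepare, so (assuming pyLDAvis is installed) visualization is
    # roughly:
    #
    #     import pyLDAvis
    #     vis_data = pyLDAvis.prepare(**model.to_py_lda_vis())
    #     pyLDAvis.show(vis_data)
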
    def termite_data(self, topn_words=15):
        """Generate the pandas dataframe input for the termite plot.

        Parameters
        ----------
        topn_words : int
            number of words to include from each topic

        Examples
        --------
        >>> raw_data = read_input('{}/test_data_json_stream.json'.format(test_data_path), "abstract")
        >>> processed_data = raw_data.tokenize()  # tokenize returns a DigestedDocumentCollection
        >>> # must set seed so that we get same topics each run
        >>> import random
        >>> import numpy
        >>> random.seed(42)
        >>> numpy.random.seed(42)
        >>> model = registered_models["LDA"](processed_data, ntopics=3)
        >>> model.termite_data(5)
            topic    weight         word
        0       0  0.005337           nm
        1       0  0.005193         high
        2       0  0.004622        films
        3       0  0.004457       matrix
        4       0  0.004194     electron
        5       1  0.005109   properties
        6       1  0.004654         size
        7       1  0.004539  temperature
        8       1  0.004499           nm
        9       1  0.004248   mechanical
        10      2  0.007994         high
        11      2  0.006458           nm
        12      2  0.005717         size
        13      2  0.005399    materials
        14      2  0.004734        phase
        """
        from itertools import chain
        return pd.DataFrame(list(chain.from_iterable(
            [{"topic": topic_id, "weight": weight, "word": word}
             for (weight, word) in topic]
            for topic_id, topic in enumerate(self.get_top_words(topn_words)))))

    @property
    def _persistor(self):
        return self._corpus.persistor


def load_model(filename, model_name):
    """Loads a JSON file containing instructions on how to load model data.

    Returns
    -------
    TopicModelBase-derived object
    """
    p = Persistor(filename)
    if model_name in p.list_available_models():
        data_dict = p.get_model_details(model_name)
        model = registered_models[data_dict['class']](**data_dict["saved_data"])
    else:
        raise NameError("Model name {} has not yet been created.".format(model_name))
    return model
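
# A hypothetical round trip, assuming a registered "LDA" model as in
# termite_data's doctest above (the filename is illustrative):
#
#     model.save("my_model.json", saved_data={...})
#     reloaded = load_model("my_model.json",
#                           model.get_model_name_with_parameters())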