# Source code for topik.models.plsa

# -*- coding: utf-8 -*-

import logging

import numpy as np

from .base_model_output import ModelOutput
from ._registry import register

# Configure the root logger when this module is imported: records are
# formatted as "<time> : <level> : <message>" and the threshold is WARNING.
# NOTE(review): logging.basicConfig does nothing if the root logger already
# has handlers, so this only takes effect when nothing configured logging
# earlier in the process.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.WARNING)


def _rand_mat(rows, cols):
    """Return a random ``(rows, cols)`` array whose rows each sum to one.

    Entries are drawn with ``np.random.random`` and every row is then divided
    by its own total.
    """
    matrix = np.random.random((rows, cols))
    # keepdims leaves the totals as a (rows, 1) column so the division
    # broadcasts across each row.
    matrix /= matrix.sum(axis=1, keepdims=True)
    return matrix


def _cal_p_dw(words_in_docs, word_cts_in_docs, topic_array, zw, dz, beta, p_dw):
    """Fill ``p_dw`` in place for the terms present in each document.

    For every ``(row, doc_id, term_ids)`` entry of ``words_in_docs`` the
    product ``zw[topic, term] * dz[row, topic]`` is raised to ``beta``,
    multiplied by the document's term counts and summed over topics; the
    result is stored in ``p_dw[row, term_ids]``.

    ``topic_array`` is accepted but not used here.

    Returns the same ``p_dw`` array that was passed in.
    """
    for row, doc_id, term_ids in words_in_docs:
        # (topics, terms-in-doc): topic-term weight scaled by this
        # document's weight for each topic.
        joint = zw[:, term_ids] * dz[row, :][:, np.newaxis]
        tempered = joint ** beta
        counts = word_cts_in_docs[doc_id]
        p_dw[row, term_ids] = (counts * tempered).sum(axis=0)
    return p_dw


def _e_step(words_in_docs, dw_z, topic_array, zw, dz, beta, p_dw):
    """Update ``dw_z`` in place for the terms present in each document.

    For every ``(row, _, term_ids)`` entry of ``words_in_docs``,
    ``dw_z[row, term, topic]`` becomes
    ``(zw[topic, term] * dz[row, topic]) ** beta / p_dw[row, term]``.

    ``topic_array`` is accepted but not used here.

    Returns the same ``dw_z`` array that was passed in.
    """
    for row, _doc_id, term_ids in words_in_docs:
        # (terms-in-doc, topics) numerator.
        tempered = (zw[:, term_ids].T * dz[row, :]) ** beta
        # Column vector so that each term's row is divided by its own p_dw.
        denom = p_dw[row, term_ids][:, np.newaxis]
        dw_z[row, term_ids, :] = tempered / denom
    return dw_z


def _m_step(words_in_docs, word_cts_in_docs, topic_array, zw, dw_z, dz):
    """Re-estimate ``zw`` and ``dz`` in place from ``dw_z``.

    ``zw`` is zeroed, then for each document the count-weighted values of
    ``dw_z`` are accumulated into the columns of the document's terms; the
    same weighted values summed over terms become that document's row of
    ``dz``. Both matrices are finally divided row-wise by their row sums.

    ``topic_array`` is accepted but not used here.

    Returns the same ``(zw, dz)`` arrays that were passed in.
    """
    zw.fill(0)
    for row, doc_id, term_ids in words_in_docs:
        # (topics, terms-in-doc): dw_z weighted by how often each term
        # occurs in this document.
        weighted = word_cts_in_docs[doc_id] * dw_z[row, term_ids].T
        zw[:, term_ids] += weighted
        dz[row] = weighted.sum(axis=1)
    # Normalise by the sum of each topic's term weights, and by the sum of
    # each document's topic weights.
    zw /= zw.sum(axis=1)[:, np.newaxis]
    dz /= dz.sum(axis=1)[:, np.newaxis]
    return zw, dz


def _cal_likelihood(words_in_docs, word_cts_in_docs, p_dw):
    """Return the count-weighted sum of ``log(p_dw)`` over all documents.

    For each ``(row, doc_id, term_ids)`` entry of ``words_in_docs`` this adds
    ``sum(count * log(p_dw[row, term]))`` over the document's terms, where
    the counts come from ``word_cts_in_docs[doc_id]``.

    The per-document sum is a dot product evaluated inside NumPy, rather than
    the builtin ``sum`` stepping through the array one element at a time in
    Python.
    """
    likelihood = 0.0
    for row, doc_id, term_ids in words_in_docs:
        log_probs = np.log(p_dw[row, term_ids])
        likelihood += np.dot(word_cts_in_docs[doc_id], log_probs)
    return likelihood


def _get_topic_term_matrix(zw, ntopics, id_term_map):
    """Return ``zw`` as a dict mapping ``"topic<N>"`` to that row as a list.

    Rows ``0 .. ntopics - 1`` of ``zw`` are converted with ``tolist()``.
    ``id_term_map`` is accepted but not used here.
    """
    labeled_zw = {}
    for topic_index in range(ntopics):
        label = "topic" + str(topic_index)
        labeled_zw[label] = zw[topic_index].tolist()
    return labeled_zw


def _get_doc_topic_matrix(dz, ntopics, vectorized_corpus):
    """Return ``dz`` as a dict keyed by document id.

    Documents are taken in the order yielded by
    ``vectorized_corpus.get_vectors()``; the i-th document id is mapped to
    row ``i`` of ``dz`` converted with ``tolist()``. ``ntopics`` is accepted
    but not used here.
    """
    labeled_dz = {}
    for position, (doc_id, _vector) in enumerate(vectorized_corpus.get_vectors()):
        labeled_dz[doc_id] = dz[position].tolist()
    return labeled_dz


def _PLSA(vectorized_corpus, ntopics, max_iter):
    """Fit the model by iterating the E and M steps, and return labelled results.

    The topic-word and document-topic matrices start as random row-normalised
    matrices. At most ``max_iter`` rounds are run; iteration stops early once
    the relative change in the likelihood between consecutive rounds drops
    below 1e-8.

    Returns a ``(topic_term_matrix, doc_topic_matrix)`` pair of dicts built
    by ``_get_topic_term_matrix`` and ``_get_doc_topic_matrix``.
    """
    topic_array = np.arange(ntopics, dtype=np.int32)
    n_terms = vectorized_corpus.global_term_count
    n_docs = len(vectorized_corpus)

    # topic-word matrix
    zw = _rand_mat(ntopics, n_terms)
    # document-topic matrix
    dz = _rand_mat(n_docs, ntopics)
    dw_z = np.zeros((n_docs, n_terms, ntopics))
    p_dw = np.zeros((n_docs, n_terms))
    beta = 0.8

    # (row index, document id, term ids) for every document in the corpus.
    words_in_docs = []
    for row, (doc_id, doc) in enumerate(vectorized_corpus.get_vectors()):
        term_ids = [word_id for word_id, _ in doc.items()]
        words_in_docs.append((row, doc_id, term_ids))

    # document id -> term counts, in the same order as the term ids above.
    word_cts_in_docs = {}
    for doc_id, doc in vectorized_corpus.get_vectors():
        word_cts_in_docs[doc_id] = [ct for _, ct in doc.items()]

    previous = 0
    for _ in range(max_iter):
        p_dw = _cal_p_dw(words_in_docs, word_cts_in_docs, topic_array, zw, dz, beta, p_dw)
        dw_z = _e_step(words_in_docs, dw_z, topic_array, zw, dz, beta, p_dw)
        zw, dz = _m_step(words_in_docs, word_cts_in_docs, topic_array, zw, dw_z, dz)
        likelihood = _cal_likelihood(words_in_docs, word_cts_in_docs, p_dw)
        # The zero check skips the comparison while there is no previous
        # value to compare against (and avoids dividing by zero).
        converged = previous != 0 and abs((likelihood - previous) / previous) < 1e-8
        if converged:
            break
        previous = likelihood

    topic_term_matrix = _get_topic_term_matrix(zw, ntopics, vectorized_corpus.id_term_map)
    doc_topic_matrix = _get_doc_topic_matrix(dz, ntopics, vectorized_corpus)
    return topic_term_matrix, doc_topic_matrix

@register
def plsa(vectorized_corpus, ntopics, max_iter=100, **kwargs):
    """Build a ``ModelOutput`` for ``vectorized_corpus`` using the PLSA routine.

    Parameters
    ----------
    vectorized_corpus
        Corpus object forwarded to ``ModelOutput`` (and from there,
        presumably, to ``_PLSA`` -- confirm against ``ModelOutput``).
    ntopics
        Number of topics to fit.
    max_iter
        Upper bound on the number of fitting iterations (default 100).
    **kwargs
        Forwarded unchanged to ``ModelOutput``.

    Returns
    -------
    ModelOutput
        Constructed with ``_PLSA`` as its ``model_func``.
    """
    return ModelOutput(vectorized_corpus=vectorized_corpus, model_func=_PLSA,
                       ntopics=ntopics, max_iter=max_iter, **kwargs)