# Source code for topik.vectorizers.vectorizer_output

from collections import Counter
import itertools

def _accumulate_terms(tokenized_corpus):
    global_terms=set()
    document_term_counts = {}
    doc_lengths = {}
    global_term_frequency_counter = Counter()
    for doc_id, doc in tokenized_corpus:
        doc_terms = set(doc)
        global_terms.update(doc_terms)
        doc_lengths[doc_id] = len(doc)
        document_term_counts[doc_id] = len(doc_terms)
        global_term_frequency_counter.update(doc)
    id_term_map = {}
    global_term_frequency = {}
    for term_id, term in enumerate(global_terms):
        id_term_map[term_id] = term
        global_term_frequency[term_id] = global_term_frequency_counter[term]

    return id_term_map, document_term_counts, doc_lengths, global_term_frequency


class VectorizerOutput(object):
    """Container for a vectorized corpus and its vocabulary statistics.

    An instance can be built in one of two ways:

    * from ``tokenized_corpus`` and ``vectorizer_func`` (with no
      ``vectors``): the vocabulary statistics are accumulated from the
      corpus, then ``vectorizer_func(corpus_iterator, self)`` is called to
      produce the vectors;
    * from precomputed ``id_term_map``, ``document_term_counts``,
      ``doc_lengths``, ``term_frequency`` and ``vectors``, which are
      stored as given.

    Args:
        tokenized_corpus: iterable of ``(doc_id, doc)`` pairs.
        vectorizer_func: callable taking ``(tokenized_corpus, output)``,
            where ``output`` is this partially initialised instance; its
            return value is stored as the vectors.
        id_term_map: mapping of term id to term.
        document_term_counts: per-document term counts keyed by doc id.
        doc_lengths: per-document lengths keyed by doc id.
        term_frequency: corpus-wide term frequencies keyed by term id.
        vectors: object exposing ``items()`` that yields
            ``(doc_id, vector)`` pairs and supporting ``len()``.

    Raises:
        ValueError: if neither complete set of arguments is supplied.
            Arguments are tested for truthiness, so empty containers count
            as "not supplied".
    """

    def __init__(self, tokenized_corpus=None, vectorizer_func=None,
                 id_term_map=None, document_term_counts=None,
                 doc_lengths=None, term_frequency=None, vectors=None):
        if tokenized_corpus and vectorizer_func and not vectors:
            # The corpus is consumed twice: once for the statistics and
            # once by the vectorizer.  The first copy is exhausted before
            # the second is touched, so tee() buffers the whole corpus.
            stats_iter, vector_iter = itertools.tee(tokenized_corpus)
            (self._id_term_map,
             self._document_term_counts,
             self._doc_lengths,
             self._term_frequency) = _accumulate_terms(stats_iter)
            self._term_id_map = {
                term: term_id
                for term_id, term in self._id_term_map.items()
            }
            # The maps above are in place before the vectorizer runs
            # because it receives this instance as its second argument.
            self._vectors = vectorizer_func(vector_iter, self)
        elif (id_term_map and document_term_counts and doc_lengths and
                term_frequency and vectors):
            self._id_term_map = id_term_map
            self._term_id_map = {
                term: term_id
                for term_id, term in self._id_term_map.items()
            }
            self._document_term_counts = document_term_counts
            self._doc_lengths = doc_lengths
            self._term_frequency = term_frequency
            self._vectors = vectors
        else:
            raise ValueError(
                "Must provide either tokenized corpora and vectorizer func, "
                "or global term collection, document term counts, and vectors.")

    def get_vectors(self):
        """Yield ``(doc_id, vector)`` pairs from the stored vectors."""
        for doc_id, vector in self._vectors.items():
            yield doc_id, vector

    def __len__(self):
        """Return the number of stored vectors."""
        return len(self._vectors)

    @property
    def id_term_map(self):
        """Mapping of term id to term."""
        return self._id_term_map

    @property
    def term_id_map(self):
        """Mapping of term to term id (the inverse of ``id_term_map``)."""
        return self._term_id_map

    @property
    def global_term_count(self):
        """Number of entries in ``id_term_map``."""
        return len(self.id_term_map)

    @property
    def document_term_counts(self):
        """Per-document term counts keyed by doc id."""
        return self._document_term_counts

    @property
    def doc_lengths(self):
        """Per-document lengths keyed by doc id."""
        return self._doc_lengths

    @property
    def term_frequency(self):
        """Corpus-wide term frequencies keyed by term id."""
        return self._term_frequency

    @property
    def vectors(self):
        """The stored vectors object."""
        return self._vectors