# Source code for topik.vectorizers.bag_of_words

from collections import Counter
from ._registry import register
from vectorizer_output import VectorizerOutput

def _count_words_in_docs(tokenized_corpora, vectorizer_output):
    """Tally token occurrences per document, keyed by term id.

    Parameters
    ----------
    tokenized_corpora : iterable of (doc_id, tokens) pairs
        ``tokens`` is an iterable of hashable tokens for that document.
    vectorizer_output : object
        Must expose ``term_id_map``, which is indexed with each distinct
        token to obtain the id used as the key in the per-document counts.

    Returns
    -------
    dict
        ``{doc_id: {term_id: count}}``. If the same ``doc_id`` occurs more
        than once, the last occurrence wins.
    """
    return {
        doc_id: {
            vectorizer_output.term_id_map[token]: occurrences
            for token, occurrences in Counter(tokens).items()
        }
        for doc_id, tokens in tokenized_corpora
    }

@register
def bag_of_words(tokenized_corpora):
    """Build a bag-of-words ``VectorizerOutput`` for a tokenized corpus.

    Passes ``tokenized_corpora`` together with the ``_count_words_in_docs``
    counting function to ``VectorizerOutput`` and returns the resulting
    object.

    NOTE(review): ``_count_words_in_docs`` iterates its input as
    ``(id, tokens)`` pairs, so ``tokenized_corpora`` presumably has that
    shape -- confirm against how ``VectorizerOutput`` invokes the callback.
    """
    return VectorizerOutput(tokenized_corpora, _count_words_in_docs)