Source code for topik.fileio.in_document_folder

import os
import logging
import gzip

from topik.fileio._registry import register_input
from topik.fileio.tests import test_data_path

@register_input
def read_document_folder(folder, content_field='text'):
    """Iterate over the files in a folder to retrieve the content to process
    and tokenize.

    The folder is walked recursively. Files whose name ends in ``.gz`` are
    opened with ``gzip``; all other files are opened directly. Every file is
    read as bytes and decoded as UTF-8.

    Parameters
    ----------
    folder : str
        The folder containing the files you want to analyze.
    content_field : str
        The usage of 'content_field' in this source is different from most
        other sources. The assumption in this source is that each file
        contains raw text, NOT dictionaries of categorized data. The
        content_field argument here specifies what key to store the raw text
        under in the returned dictionary for each document.

    Yields
    ------
    dict
        One dictionary per readable file, holding the decoded text under
        ``content_field`` and the path of the file under ``'filename'``.
        Directories are visited in sorted order and, within a directory,
        files are yielded in sorted order.

    Raises
    ------
    IOError
        If ``folder`` does not exist. Because this function is a generator,
        the check runs when the first document is requested, not when the
        function is called.

    Notes
    -----
    Files that raise ``ValueError`` (which includes ``UnicodeDecodeError``)
    while being read or decoded are reported through ``logging.warning`` and
    skipped.

    Examples
    --------
    >>> documents = read_document_folder(
    ...     '{}/test_data_folder_files'.format(test_data_path))
    >>> next(documents)['text'] == (
    ...     u"'Interstellar' was incredible. The visuals, the score, " +
    ...     u"the acting, were all amazing. The plot is definitely one " +
    ...     u"of the most original I've seen in a while.")
    True
    """
    if not os.path.exists(folder):
        raise IOError("Folder not found!")

    for directory, subdirectories, files in os.walk(folder):
        # os.walk (top-down) descends into `subdirectories` in whatever order
        # the list holds after this iteration; sorting it in place makes the
        # traversal order independent of the filesystem's listing order.
        subdirectories.sort()
        for filename in sorted(files):
            fullpath = os.path.join(directory, filename)
            opener = gzip.open if filename.endswith('.gz') else open
            try:
                with opener(fullpath, 'rb') as f:
                    content = f.read().decode('utf-8')
            except (ValueError, UnicodeDecodeError) as err:
                logging.warning(
                    "Unable to process file: {}, error: {}".format(
                        fullpath, err))
                continue
            # Yield only after the file is closed and outside the try block,
            # so no handle stays open while the consumer works and errors
            # raised by the consumer are never mistaken for read failures.
            yield {content_field: content, 'filename': fullpath}