Source code for topik.fileio.reader

import os

from topik.fileio._registry import registered_inputs
from topik.fileio.tests import test_data_path
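# Note: ``registered_inputs`` supports plain string-key lookup of reader
# functions (e.g. registered_inputs["read_json_stream"]), as used in the
# dispatch below.  It is assumed to be a dict-like registry populated by
# topik.fileio._registry when the individual reader modules are imported.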

# This function is the primary API for reading input data; it dispatches to
# the registered input readers.
def read_input(source, source_type="auto", folder_content_field='text',
               **kwargs):
    """Read data from the given source into Topik's internal data structures.

    Parameters
    ----------
    source : str
        Input data.  Can be a file path, directory, or server address.
    source_type : str
        "auto" tries to figure out the data type of source.  It can be
        manually specified instead.  Options for manual specification are
        ['solr', 'elastic', 'json_stream', 'large_json', 'folder'].
    folder_content_field : str
        Only used for the 'folder' (document_folder) source.  This argument
        is used as the key (field name); each document's content is stored
        as the value of that field.
    kwargs : any other arguments to pass to the input parsers

    Returns
    -------
    iterable output object
        >> ids, texts = zip(*list(iter(raw_data)))

    Examples
    --------
    >>> loaded_corpus = read_input(
    ...     '{}/test_data_json_stream.json'.format(test_data_path))
    >>> solution_text = (
    ...     u'Transition metal oxides are being considered as the next generation '+
    ...     u'materials in field such as electronics and advanced catalysts; '+
    ...     u'between them is Tantalum (V) Oxide; however, there are few reports '+
    ...     u'for the synthesis of this material at the nanometer size which could '+
    ...     u'have unusual properties. Hence, in this work we present the '+
    ...     u'synthesis of Ta2O5 nanorods by sol gel method using DNA as structure '+
    ...     u'directing agent, the size of the nanorods was of the order of 40 to '+
    ...     u'100 nm in diameter and several microns in length; this easy method '+
    ...     u'can be useful in the preparation of nanomaterials for electronics, '+
    ...     u'biomedical applications as well as catalysts.')
    >>> solution_text == next(loaded_corpus)['abstract']
    True
    """
    json_extensions = [".js", ".json"]
    # web addresses default to elasticsearch (port 9200 is the
    # Elasticsearch default)
    if (source_type == "auto" and "9200" in source) or source_type == "elastic":
        data_iterator = registered_inputs["read_elastic"](source, **kwargs)
    # files must end in .js/.json.  Try the json_stream parser first and
    # fall back to the large_json parser; fail otherwise.
    elif ((source_type == "auto" and
           os.path.splitext(source)[1] in json_extensions) or
          source_type == "json_stream"):
        try:
            data_iterator = registered_inputs["read_json_stream"](source,
                                                                  **kwargs)
            # pull the first element to verify that the file parses as a
            # stream of JSON objects; a ValueError means it is actually a
            # single large JSON document.
            next(data_iterator)
            # recreate the iterator after this check so that it starts at
            # document 0 rather than document 1
            data_iterator = registered_inputs["read_json_stream"](source,
                                                                  **kwargs)
        except ValueError:
            data_iterator = registered_inputs["read_large_json"](source,
                                                                 **kwargs)
    elif source_type == "large_json":
        data_iterator = registered_inputs["read_large_json"](source, **kwargs)
    # folder paths are plain strings that don't end in a file extension,
    # or that end in a /
    elif ((source_type == "auto" and os.path.splitext(source)[1] == "") or
          source_type == "folder"):
        data_iterator = registered_inputs["read_document_folder"](
            source, content_field=folder_content_field)
    else:
        raise ValueError("Unrecognized source: {}.  Please either manually "
                         "specify source_type, or convert your input to a "
                         "supported type.".format(source))
    return data_iterator
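
# The sketch below is a minimal usage example, not part of the module: it
# assumes the test data file referenced in the doctest above is present, and
# that each yielded document is a plain dict, as next(loaded_corpus)['abstract']
# in the doctest suggests.
if __name__ == "__main__":
    # Let source_type default to "auto" so the extension-based dispatch
    # above selects read_json_stream for a .json file.
    corpus = read_input('{}/test_data_json_stream.json'.format(test_data_path))
    first_doc = next(corpus)
    print(sorted(first_doc.keys()))

    # The same file can be forced through a specific parser by naming it
    # explicitly; "json_stream" skips the auto-detection entirely.
    corpus = read_input('{}/test_data_json_stream.json'.format(test_data_path),
                        source_type="json_stream")
    print(len(list(corpus)))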