Source code for topik.fileio.reader

import os

from topik.fileio._registry import registered_inputs
from topik.fileio.tests import test_data_path
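# Note: ``registered_inputs`` supports plain string-key lookup of reader
# functions (e.g. registered_inputs["read_json_stream"]), as used in the
# dispatch below.  It is assumed to be a dict-like registry populated by
# topik.fileio._registry when the individual reader modules are imported.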

# This function is the primary API for reading input data; it dispatches to
# the registered input readers.
def read_input(source, source_type="auto", folder_content_field='text',
               **kwargs):
    """Read data from the given source into Topik's internal data structures.

    Parameters
    ----------
    source : str
        Input data.  Can be a file path, directory, or server address.
    source_type : str
        "auto" tries to figure out the data type of source.  It can be
        manually specified instead.  Options for manual specification are
        ['solr', 'elastic', 'json_stream', 'large_json', 'folder'].
    folder_content_field : str
        Only used for the 'folder' (document_folder) source.  This argument
        is used as the key (field name); each document's content is stored
        as the value of that field.
    kwargs : any other arguments to pass to the input parsers

    Returns
    -------
    iterable output object
        >> ids, texts = zip(*list(iter(raw_data)))

    Examples
    --------
    >>> loaded_corpus = read_input(
    ...     '{}/test_data_json_stream.json'.format(test_data_path))
    >>> solution_text = (
    ...     u'Transition metal oxides are being considered as the next generation '+
    ...     u'materials in field such as electronics and advanced catalysts; '+
    ...     u'between them is Tantalum (V) Oxide; however, there are few reports '+
    ...     u'for the synthesis of this material at the nanometer size which could '+
    ...     u'have unusual properties. Hence, in this work we present the '+
    ...     u'synthesis of Ta2O5 nanorods by sol gel method using DNA as structure '+
    ...     u'directing agent, the size of the nanorods was of the order of 40 to '+
    ...     u'100 nm in diameter and several microns in length; this easy method '+
    ...     u'can be useful in the preparation of nanomaterials for electronics, '+
    ...     u'biomedical applications as well as catalysts.')
    >>> solution_text == next(loaded_corpus)['abstract']
    True
    """
    json_extensions = [".js", ".json"]
    # web addresses default to elasticsearch (port 9200 is the
    # Elasticsearch default)
    if (source_type == "auto" and "9200" in source) or source_type == "elastic":
        data_iterator = registered_inputs["read_elastic"](source, **kwargs)
    # files must end in .js/.json.  Try the json_stream parser first and
    # fall back to the large_json parser; fail otherwise.
    elif ((source_type == "auto" and
           os.path.splitext(source)[1] in json_extensions) or
          source_type == "json_stream"):
        try:
            data_iterator = registered_inputs["read_json_stream"](source,
                                                                  **kwargs)
            # pull the first element to verify that the file parses as a
            # stream of JSON objects; a ValueError means it is actually a
            # single large JSON document.
            next(data_iterator)
            # recreate the iterator after this check so that it starts at
            # document 0 rather than document 1
            data_iterator = registered_inputs["read_json_stream"](source,
                                                                  **kwargs)
        except ValueError:
            data_iterator = registered_inputs["read_large_json"](source,
                                                                 **kwargs)
    elif source_type == "large_json":
        data_iterator = registered_inputs["read_large_json"](source, **kwargs)
    # folder paths are plain strings that don't end in a file extension,
    # or that end in a /
    elif ((source_type == "auto" and os.path.splitext(source)[1] == "") or
          source_type == "folder"):
        data_iterator = registered_inputs["read_document_folder"](
            source, content_field=folder_content_field)
    else:
        raise ValueError("Unrecognized source: {}.  Please either manually "
                         "specify source_type, or convert your input to a "
                         "supported type.".format(source))
    return data_iterator
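
# The sketch below is a minimal usage example, not part of the module: it
# assumes the test data file referenced in the doctest above is present, and
# that each yielded document is a plain dict, as next(loaded_corpus)['abstract']
# in the doctest suggests.
if __name__ == "__main__":
    # Let source_type default to "auto" so the extension-based dispatch
    # above selects read_json_stream for a .json file.
    corpus = read_input('{}/test_data_json_stream.json'.format(test_data_path))
    first_doc = next(corpus)
    print(sorted(first_doc.keys()))

    # The same file can be forced through a specific parser by naming it
    # explicitly; "json_stream" skips the auto-detection entirely.
    corpus = read_input('{}/test_data_json_stream.json'.format(test_data_path),
                        source_type="json_stream")
    print(len(list(corpus)))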