Source code for pyensae.filehelper.content_helper

"""
Various functions to process text


:githublink:`%|py|5`
"""

import itertools
import os
import re

import chardet


def replace_comma_by_point(file, encoding=None):
    """
    Replaces all commas by points in a file (the file is modified inplace).

    :param file: file to process
    :param encoding: encoding used to read and write the file,
        None means the platform default (same as :func:`open`)

    The file is opened with ``newline=""`` so that line endings
    are written back exactly as they were read: only commas change.
    """
    # newline="" disables newline translation in both directions.
    with open(file, "r", encoding=encoding, newline="") as f:
        text = f.read()
    text = text.replace(",", ".")
    with open(file, "w", encoding=encoding, newline="") as f:
        f.write(text)


def file_head(filename: str, nbline=10, encoding="utf8", errors="strict"):
    """
    Extracts the first *nbline* lines of a file (assuming it is a text file).

    :param filename: filename, or any iterable of lines
        (an opened text file for example)
    :param nbline: number of lines, zero or a negative value
        returns an empty list
    :param encoding: encoding, only used if *filename* is a string
    :param errors: see `open <https://docs.python.org/3/library/functions.html#open>`_,
        only used if *filename* is a string
    :return: list of lines
    :raises FileNotFoundError: if *filename* is a string which
        does not point to an existing file
    """
    if isinstance(filename, str):
        if not os.path.exists(filename):
            raise FileNotFoundError(filename)
        if not os.path.isfile(filename):
            raise FileNotFoundError(  # pragma: no cover
                "'{0}' is not a file".format(filename))
        with open(filename, "r", encoding=encoding, errors=errors) as f:
            return file_head(f, nbline=nbline, encoding=encoding, errors=errors)
    # islice stops after nbline items without reading any further,
    # it rejects negative bounds, hence the clamp to zero.
    return list(itertools.islice(filename, max(nbline, 0)))


def file_tail(filename: str, nbline=10, encoding="utf8", threshold=2 ** 14, errors="strict"):
    """
    Extracts the last *nbline* lines of a file (assuming it is a text file).

    :param filename: filename
    :param nbline: number of lines, zero or a negative value
        returns an empty list
    :param encoding: encoding
    :param threshold: if the file size is above, it will not read the beginning,
        only the last *threshold* bytes are read
    :param errors: see `open <https://docs.python.org/3/library/functions.html#open>`_
    :return: list of lines, each one ends with its end of line
        except maybe the last one
    :raises FileNotFoundError: if *filename* is not an existing file

    For a big file, the cursor is moved to *size - threshold*.
    It could fall on a byte in the middle of a character
    if the file is encoded with a multi-byte encoding such as utf-8.
    Reading then fails. That's why the function tries again
    by moving the cursor backward one byte at a time, up to three times
    (a utf-8 character takes at most four bytes).

    The first returned line may be incomplete.
    """
    if not os.path.exists(filename):
        raise FileNotFoundError(filename)  # pragma: no cover
    if not os.path.isfile(filename):
        raise FileNotFoundError(  # pragma: no cover
            "'{0}' is not a file".format(filename))
    if nbline <= 0:
        # rows[-0:] would return every line.
        return []

    size = os.stat(filename).st_size
    with open(filename, "r", encoding=encoding, errors=errors) as f:
        if size < threshold:
            rows = f.readlines()
        else:
            offset = size - threshold
            # The cursor cannot be moved before the beginning of the file.
            last_try = min(3, offset)
            for step in range(last_try + 1):
                f.seek(offset - step)
                try:
                    rows = f.readlines()
                except UnicodeDecodeError:
                    if step == last_try:
                        raise
                else:
                    break
    return rows[-nbline:]


def enumerate_grep(filename, regex, encoding="utf8", errors=None):
    """
    Extracts lines matching a regular expression.

    :param      filename:        filename, or any iterable of lines
                                 (an opened text file for example)
    :param      regex:           regular expression, a line is kept
                                 if ``re.search`` finds a match in it
    :param      encoding:        encoding, only used if *filename* is a string
    :param      errors:          see `open <https://docs.python.org/3/library/functions.html#open>`_,
                                 only used if *filename* is a string
    :return:                     iterator on lines
    :raises FileNotFoundError:   if *filename* is a string which does not point
                                 to an existing file, the function is a generator,
                                 the exception is raised when the iteration starts

    .. versionadded:: 1.1
    """
    if isinstance(filename, str):
        if not os.path.exists(filename):
            raise FileNotFoundError(filename)  # pragma: no cover
        if not os.path.isfile(filename):
            raise FileNotFoundError(  # pragma: no cover
                "'{0}' is not a file".format(filename))
        with open(filename, "r", encoding=encoding, errors=errors) as f:
            yield from enumerate_grep(f, regex, encoding=encoding, errors=errors)
    else:
        reg = re.compile(regex)
        # A match object is always true, None is returned otherwise.
        yield from filter(reg.search, filename)


def file_encoding(filename_or_bytes, limit=2**20):
    """
    Returns the encoding of a file.
    The function relies on `chardet <http://chardet.readthedocs.io/en/latest/usage.html>`_.

    :param filename_or_bytes: filename or bytes
    :param limit: if *filename_or_bytes* is a file, the function only
        loads the first *limit* bytes (or all if *limit* is -1)
    :return: dictionary
    :raises FileNotFoundError: if *filename_or_bytes* is a string which
        does not point to an existing file
    :raises TypeError: if *filename_or_bytes* is neither a string nor bytes

    Example of results:

    ::

        {'encoding': 'EUC-JP', 'confidence': 0.99}
    """
    if isinstance(filename_or_bytes, str):
        if not os.path.exists(filename_or_bytes):
            raise FileNotFoundError(filename_or_bytes)
        if not os.path.isfile(filename_or_bytes):
            raise FileNotFoundError(
                "'{0}' is not a file".format(filename_or_bytes))
        with open(filename_or_bytes, "rb") as f:
            # read(-1) reads the whole file, read(limit) stops at the end
            # of the file if it is shorter: no need to know the size.
            content = f.read(limit)
        return file_encoding(content)
    if isinstance(filename_or_bytes, bytes):
        return chardet.detect(filename_or_bytes)
    raise TypeError("Unexpecting type for filename_or_bytes, got: {0}.".format(
        type(filename_or_bytes)))
Source code for pyensae.filehelper.content_helper

Links

Contents

Information

Related Topics