# Source code for pyensae.filehelper.content_helper
"""
Various functions to process text
:githublink:`%|py|5`
"""
import os
import re
import chardet
def replace_comma_by_point(file, encoding=None):
    """
    Replaces all commas by points in a file (the file is modified in place).

    :param file: file to process
    :param encoding: encoding used to read and to write the file,
        *None* (default) lets :func:`open` pick the platform default

    The whole content is loaded in memory. The file is opened
    without any newline translation so that line endings (LF, CRLF)
    are written back exactly as they were read: only commas change.
    :githublink:`%|py|16`
    """
    # newline="" disables newline translation in both directions; without it
    # a CRLF file would be silently rewritten with LF on POSIX systems.
    with open(file, "r", encoding=encoding, newline="") as f:
        text = f.read()
    with open(file, "w", encoding=encoding, newline="") as f:
        f.write(text.replace(",", "."))
def file_head(filename: str, nbline=10, encoding="utf8", errors="strict"):
    """
    Extracts the first *nbline* lines of a file (assuming it is a text file).

    :param filename: filename, or any iterable of lines such as an opened
        text file (*encoding* and *errors* are then ignored)
    :param nbline: maximum number of lines to return,
        zero or a negative value returns an empty list
    :param encoding: encoding
    :param errors: see `open <https://docs.python.org/3/library/functions.html#open>`_
    :return: list of lines, each one as produced by the iteration
        (a text file keeps the trailing newline character)
    :raises FileNotFoundError: if *filename* is a string which does not
        point to an existing file
    :githublink:`%|py|33`
    """
    if isinstance(filename, str):
        if not os.path.exists(filename):
            raise FileNotFoundError(filename)
        if not os.path.isfile(filename):
            raise FileNotFoundError(  # pragma: no cover
                "'{0}' is not a file".format(filename))
        with open(filename, "r", encoding=encoding, errors=errors) as f:
            return file_head(f, nbline=nbline, encoding=encoding,
                             errors=errors)

    rows = []
    # The bound must be checked before the first line is consumed,
    # otherwise nbline=0 would still return one line.
    if nbline <= 0:
        return rows
    for line in filename:
        rows.append(line)
        if len(rows) >= nbline:
            break
    return rows
def file_tail(filename: str, nbline=10, encoding="utf8", threshold=2 ** 14, errors="strict"):
    """
    Extracts the last *nbline* lines of a file (assuming it is a text file).

    :param filename: filename
    :param nbline: maximum number of lines to return,
        zero or a negative value returns an empty list
    :param encoding: encoding
    :param threshold: if the file size (in bytes) is not below this value,
        only the last *threshold* bytes of the file are read
    :param errors: see `open <https://docs.python.org/3/library/functions.html#open>`_
    :return: list of lines, the newline character is kept; the last line
        has none if the file does not end with one
    :raises FileNotFoundError: if *filename* does not point to an existing file

    The line marked as *A* has an issue because the cursor
    could fall on a byte in the middle of a character
    if the file is encoded with a multi-byte encoding such as utf-8.
    Reading then fails. That's why the function tries again
    after moving the cursor back by one, two and three bytes (see line B),
    which is enough to reach the first byte of any utf-8 character.
    The first returned line may be incomplete.
    :githublink:`%|py|69`
    """
    if not os.path.exists(filename):
        raise FileNotFoundError(filename)  # pragma: no cover
    if not os.path.isfile(filename):
        raise FileNotFoundError(  # pragma: no cover
            "'{0}' is not a file".format(filename))
    if nbline <= 0:
        # rows[-0:] would be the whole list, not an empty one.
        return []

    size = os.stat(filename).st_size
    if size < threshold:
        with open(filename, "r", encoding=encoding, errors=errors) as f:
            rows = f.readlines()
        return rows[-nbline:]

    content = None
    failure = None
    with open(filename, "r", encoding=encoding, errors=errors) as f:
        # A utf-8 character is at most 4 bytes long: stepping back up to
        # 3 bytes always reaches its first byte. Never seek before offset 0.
        for shift in range(min(4, size - threshold + 1)):
            f.seek(size - threshold - shift)  # line A (shift == 0), line B
            try:
                content = f.read()
                break
            except UnicodeDecodeError as exc:
                failure = exc
    if content is None:
        raise failure

    # Rebuild the lines the way readlines does: every piece followed by
    # a separator gets its newline back, a trailing empty piece only means
    # the content ended with a newline and is not a line on its own.
    pieces = content.split("\n")
    rows = [piece + "\n" for piece in pieces[:-1]]
    if pieces[-1]:
        rows.append(pieces[-1])
    return rows[-nbline:]
def enumerate_grep(filename, regex, encoding="utf8", errors=None):
    """
    Extracts lines matching a regular expression.

    :param filename: filename, or any iterable of lines such as an opened
        text file (*encoding* and *errors* are then ignored)
    :param regex: regular expression, a line is kept if the expression
        is found anywhere in it (:func:`re.search` semantic)
    :param encoding: encoding
    :param errors: see `open <https://docs.python.org/3/library/functions.html#open>`_
    :return: iterator on the matching lines, returned unchanged
    :raises FileNotFoundError: if *filename* is a string which does not
        point to an existing file

    The function is a generator: nothing happens, not even
    the verification that the file exists, before the first
    element is requested.

    .. versionadded:: 1.1
    :githublink:`%|py|106`
    """
    if isinstance(filename, str):
        if not os.path.exists(filename):
            raise FileNotFoundError(filename)  # pragma: no cover
        if not os.path.isfile(filename):
            raise FileNotFoundError(  # pragma: no cover
                "'{0}' is not a file".format(filename))
        # The file stays opened as long as the generator is being consumed.
        with open(filename, "r", encoding=encoding, errors=errors) as f:
            yield from enumerate_grep(f, regex, encoding, errors)
    else:
        reg = re.compile(regex)
        yield from filter(reg.search, filename)
def file_encoding(filename_or_bytes, limit=2**20):
    """
    Returns the encoding of a file.
    The function relies on `chardet <http://chardet.readthedocs.io/en/latest/usage.html>`_.

    :param filename_or_bytes: filename, or the content itself
        as *bytes* or *bytearray*
    :param limit: if *filename_or_bytes* is a file, the function only
        loads the first *limit* bytes (or all if *limit* is -1)
    :return: dictionary
    :raises FileNotFoundError: if *filename_or_bytes* is a string which
        does not point to an existing file
    :raises TypeError: if *filename_or_bytes* is neither a string
        nor a bytes-like content

    Example of results:

    ::

        {'encoding': 'EUC-JP', 'confidence': 0.99}
    :githublink:`%|py|138`
    """
    if isinstance(filename_or_bytes, str):
        if not os.path.exists(filename_or_bytes):
            raise FileNotFoundError(filename_or_bytes)
        if not os.path.isfile(filename_or_bytes):
            raise FileNotFoundError(
                "'{0}' is not a file".format(filename_or_bytes))
        with open(filename_or_bytes, "rb") as f:
            # read(-1) returns the whole file and read(n) stops at the end
            # of the file, no need to compare *limit* with the file size.
            content = f.read(limit)
        return file_encoding(content)
    if isinstance(filename_or_bytes, (bytes, bytearray)):
        # bytes(...) leaves a bytes object as is and converts a bytearray.
        return chardet.detect(bytes(filename_or_bytes))
    raise TypeError("Unexpecting type for filename_or_bytes, got: {0}.".format(
        type(filename_or_bytes)))