Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
"""
@file
@brief Various functions to process text.
"""
6import os
7import re
8import chardet
def replace_comma_by_point(file, encoding=None):
    """
    Replaces all commas by points in a file (the file is modified in place).

    :param file: path of the file to process
    :param encoding: encoding used to read and to write the file,
        *None* lets :func:`open` pick its default encoding

    The whole content is loaded in memory, transformed, then written back.
    Line endings are preserved as they are in the original file.
    """
    # newline="" disables newline translation in both directions. Without it,
    # reading converts "\r\n" into "\n" and the rewritten file silently loses
    # its original line endings on platforms where os.linesep is "\n".
    with open(file, "r", encoding=encoding, newline="") as f:
        text = f.read()
    text = text.replace(",", ".")
    with open(file, "w", encoding=encoding, newline="") as f:
        f.write(text)
def file_head(filename: str, nbline=10, encoding="utf8", errors="strict"):
    """
    Extracts the first *nbline* lines of a file (assuming it is a text file).

    :param filename: filename, any other object is considered as an
        iterable over lines (an opened text file for example)
    :param nbline: number of lines, a value of zero or less returns
        an empty list and does not read anything
    :param encoding: encoding (only used if *filename* is a string)
    :param errors: see `open <https://docs.python.org/3/library/functions.html#open>`_
        (only used if *filename* is a string)
    :return: list of lines
    :raises FileNotFoundError: if *filename* is a string which does not
        point to an existing file
    """
    if isinstance(filename, str):
        if not os.path.exists(filename):
            raise FileNotFoundError(filename)
        if not os.path.isfile(filename):
            raise FileNotFoundError(  # pragma: no cover
                "'{0}' is not a file".format(filename))
        with open(filename, "r", encoding=encoding, errors=errors) as f:
            return file_head(f, nbline=nbline, encoding=encoding, errors=errors)

    rows = []
    # The length is only checked after a line was appended: without this
    # guard, nbline=0 would return (and consume) one line.
    if nbline <= 0:
        return rows
    for line in filename:
        rows.append(line)
        if len(rows) >= nbline:
            break
    return rows
def file_tail(filename: str, nbline=10, encoding="utf8", threshold=2 ** 14, errors="strict"):
    """
    Extracts the last *nbline* lines of a file (assuming it is a text file).

    :param filename: filename
    :param nbline: number of lines, a value of zero or less returns
        an empty list
    :param encoding: encoding
    :param threshold: if the file size is equal or above, the function
        only reads the last *threshold* bytes of the file
    :param errors: see `open <https://docs.python.org/3/library/functions.html#open>`_
    :return: list of lines, every line keeps its own line ending
    :raises FileNotFoundError: if *filename* is not an existing file
    :raises UnicodeDecodeError: if the content cannot be decoded

    When only the end of the file is read, the first byte may fall
    in the middle of a character encoded on several bytes. Decoding
    then fails and the function tries again after skipping one, two
    and finally three bytes. The first line read from a truncated
    content may be incomplete.
    """
    # Imported locally so that the function does not depend on an import
    # missing at the top of the module.
    import io

    if not os.path.exists(filename):
        raise FileNotFoundError(filename)  # pragma: no cover
    if not os.path.isfile(filename):
        raise FileNotFoundError(  # pragma: no cover
            "'{0}' is not a file".format(filename))
    # rows[-0:] is the whole list, zero must be handled explicitly.
    if nbline <= 0:
        return []

    size = os.stat(filename).st_size
    if size < threshold:
        with open(filename, "r", encoding=encoding, errors=errors) as f:
            rows = f.readlines()
        return rows[-nbline:]

    # Seeking to an arbitrary offset is only well defined in binary mode,
    # the bytes are decoded afterwards.
    with open(filename, "rb") as f:
        f.seek(size - threshold)
        raw = f.read()

    rows = None
    failure = None
    # A utf-8 character takes at most four bytes: skipping up to three
    # bytes is enough to reach the beginning of the next character.
    for skip in range(4):
        # TextIOWrapper decodes and translates newlines the same way
        # open(..., "r") does, both branches return consistent lines.
        wrapper = io.TextIOWrapper(
            io.BytesIO(raw[skip:]), encoding=encoding, errors=errors)
        try:
            rows = wrapper.readlines()
            break
        except UnicodeDecodeError as exc:
            failure = exc
    if rows is None:
        raise failure
    return rows[-nbline:]
def enumerate_grep(filename, regex, encoding="utf8", errors=None):
    """
    Iterates over the lines which match a regular expression.

    @param      filename    filename, any other object is considered
                            as an iterable over lines
    @param      regex       regular expression, a line is kept if
                            ``re.search`` finds it somewhere in the line
    @param      encoding    encoding (only used to open a file)
    @param      errors      see `open <https://docs.python.org/3/library/functions.html#open>`_
                            (only used to open a file)
    @return                 iterator on lines

    .. versionadded:: 1.1
    """
    if not isinstance(filename, str):
        # Already an iterable over lines: filter it directly.
        pattern = re.compile(regex)
        for row in filter(pattern.search, filename):
            yield row
        return

    if not os.path.exists(filename):
        raise FileNotFoundError(filename)  # pragma: no cover
    if not os.path.isfile(filename):
        raise FileNotFoundError(  # pragma: no cover
            "'{0}' is not a file".format(filename))
    # The file stays opened as long as the iterator is being consumed.
    with open(filename, "r", encoding=encoding, errors=errors) as stream:
        yield from enumerate_grep(stream, regex, encoding)
def file_encoding(filename_or_bytes, limit=2**20):
    """
    Guesses the encoding of a file or of a sequence of bytes.
    The detection is delegated to
    `chardet <http://chardet.readthedocs.io/en/latest/usage.html>`_.

    :param filename_or_bytes: filename (*str*) or content (*bytes*)
    :param limit: if *filename_or_bytes* is a filename, the function only
        loads the first *limit* bytes (or the whole file if *limit* is -1)
    :return: dictionary returned by ``chardet.detect``
    :raises FileNotFoundError: if the filename does not point to
        an existing file
    :raises TypeError: if *filename_or_bytes* is neither *str* nor *bytes*

    Example of results:

    ::

        {'encoding': 'EUC-JP', 'confidence': 0.99}
    """
    if isinstance(filename_or_bytes, bytes):
        return chardet.detect(filename_or_bytes)

    if not isinstance(filename_or_bytes, str):
        raise TypeError("Unexpecting type for filename_or_bytes, got: {0}.".format(
            type(filename_or_bytes)))

    path = filename_or_bytes
    if not os.path.exists(path):
        raise FileNotFoundError(path)
    if not os.path.isfile(path):
        raise FileNotFoundError(
            "'{0}' is not a file".format(path))

    file_size = os.stat(path).st_size
    read_everything = limit == -1 or file_size < limit
    with open(path, "rb") as stream:
        if read_everything:
            content = stream.read()
        else:
            content = stream.read(limit)
        return file_encoding(content)