Coverage for src/pyensae/filehelper/content_helper.py : 88%

Hot-keys on this page

r m x p toggle line displays

j k next/prev highlighted chunk

0 (zero) top of page

1 (one) first highlighted chunk

1"""

2@file

3@brief Various functions to process text

4"""

6import os

7import re

8import chardet

def replace_comma_by_point(file, encoding=None):
    """
    Replaces all commas by points in a file (the file is rewritten in place).

    :param file: file to process
    :param encoding: encoding used to read and write the file,
        None (default) lets :func:`open` pick its default encoding
    """
    with open(file, "r", encoding=encoding) as f:
        text = f.read()
    # The file is only reopened for writing once its whole content
    # has been read, opening with "w" truncates it.
    with open(file, "w", encoding=encoding) as f:
        f.write(text.replace(",", "."))

def file_head(filename: str, nbline=10, encoding="utf8", errors="strict"):
    """
    Extracts the first *nbline* lines of a file (assuming it is a text file).

    :param filename: filename, any other object than a string is
        iterated over as a sequence of lines (an opened file for example)
    :param nbline: number of lines, a value <= 0 returns an empty list
    :param encoding: encoding, only used if *filename* is a string
    :param errors: see `open <https://docs.python.org/3/library/functions.html#open>`_,
        only used if *filename* is a string
    :return: list of lines
    :raises FileNotFoundError: if *filename* is a string which does not
        point to an existing file
    """
    from itertools import islice

    if isinstance(filename, str):
        if not os.path.exists(filename):
            raise FileNotFoundError(filename)
        if not os.path.isfile(filename):
            raise FileNotFoundError(  # pragma: no cover
                "'{0}' is not a file".format(filename))
        with open(filename, "r", encoding=encoding, errors=errors) as f:
            return file_head(f, nbline=nbline, encoding=encoding, errors=errors)

    # islice stops after nbline items without reading further,
    # it rejects negative values so they are mapped to 0.
    return list(islice(filename, max(nbline, 0)))

def file_tail(filename: str, nbline=10, encoding="utf8", threshold=2 ** 14, errors="strict"):
    """
    Extracts the last *nbline* lines of a file (assuming it is a text file).

    :param filename: filename
    :param nbline: number of lines, a value <= 0 returns an empty list
    :param encoding: encoding
    :param threshold: if the file size (in bytes) is not below this value,
        only the last *threshold* bytes of the file are read
    :param errors: see `open <https://docs.python.org/3/library/functions.html#open>`_
    :return: list of lines, every line keeps its end of line
        if it has one in the file
    :raises FileNotFoundError: if *filename* is not an existing file
    :raises UnicodeDecodeError: if the content cannot be decoded

    When only the end of the file is read, the reading starts at a byte
    offset which may fall in the middle of a multi-byte character.
    The function then skips up to three leading bytes until the
    remaining bytes can be decoded.
    In that case, the first returned line may be incomplete.
    """
    import io

    if not os.path.exists(filename):
        raise FileNotFoundError(filename)  # pragma: no cover
    if not os.path.isfile(filename):
        raise FileNotFoundError(  # pragma: no cover
            "'{0}' is not a file".format(filename))
    if nbline <= 0:
        # rows[-0:] would return every line instead of none
        return []

    size = os.stat(filename).st_size
    if size < threshold:
        with open(filename, "r", encoding=encoding, errors=errors) as f:
            rows = f.readlines()
        return rows[-nbline:]

    # The end of the file is read as bytes: seeking to an arbitrary
    # offset is only well defined for binary streams.
    with open(filename, "rb") as f:
        f.seek(size - threshold)
        raw = f.read()

    # A utf-8 character takes at most four bytes, hence at most three
    # continuation bytes need to be dropped to reach a character boundary.
    failure = None
    for skip in range(4):
        try:
            # TextIOWrapper applies the same newline translation as
            # a file opened in text mode.
            with io.TextIOWrapper(io.BytesIO(raw[skip:]), encoding=encoding,
                                  errors=errors) as text:
                rows = text.readlines()
            break
        except UnicodeDecodeError as exc:
            failure = exc
    else:
        raise failure
    return rows[-nbline:]

def enumerate_grep(filename, regex, encoding="utf8", errors=None):
    """
    Extracts lines matching a regular expression.

    :param filename: filename, any other object than a string is
        iterated over as a sequence of lines (an opened file for example)
    :param regex: regular expression, a line is kept
        if ``re.search`` finds a match in it
    :param encoding: encoding, only used if *filename* is a string
    :param errors: see `open <https://docs.python.org/3/library/functions.html#open>`_,
        only used if *filename* is a string
    :return: iterator on lines
    :raises FileNotFoundError: if *filename* is a string which does not
        point to an existing file

    The function is a generator: the file is checked and opened
    when the iteration starts, not when the function is called.

    .. versionadded:: 1.1
    """
    if isinstance(filename, str):
        if not os.path.exists(filename):
            raise FileNotFoundError(filename)  # pragma: no cover
        if not os.path.isfile(filename):
            raise FileNotFoundError(  # pragma: no cover
                "'{0}' is not a file".format(filename))
        with open(filename, "r", encoding=encoding, errors=errors) as f:
            yield from enumerate_grep(f, regex, encoding=encoding, errors=errors)
    else:
        # The expression is compiled once, outside of the loop.
        search = re.compile(regex).search
        for line in filename:
            if search(line):
                yield line

121

122

def file_encoding(filename_or_bytes, limit=2**20):
    """
    Returns the encoding of a file.
    The function relies on `chardet <http://chardet.readthedocs.io/en/latest/usage.html>`_.

    :param filename_or_bytes: filename (string) or content (bytes, bytearray)
    :param limit: if *filename_or_bytes* is a file, the function only
        loads the first *limit* bytes (or all if *limit* is -1)
    :return: dictionary
    :raises FileNotFoundError: if *filename_or_bytes* is a string which
        does not point to an existing file
    :raises TypeError: if *filename_or_bytes* is neither a string
        nor bytes

    Example of results:

    ::

        {'encoding': 'EUC-JP', 'confidence': 0.99}
    """
    if isinstance(filename_or_bytes, str):
        if not os.path.exists(filename_or_bytes):
            raise FileNotFoundError(filename_or_bytes)
        if not os.path.isfile(filename_or_bytes):
            raise FileNotFoundError(
                "'{0}' is not a file".format(filename_or_bytes))
        with open(filename_or_bytes, "rb") as f:
            # read(-1) returns the whole file and read(limit) stops at
            # the end of the file, the file size is not needed.
            content = f.read(limit)
        return file_encoding(content)
    if isinstance(filename_or_bytes, (bytes, bytearray)):
        return chardet.detect(filename_or_bytes)
    raise TypeError("Unexpecting type for filename_or_bytes, got: {0}.".format(
        type(filename_or_bytes)))

Coverage for src/pyensae/filehelper/content_helper.py : 88%

64 statements

Coverage for src/pyensae/filehelper/content_helper.py : 88%

64 statements 56 run 8 missing 5 excluded

64 statements