# -*- coding: utf-8 -*-
"""
Various basic functions that are often needed.
:githublink:`%|py|6`
"""
import os
import re
import random
from pyquickhelper.loghelper.flog import fLOG, GetSepLine
from pyquickhelper.filehelper.synchelper import explore_folder_iterfile
_keep_var_character = re.compile("[^a-zA-Z0-9_]")
[docs]def _clean_name_variable(st):
"""
Cleans a string.
:param st: string to clean
:return: another string
:githublink:`%|py|23`
"""
res = _keep_var_character.split(st)
if res is None:
raise Exception("unable to clean " + st)
return "_".join(res)
def test_regular_expression(exp=".*", text="", fLOG=fLOG):
    """
    Searches a text with a regular expression and logs the outcome.

    :param exp: regular expression
    :param text: text to check
    :param fLOG: logging function
    """
    fLOG("regex", exp)
    fLOG("text", text)
    found = re.compile(exp).search(text)
    # Log the captured groups of the first match, or a notice
    # when the expression is not found anywhere in the text.
    outcome = "no result" if found is None else found.groups()
    fLOG(outcome)
def IsEmptyString(s):
    """
    Checks whether a string is *None* or contains no character.

    :param s: string
    :return: boolean
    """
    return True if s is None else len(s) == 0
def is_empty_string(s):
    """
    Checks whether a string is *None* or has a null length.

    :param s: string
    :return: boolean
    """
    # *None* is tested first, ``len(None)`` would raise an exception.
    return s is None or len(s) == 0
def file_head(file="",
              head=1000,
              out=""):
    """
    Keeps the head of a file.

    :param file: file name
    :param head: number of lines to keep
    :param out: output file, if None or empty, it becomes
        ``file + ".head.%d" % head + ext`` where *ext* is the
        extension of *file*
    :return: out
    :raises Exception: if *file* does not exist
    """
    if not os.path.exists(file):
        raise Exception("unable to find file %s" % file)
    if not out:
        # None and "" both mean: build the output name from the input name.
        ext = os.path.splitext(file)[1]
        out = "%s.head.%d%s" % (file, head, ext)
    # Context managers close both files even if reading or writing fails.
    with open(file, "r") as src, open(out, "w") as dst:
        for i, line in enumerate(src):
            if i >= head:
                break
            dst.write(line)
    return out
def _file_split_open(out, n):
    """
    Returns the stream which receives the lines of piece *n*.

    :param out: pattern containing ``%d`` or list of file names or streams
    :param n: index of the piece
    :return: tuple *(stream, owned)*, *owned* is True when the stream
        was opened by this function and must be closed by the caller
    """
    if isinstance(out, list):
        target = out[n]
        if isinstance(target, str):
            return open(target, "w"), True
        # Streams given by the caller stay under the caller's control.
        return target, False
    return open(out % n, "w"), True


def file_split(file="", nb=2, out="", header=False, rnd=False):
    """
    Splits a file, line by line, into several pieces.

    :param file: file name or stream (any iterable of lines)
    :param nb: number of files
    :param out: output file, if None or empty, then, it becomes:
        ``file + ".split.%d.ext" % i``, it must contain ``%d``
        or it must be a list of strings or streams
    :param header: consider a header or not, if True, the first line
        is copied into every piece
    :param rnd: randomly draw the file which receives the current line,
        otherwise the piece is chosen from the amount of data
        already read compared to the total size
    :return: index of the last line read (number of lines minus one,
        0 if nothing was read)
    :raises Exception: if *file* is a file name which does not exist
    :raises ValueError: if *out* is a string without ``%d``, or if *out*
        is empty while *file* is a stream

    A piece is only created when it receives its first line.
    Streams found in *out* are written but never closed.
    """
    is_name = isinstance(file, str)
    if is_name and not os.path.exists(file):
        raise Exception("unable to find file %s" % file)
    if not out:
        if not is_name:
            # No file name is available to build the default pattern.
            raise ValueError("out must be specified when file is a stream")
        ext = os.path.splitext(file)[1]
        out = "%s.split.%s%s" % (file, _get_format_zero_nb_integer(nb), ext)
    elif not isinstance(out, list) and "%d" not in out:
        raise ValueError("%d should be present in out='{0}'".format(out))

    if is_name:
        size = os.stat(file).st_size
        lines = open(file, "r")
    elif rnd:
        # The total size is only needed by the proportional split.
        size = None
        lines = file
    else:
        # The size of a stream is unknown: the lines are buffered
        # to measure it in the same unit as the running total.
        lines = list(file)
        size = sum(len(line) for line in lines)

    streams = {}
    owned = []
    tot = 0
    last_line = 0
    try:
        for i, line in enumerate(lines):
            last_line = i
            if i == 0 and header:
                # The header goes to every piece and is not counted in tot.
                targets = range(0, nb)
            else:
                if rnd:
                    n = random.randint(0, nb - 1)
                else:
                    # Clamped into the valid range of indices [0, nb - 1].
                    n = int(max(0, min(nb - 1, tot * nb / size)))
                tot += len(line)
                targets = (n,)
            for n in targets:
                if n not in streams:
                    stream, is_owned = _file_split_open(out, n)
                    streams[n] = stream
                    if is_owned:
                        owned.append(stream)
                streams[n].write(line)
            if (i + 1) % 10000 == 0:
                fLOG("    processed ", i, " bytes ", tot,
                     " out of ", size, " lines in ", out)
    finally:
        # Everything opened here is closed, even if a write failed.
        if is_name:
            lines.close()
        for stream in owned:
            stream.close()
    return last_line
def file_list(folder, out=""):
    """
    Prints the list of files and sub files in a text file.

    :param folder: folder
    :param out: result, a file name or a stream, if None or empty,
        it becomes ``folder + "_.list_of_files.txt"``
    :return: out (the file name which was used or the given stream)

    Every item returned by ``explore_folder_iterfile(folder)`` is written
    followed by the value of ``GetSepLine()``. A stream given through
    *out* is written but not closed.
    """
    owns_stream = out is None or isinstance(out, str)
    if owns_stream:
        if not out:
            out = "%s_.list_of_files.txt" % folder
        stream = open(out, "w")
    else:
        stream = out
    try:
        for name in explore_folder_iterfile(folder):
            stream.write(name)
            stream.write(GetSepLine())
    finally:
        # The file opened here is closed even if the exploration fails.
        if owns_stream:
            stream.close()
    return out
def file_grep(file="", regex=".*", out="", head=-1):
    """
    Grep: copies the lines of a file which contain a regular expression.

    :param file: file name
    :param regex: regular expression, it is searched in every line
    :param out: output file, if None or empty, it becomes
        ``file + ".regex.%d" % head + ext`` where *ext* is the
        extension of *file*
    :param head: stops once *head* matching lines were written
        (or -1 to never stop)
    :return: out
    :raises Exception: if *file* does not exist
    """
    if not os.path.exists(file):
        raise Exception("unable to find file %s" % file)
    if not out:
        # None and "" both mean: build the output name from the input name.
        ext = os.path.splitext(file)[1]
        out = "%s.regex.%d%s" % (file, head, ext)
    # Compiled before opening anything: an invalid expression
    # fails without creating the output file.
    exp = re.compile(regex)
    # Context managers close both files even if reading or writing fails.
    with open(file, "r") as src, open(out, "w") as dst:
        nb = 0
        for line in src:
            if exp.search(line):
                dst.write(line)
                nb += 1
                if nb >= head >= 0:
                    break
    return out