Coverage for pyquickhelper/filehelper/synchelper.py: 82%
278 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-03 02:21 +0200
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-03 02:21 +0200
1# -*- coding: utf-8 -*-
2"""
3@file
4@brief Series of functions related to folder, explore, synchronize, remove (recursively).
5"""
6import os
7import re
8import fnmatch
9from typing import Callable
10from ..loghelper.flog import fLOG
11from .file_tree_node import FileTreeNode
12from .files_status import FilesStatus, checksum_md5
13from ..loghelper.pqh_exception import PQHException
def explore_folder(folder, pattern=None, neg_pattern=None, fullname=False,
                   return_only=None, recursive=True, sub_pattern=None,
                   sub_replace=None, fLOG=None):
    """
    Returns the list of files included in a folder and its subfolders.
    Returned names can be modified if *sub_pattern* is specified.

    :param folder: (str) folder
    :param pattern: (str) if None, get all files, otherwise, it is a regular expression,
        the filename must verify (with the folder if fullname is True)
    :param neg_pattern: (str) negative pattern, every name it matches is excluded
    :param fullname: (bool) if True, include the subfolder while checking the regex (pattern)
    :param return_only: (str) to return folders and files (*=None*),
        only the files (*='f'*) or only the folders (*='d'*)
    :param recursive: (bool) look into subfolders, if False, the names come
        from :epkg:`*py:os:listdir` and therefore include the names
        of the direct subfolders
    :param sub_pattern: (str) replacements pattern, the output is
        then modified accordingly to this
        regular expression
    :param sub_replace: (str) if sub_pattern is specified, this second pattern
        specifies how to replace
    :param fLOG: (fct) logging function
    :return: (list, list), a list of folders, a list of files
        (the folder is included in the path name), or only one of them
        depending on *return_only*
    :raises ValueError: if *return_only* is not None, ``'f'`` or ``'d'``

    .. cmdref::
        :title: Explore the content of a directory
        :cmd: -m pyquickhelper ls --help

        The command calls function @see fn explore_folder
        and makes the list of all files in a directory or
        all folders. Example::

            python -m pyquickhelper ls -f _mynotebooks -r f -p .*[.]ipynb -n checkpoints -fu 1

        It works better with :epkg:`chrome`. An example to change file names::

            python -m pyquickhelper ls -f myfolder -p .*[.]py -r f -n pycache -fu 1 -s test_(.*) -su unit_\\1

        Or another to automatically create git commands to rename files::

            python -m pyquickhelper ls -f _mynotebooks -r f -p .*[.]ipynb -s "(.*)[.]ipynb" -su "git mv \\1.ipynb \\1~.ipynb"
    """
    if pattern is not None:
        pattern = re.compile(pattern)
    if neg_pattern is not None:
        neg_pattern = re.compile(neg_pattern)
    if sub_pattern is not None:
        sub_pattern = re.compile(sub_pattern)

    def listdir_aswalk(folder):
        "Mimics a single step of os.walk for the non recursive case."
        # It has to be a generator: the main loop unpacks every item it
        # receives as a (root, dirs, names) triple.
        yield folder, None, os.listdir(folder)

    fct = os.walk if recursive else listdir_aswalk

    found = 0
    not_matching = 0
    negfil = 0
    files, rep = [], {}
    for root, _, names in fct(folder):
        for name in names:
            found += 1
            temp = os.path.join(root, name)
            # The regular expressions see either the full path or the name.
            checked = temp if fullname else name
            if pattern is not None and not pattern.search(checked):
                not_matching += 1
                continue
            if neg_pattern is not None and neg_pattern.search(checked):
                negfil += 1
                continue
            if sub_pattern is not None:
                files.append(sub_pattern.sub(sub_replace, temp))
            else:
                files.append(temp)
            # Only folders holding at least one selected file are recorded.
            rep[os.path.split(temp)[0]] = None

    if fLOG:
        fLOG("[explore_folder] found={0} not-in={1} out={2} in '{3}'".format(
            found, not_matching, negfil, folder))

    keys = sorted(rep)
    if return_only == 'f':
        return files
    if return_only is None or return_only == 'd':
        if sub_pattern is not None:
            keys = [sub_pattern.sub(sub_replace, _) for _ in keys]
        return keys if return_only == 'd' else (keys, files)
    raise ValueError(  # pragma: no cover
        "return_only must be either None, 'f' or 'd'.")
def explore_folder_iterfile(folder, pattern=None, neg_pattern=None,
                            fullname=False, recursive=True, verbose=False):
    """
    Same as @see fn explore_folder but iterates on files
    included in a folder and its subfolders.

    :param folder: folder
    :param pattern: if None, get all files, otherwise, it is a regular expression,
        the filename must verify (with the folder if fullname is True)
    :param neg_pattern: negative pattern to exclude files
    :param fullname: if True, include the subfolder while checking the regex
    :param recursive: look into subfolders, if False, the names come from
        :epkg:`*py:os:listdir` and therefore include the names
        of the direct subfolders
    :param verbose: use :epkg:`tqdm` to display a progress bar
    :return: iterator on files
    """
    if pattern is not None:
        pattern = re.compile(pattern)
    if neg_pattern is not None:
        neg_pattern = re.compile(neg_pattern)

    def listdir_aswalk(folder):
        "Mimics a single step of os.walk for the non recursive case."
        yield folder, None, os.listdir(folder)

    walker = os.walk if recursive else listdir_aswalk

    def itera(folder):
        "Flattens the walk into (root, name) pairs."
        for root, _, names in walker(folder):
            for name in names:
                yield root, name

    if verbose:
        from tqdm import tqdm  # pragma: no cover
        loop = tqdm(itera(folder))  # pragma: no cover
    else:
        loop = itera(folder)

    for root, name in loop:
        if verbose:
            loop.set_description(root)  # pragma: no cover
        temp = os.path.join(root, name)
        # The regular expressions see either the full path or the name.
        checked = temp if fullname else name
        if pattern is not None and not pattern.search(checked):
            continue
        if neg_pattern is not None and neg_pattern.search(checked):
            # excluded names are skipped silently (no debug output)
            continue
        yield temp
def explore_folder_iterfile_repo(folder, log=fLOG):
    """
    Iterates on the files found in *folder* which were added to
    a :epkg:`SVN` or :epkg:`GIT` repository.

    @param      folder      folder
    @param      log         log function given to @see cl FileTreeNode
    @return                 iterator
    """
    # The tree is built in repository mode, the iteration goes through
    # whatever get_dict() returns (presumably its keys are the file names).
    tree = FileTreeNode(folder, repository=True, log=log)
    yield from tree.get_dict()
def synchronize_folder(p1: str, p2: str, hash_size=1024 ** 2, repo1=False, repo2=False,
                       size_different=True, no_deletion=False,
                       filter: [str, Callable[[str], str], None] = None,
                       filter_copy: [str, Callable[[str], str], None] = None,
                       avoid_copy=False, operations=None, file_date: str = None,
                       log1=False, copy_1to2=False, create_dest=False,
                       fLOG=fLOG):
    """
    Synchronizes two folders (or copy if the second is empty),
    it only copies more recent files.
    It can walk through a :epkg:`git` repository or
    `SVN <https://subversion.apache.org/>`_.

    :param p1: (str) first path
    :param p2: (str) second path
    :param hash_size: (int) given to ``FileTreeNode.difference`` which decides
        whether or not two files are different
    :param repo1: (bool) assuming the first folder is under SVN or GIT,
        it uses pysvn to get the list
        of files (avoiding any extra files)
    :param repo2: (bool) assuming the second folder is under SVN or GIT,
        it uses pysvn to get the list
        of files (avoiding any extra files)
    :param size_different: (bool) if True, a file will be copied only if size are different,
        otherwise, it will be copied if the first file is more recent
    :param no_deletion: (bool) if a file is found in the second folder and not in the first one,
        if will be removed unless no_deletion is True
    :param filter: (str) None to accept every file, a string if it is a regular expression,
        a function for something more complex:
        function ``(fullname) --> True``
        (every file is considered in lower case),
        (use :epkg:`*py:re:search`)
    :param filter_copy: (str) None to accept every file, a string if it is a regular expression,
        a function for something more complex: function (fullname) --> True
    :param avoid_copy: (bool) if True, just return the list of files
        which should be copied but does not do the copy
    :param operations: if not None, this function replaces the default copy
        or removal, it is called the following way ``operations(op, n1, n2)``,
        it should return True if the file was updated
    :param file_date: (str) filename which contains information about when the last sync was done
    :param log1: @see cl FileTreeNode
    :param copy_1to2: (bool) only copy files from *p1* to *p2*
    :param create_dest: (bool) create destination directory if not exist
    :param fLOG: logging function
    :return: list of operations done by the function,
        list of 3-uple: action, source_file, dest_file

    if ``file_date`` is mentioned, the second folder is not explored. Only
    the modified files will be taken into account (except for the first sync).

    .. exref::
        :title: synchronize two folders

        The following function synchronizes a folder with another one
        on a USB drive or a network drive. To minimize the number of access
        to the other location, it stores the status of the previous
        synchronization in a file (``status_copy.txt`` in the below example).
        Next time, the function goes through the directory and sub-directories
        to synchronize and only propagates the modifications which happened
        since the last modification.
        The function ``filter_copy`` defines what file to synchronize or not.

        ::

            def filter_copy(file):
                return "_don_t_synchronize_" not in file

            synchronize_folder( "c:/mydata",
                                "g:/mybackup",
                                hash_size = 0,
                                filter_copy = filter_copy,
                                file_date = "c:/status_copy.txt")

        The function is able to go through 90.000 files and 90 Gb
        in 12 minutes (for an update).
    """

    fLOG(f"[synchronize_folder] from '{p1}'")
    fLOG(f"[synchronize_folder] to '{p2}'")

    if create_dest and not os.path.exists(p2):
        fLOG(f"[synchronize_folder] md '{p2}'")
        os.makedirs(p2)

    # The status file must exist before FilesStatus reads it.
    if file_date is not None and not os.path.exists(file_date):
        with open(file_date, "w", encoding="utf8") as f:
            f.write("")

    def mytrue(v):
        "Accepts every file."
        return True

    if filter is None:
        tfilter = mytrue
    elif isinstance(filter, str):
        exp = re.compile(filter)

        def regtrue(be):
            "Accepts the files matching the regular expression *filter*."
            return exp.search(be) is not None

        tfilter = regtrue
    else:
        tfilter = filter

    def pr_filter(root, path, f, d):
        "Filter given to FileTreeNode, folders are always accepted."
        if d:
            return True
        # files are filtered on their lower case name
        be = os.path.join(path.lower(), f.lower())
        return tfilter(be)

    if isinstance(filter_copy, str):
        rg = re.compile(filter_copy)

        def regtrue2(f):
            "Accepts the files matching the regular expression *filter_copy*."
            return rg.search(f) is not None

        filter_copy = regtrue2

    f1 = p1
    f2 = p2

    fLOG(f"[synchronize_folder] exploring f1='{f1}'")
    node1 = FileTreeNode(
        f1, filter=pr_filter, repository=repo1, log=True, log1=log1)
    fLOG("[synchronize_folder] number of found files (p1)",
         len(node1), node1.max_date())
    if file_date is not None:
        # The second folder is not explored, the differences come from
        # the status saved by the previous synchronization.
        log1n = 1000 if log1 else None
        status = FilesStatus(file_date, fLOG=fLOG)
        res = list(status.difference(node1, u4=True, nlog=log1n))
    else:
        fLOG(f"[synchronize_folder] exploring f2='{f2}'")
        node2 = FileTreeNode(
            f2, filter=pr_filter, repository=repo2, log=True, log1=log1)
        fLOG("[synchronize_folder] number of found files (p2)",
             len(node2), node2.max_date())
        res = node1.difference(node2, hash_size=hash_size)
        status = None

    action = []
    modif = 0
    report = {">": 0, ">+": 0, "<": 0, "<+": 0, "<=": 0, ">-": 0, "issue": 0}

    def count_modification(key, position, current):
        """
        Counts one modification under *key* and, every 50 modifications,
        logs the progress and saves the status if there is one.
        """
        nonlocal modif
        modif += 1
        # .get: an operation code outside the predefined keys must not fail
        report[key] = report.get(key, 0) + 1
        if modif % 50 == 0:
            fLOG(
                f"[synchronize_folder] Processed {position}/{len(res)} (current: '{current}')")
            # status only exists when file_date was given
            if status is not None:
                status.save_dates()

    fLOG("[synchronize_folder] Starting synchronisation.")
    nbcur = 0
    nbprint = 0
    for op, file, n1, n2 in res:
        nbcur += 1
        # Logs the first operations then one out of 50.
        if (nbprint <= 50 or nbcur % 50 == 0) and \
                op not in ("==", '<', '<=', '<+') and \
                (n1 is None or not n1.isdir()):
            fLOG(
                f"[synchronize_folder] ... {nbcur}/{len(res)} (current: '{file}' :: {op})")
            nbprint += 1
        if filter_copy is not None and not filter_copy(file):
            continue

        if operations is not None:
            # The user takes care of the operation.
            if operations(op, n1, n2):
                if status is not None:
                    status.update_copied_file(n1.fullname)
                count_modification(op, nbcur, file)

        elif op in (">", ">+"):
            # The file is more recent or only exists in the first folder.
            if not n1.isdir() and (
                    file_date is not None or not size_different or
                    n2 is None or n1._size != n2._size):
                if not avoid_copy:
                    n1.copy_to(f2, copy_1to2)
                action.append((">+", n1, f2))
                if status is not None:
                    status.update_copied_file(n1.fullname)
                count_modification(op, nbcur, file)

        elif op == "<+":
            if copy_1to2:
                continue
            if n2 is None:
                if not no_deletion:
                    # this case happens when it does not know sideB (sideA is stored in a file)
                    # it needs to remove file, file refers to this side
                    filerel = os.path.relpath(file, start=p1)
                    filerem = os.path.join(p2, filerel)
                    try:
                        ft = FileTreeNode(p2, filerel)
                    except PQHException:
                        ft = None  # probably already removed

                    if ft is not None:
                        action.append((">-", None, ft))
                        if not avoid_copy:
                            fLOG(  # pragma: no cover
                                "[synchronize_folder] - remove ", filerem)
                            os.remove(filerem)
                        if status is not None:
                            status.update_copied_file(file, delete=True)
                        count_modification(op, nbcur, file)
                    else:
                        fLOG(  # pragma: no cover
                            "[synchronize_folder] - skip (probably already removed) "
                            "", filerem)
            elif not n2.isdir() and not no_deletion:
                if not avoid_copy:
                    n2.remove()
                action.append((">-", None, n2))
                if status is not None:
                    # NOTE(review): status is only set when file_date is given,
                    # n2 is then presumably always None and this line never
                    # runs - confirm before relying on n1 here.
                    status.update_copied_file(n1.fullname, delete=True)
                count_modification(">-", nbcur, file)

        elif n2 is not None and n1._size != n2._size and not n1.isdir():
            # Unexpected: the operation says nothing to copy but sizes differ.
            fLOG(  # pragma: no cover
                "[synchronize_folder] problem: size are different for "
                "file %s (%d != %d) dates (%s,%s) (op %s)" % (
                    file, n1._size, n2._size, n1._date, n2._date, op))
            report["issue"] += 1

    if status is not None:
        status.save_dates(file_date)

    report = [(k, v) for k, v in sorted(report.items()) if v > 0]
    if len(report):
        msg = [f"{k}={v}" for k, v in report]
        fLOG(f"[synchronize_folder] END: {msg}")
    else:
        fLOG("[synchronize_folder] END: no copy")

    return action
def remove_folder(top, remove_also_top=True, raise_exception=True):
    """
    Removes everything in folder *top*.

    @param      top                 path to remove
    @param      remove_also_top     remove also root
    @param      raise_exception     raise an exception if a file or a folder
                                    cannot be removed, otherwise the element
                                    is skipped and *top* is not removed
    @return                         list of removed files and folders
                                     --> list of tuple ( (name, "file" or "dir") )

    A symbolic link to a folder is removed (and reported as ``"dir"``)
    without touching the folder it points to.
    The function raises *RuntimeError* if *top* is empty or looks like a root.
    """
    if top in {"", "/", "\\",
               "C:", "c:", "C:\\", "c:\\", "d:", "D:", "D:\\", "d:\\"}:
        raise RuntimeError(  # pragma: no cover
            "top is a root (c: for example), this is not safe")

    res = []
    # bottom-up: a folder is visited after its content was removed
    for root, dirs, files in os.walk(top, topdown=False):
        for name in files:
            t = os.path.join(root, name)
            try:
                os.remove(t)
            except PermissionError as e:  # pragma: no cover
                if raise_exception:
                    raise PermissionError(
                        f"unable to remove file {t}") from e
                remove_also_top = False
                continue
            res.append((t, "file"))
        for name in dirs:
            t = os.path.join(root, name)
            try:
                if os.path.islink(t):
                    # os.walk lists links to folders among the folders
                    # but os.rmdir cannot remove a link
                    os.unlink(t)
                else:
                    os.rmdir(t)
            except OSError as e:
                if raise_exception:
                    raise OSError(
                        f"unable to remove folder {t}") from e
                remove_also_top = False  # pragma: no cover
                continue  # pragma: no cover
            res.append((t, "dir"))

    if remove_also_top:
        os.rmdir(top)
        res.append((top, "dir"))

    return res
def has_been_updated(source, dest):
    """
    Tells if *dest*, assumed to be a copy of *source*,
    is still up to date.

    @param      source      filename
    @param      dest        copy
    @return                 True,reason or False,None

    The reason is ``"new"`` (*dest* does not exist), ``"size"``
    (sizes differ), ``"date"`` (*source* was modified after *dest*)
    or ``"md5"`` (checksums differ). The checks are done in that order,
    the checksum is only computed if the other ones found nothing.
    """
    if not os.path.exists(dest):
        return True, "new"

    stat_source, stat_dest = os.stat(source), os.stat(dest)
    if stat_source.st_size != stat_dest.st_size:
        return True, "size"
    if stat_source.st_mtime > stat_dest.st_mtime:
        return True, "date"

    # last resort: compare the content
    if checksum_md5(source) != checksum_md5(dest):
        return True, "md5"
    return False, None
def walk(top, onerror=None, followlinks=False, neg_filter=None):
    """
    Does the same as :epkg:`*py:os:walk` but skips
    the sub-folders selected by *neg_filter*.
    Folders such *build* or *Debug* or *Release*
    may not need to be dug into.

    @param      top             folder
    @param      onerror         see :epkg:`*py:os:walk`
    @param      followlinks     see :epkg:`*py:os:walk`
    @param      neg_filter      filtering, a string, every folder verifying the filter will be excluded
                                (file pattern, not a regular expression pattern),
                                or a function called on every folder name,
                                the folder is excluded if it returns True
    @return                     see :epkg:`*py:os:walk`
    """
    if neg_filter is None:
        yield from os.walk(top=top, onerror=onerror, followlinks=followlinks)
        return

    if isinstance(neg_filter, str):
        def excluded(name):
            "file pattern matching"
            return fnmatch.fnmatch(name, neg_filter)
    else:
        excluded = neg_filter

    for root, dirs, files in os.walk(top, onerror=onerror, followlinks=followlinks):
        # dirs is modified in place: os.walk does not visit removed folders
        dirs[:] = [d for d in dirs if not excluded(d)]
        yield root, dirs, files
570def download_urls_iterfile(folder, pattern=None, neg_pattern=None,
571 fullname=False, recursive=True):
572 """
573 Same as @see fn explore_folder but iterates on files
574 included in a folder and its subfolders.
576 :param folder: folder
577 :param pattern: if None, get all files, otherwise, it is a regular expression,
578 the filename must verify (with the folder is fullname is True)
579 :param neg_pattern: negative pattern to exclude files
580 :param fullname: if True, include the subfolder while checking the regex
581 :param recursive: look into subfolders
582 :return: iterator on files
583 """