Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief Series of functions related to folder, explore, synchronize, remove (recursively). 

5""" 

6import os 

7import re 

8import fnmatch 

9from typing import Callable 

10from ..loghelper.flog import fLOG 

11from .file_tree_node import FileTreeNode 

12from .files_status import FilesStatus, checksum_md5 

13from ..loghelper.pqh_exception import PQHException 

14 

15 

def explore_folder(folder, pattern=None, neg_pattern=None, fullname=False,
                   return_only=None, recursive=True, sub_pattern=None,
                   sub_replace=None, fLOG=None):
    """
    Returns the list of files included in a folder and its subfolders.
    Returned names can be modified if *sub_pattern* is specified.

    :param folder: (str) folder
    :param pattern: (str) if None, get all files, otherwise, it is a regular expression,
        the filename must verify (with the folder if fullname is True)
    :param neg_pattern: (str) negative pattern, every name matching it is skipped
    :param fullname: (bool) if True, include the subfolder while checking the regex
        (*pattern* and *neg_pattern*)
    :param return_only: (str) to return folders and files (*=None*),
        only the files (*='f'*) or only the folders (*='d'*)
    :param recursive: (bool) look into subfolders, if False, the names
        come from :epkg:`*py:os:listdir` (which also lists the
        immediate sub-folders)
    :param sub_pattern: (str) replacements pattern, the output is
        then modified accordingly to this
        regular expression
    :param sub_replace: (str) if sub_pattern is specified, this second pattern
        specifies how to replace
    :param fLOG: (fct) logging function
    :return: (list, list), a sorted list of folders, a list of files
        (the folder is included in the path name),
        or only one of them depending on *return_only*
    :raises ValueError: if *return_only* is not None, ``'f'`` or ``'d'``

    .. cmdref::
        :title: Explore the content of a directory
        :cmd: -m pyquickhelper ls --help

        The command calls function @see fn explore_folder
        and makes the list of all files in a directory or
        all folders. Example::

            python -m pyquickhelper ls -f _mynotebooks -r f -p .*[.]ipynb -n checkpoints -fu 1

        It works better with :epkg:`chrome`. An example to change file names::

            python -m pyquickhelper ls -f myfolder -p .*[.]py -r f -n pycache -fu 1 -s test_(.*) -su unit_\\1

        Or another to automatically create git commands to rename files::

            python -m pyquickhelper ls -f _mynotebooks -r f -p .*[.]ipynb -s "(.*)[.]ipynb" -su "git mv \\1.ipynb \\1~.ipynb"
    """
    if pattern is not None:
        pattern = re.compile(pattern)
    if neg_pattern is not None:
        neg_pattern = re.compile(neg_pattern)
    if sub_pattern is not None:
        sub_pattern = re.compile(sub_pattern)

    def listdir_aswalk(folder):
        "mimics :epkg:`*py:os:walk` for a single, non recursive, level"
        # It must be an iterable of triples: returning the bare tuple made
        # the caller iterate over the tuple itself and fail on unpacking.
        yield folder, None, os.listdir(folder)

    walker = os.walk if recursive else listdir_aswalk

    found = 0
    rejected = 0   # names which do not match *pattern*
    excluded = 0   # names which match *neg_pattern*
    files, parents = [], set()
    for root, _, names in walker(folder):
        for name in names:
            found += 1
            full = os.path.join(root, name)
            # the regular expressions see the full path only if requested
            checked = full if fullname else name
            if pattern is not None and not pattern.search(checked):
                rejected += 1
                continue
            if neg_pattern is not None and neg_pattern.search(checked):
                excluded += 1
                continue
            if sub_pattern is None:
                files.append(full)
            else:
                files.append(sub_pattern.sub(sub_replace, full))
            parents.add(os.path.split(full)[0])

    if fLOG:
        fLOG("[explore_folder] found={0} not-in={1} out={2} in '{3}'".format(
            found, rejected, excluded, folder))

    if return_only == 'f':
        return files
    if return_only is None or return_only == 'd':
        keys = sorted(parents)
        if sub_pattern is not None:
            keys = [sub_pattern.sub(sub_replace, _) for _ in keys]
        return (keys, files) if return_only is None else keys
    raise ValueError(  # pragma: no cover
        "return_only must be either None, 'f' or 'd'.")

120 

121 

def explore_folder_iterfile(folder, pattern=None, neg_pattern=None,
                            fullname=False, recursive=True, verbose=False):
    """
    Same as @see fn explore_folder but iterates on files
    included in a folder and its subfolders.

    :param folder: folder
    :param pattern: if None, get all files, otherwise, it is a regular expression,
        the filename must verify (with the folder if fullname is True)
    :param neg_pattern: negative pattern to exclude files
    :param fullname: if True, include the subfolder while checking the regex
    :param recursive: look into subfolders, if False, the names
        come from :epkg:`*py:os:listdir`
    :param verbose: use :epkg:`tqdm` to display a progress bar
    :return: iterator on files (each name is joined with its folder)
    """
    if pattern is not None:
        pattern = re.compile(pattern)
    if neg_pattern is not None:
        neg_pattern = re.compile(neg_pattern)

    def listdir_aswalk(folder):
        "mimics :epkg:`*py:os:walk` for a single, non recursive, level"
        yield folder, None, os.listdir(folder)

    walker = os.walk if recursive else listdir_aswalk

    def iter_entries(folder):
        "flattens the walk into pairs (folder, name)"
        for root, _, names in walker(folder):
            for name in names:
                yield root, name

    if verbose:
        # tqdm is only imported when a progress bar is requested
        from tqdm import tqdm  # pragma: no cover
        loop = tqdm(iter_entries(folder))  # pragma: no cover
    else:
        loop = iter_entries(folder)

    for root, name in loop:
        if verbose:
            loop.set_description(root)  # pragma: no cover
        full = os.path.join(root, name)
        # the regular expressions see the full path only if requested
        checked = full if fullname else name
        if pattern is not None and not pattern.search(checked):
            continue
        if neg_pattern is not None and neg_pattern.search(checked):
            continue
        yield full

182 

183 

def explore_folder_iterfile_repo(folder, log=fLOG):
    """
    Iterates on the files of *folder* which were added to
    a :epkg:`SVN` or :epkg:`GIT` repository.

    @param      folder      folder
    @param      log         log function
    @return                 iterator

    The list of files is the one built by @see cl FileTreeNode
    when it is created with ``repository=True``.
    """
    tree = FileTreeNode(folder, repository=True, log=log)
    # iterates over whatever get_dict returns, one element at a time
    yield from tree.get_dict()

197 

198 

def synchronize_folder(p1: str, p2: str, hash_size=1024 ** 2, repo1=False, repo2=False,
                       size_different=True, no_deletion=False,
                       filter: [str, Callable[[str], str], None] = None,
                       filter_copy: [str, Callable[[str], str], None] = None,
                       avoid_copy=False, operations=None, file_date: str = None,
                       log1=False, copy_1to2=False, create_dest=False,
                       fLOG=fLOG):
    """
    Synchronizes two folders (or copy if the second is empty),
    it only copies more recent files.
    It can walk through a :epkg:`git` repository or
    `SVN <https://subversion.apache.org/>`_.

    :param p1: (str) first path
    :param p2: (str) second path
    :param hash_size: (int) given to ``FileTreeNode.difference``
        to check whether or not two files are different
    :param repo1: (bool) assuming the first folder is under SVN or GIT,
        it uses pysvn to get the list
        of files (avoiding any extra files)
    :param repo2: (bool) assuming the second folder is under SVN or GIT,
        it uses pysvn to get the list
        of files (avoiding any extra files)
    :param size_different: (bool) if True, a file will be copied only if size are different,
        otherwise, it will be copied if the first file is more recent
    :param no_deletion: (bool) if a file is found in the second folder and not in the first one,
        it will be removed unless no_deletion is True
    :param filter: (str) None to accept every file, a string if it is a regular expression,
        a function for something more complex:
        function ``(fullname) --> True``
        (every file is considered in lower case),
        (use :epkg:`*py:re:search`)
    :param filter_copy: (str) None to accept every file, a string if it is a regular expression,
        a function for something more complex: function (fullname) --> True
    :param avoid_copy: (bool) if True, just return the list of files
        which should be copied but does not do the copy
    :param operations: if not None, this function replaces the default
        processing and is called the following way ``operations(op, n1, n2)``,
        it should return True if the file was updated
    :param file_date: (str) filename which contains information about when the last sync was done
    :param log1: @see cl FileTreeNode
    :param copy_1to2: (bool) only copy files from *p1* to *p2*
    :param create_dest: (bool) create destination directory if not exist
    :param fLOG: logging function
    :return: list of operations done by the function,
        list of 3-uple: action, source_file, dest_file

    if ``file_date`` is mentioned, the second folder is not explored. Only
    the modified files will be taken into account (except for the first sync).

    .. exref::
        :title: synchronize two folders

        The following function synchronizes a folder with another one
        on a USB drive or a network drive. To minimize the number of access
        to the other location, it stores the status of the previous
        synchronization in a file (``status_copy.txt`` in the below example).
        Next time, the function goes through the directory and sub-directories
        to synchronize and only propagates the modifications which happened
        since the last modification.
        The function ``filter_copy`` defines what file to synchronize or not.

        ::

            def filter_copy(file):
                return "_don_t_synchronize_" not in file

            synchronize_folder( "c:/mydata",
                                "g:/mybackup",
                                hash_size = 0,
                                filter_copy = filter_copy,
                                file_date = "c:/status_copy.txt")

        The function is able to go through 90.000 files and 90 Gb
        in 12 minutes (for an update).
    """
    # NOTE(review): the nesting of this body was rebuilt from a listing
    # without indentation, confirm it against the repository version.

    fLOG("[synchronize_folder] from '{0}'".format(p1))
    fLOG("[synchronize_folder] to '{0}'".format(p2))

    if create_dest and not os.path.exists(p2):
        fLOG("[synchronize_folder] md '{0}'".format(p2))
        os.makedirs(p2)

    # the status file must exist before FilesStatus opens it
    if file_date is not None and not os.path.exists(file_date):
        with open(file_date, "w", encoding="utf8") as f:
            f.write("")

    # *filter* is turned into a predicate on the lower case full name
    if filter is None:
        def accept(name):
            "accepts every file"
            return True
    elif isinstance(filter, str):
        filter_regex = re.compile(filter)

        def accept(name):
            "accepts a file if the regular expression is found"
            return bool(filter_regex.search(name))
    else:
        accept = filter

    def pr_filter(root, path, f, d):
        "filter given to FileTreeNode, *d* is truthy for an entry always kept"
        if d:
            return True
        return accept(os.path.join(path.lower(), f.lower()))

    # *filter_copy* is turned into a predicate as well (or stays None)
    copy_filter = filter_copy
    if isinstance(filter_copy, str):
        copy_regex = re.compile(filter_copy)

        def copy_filter(name):
            "accepts a file if the regular expression is found"
            return copy_regex.search(name) is not None

    fLOG("[synchronize_folder] exploring f1='{0}'".format(p1))
    node1 = FileTreeNode(
        p1, filter=pr_filter, repository=repo1, log=True, log1=log1)
    fLOG("[synchronize_folder] number of found files (p1)",
         len(node1), node1.max_date())

    if file_date is None:
        # both sides are explored and compared
        status = None
        fLOG("[synchronize_folder] exploring f2='{0}'".format(p2))
        node2 = FileTreeNode(
            p2, filter=pr_filter, repository=repo2, log=True, log1=log1)
        fLOG("[synchronize_folder] number of found files (p2)",
             len(node2), node2.max_date())
        res = node1.difference(node2, hash_size=hash_size)
    else:
        # the first side is compared to the stored status
        status = FilesStatus(file_date, fLOG=fLOG)
        res = list(status.difference(
            node1, u4=True, nlog=1000 if log1 else None))

    action = []
    modif = 0
    report = {">": 0, ">+": 0, "<": 0, "<+": 0, "<=": 0, ">-": 0, "issue": 0}

    def tally(key, index, current):
        "counts one modification, saves the status every 50 modifications"
        # only called when status is not None
        nonlocal modif
        modif += 1
        report[key] += 1
        if modif % 50 == 0:
            fLOG("[synchronize_folder] Processed {0}/{1} (current: '{2}')".format(
                index, len(res), current))
            status.save_dates()

    fLOG("[synchronize_folder] Starting synchronisation.")
    nbcur = 0
    nbprint = 0
    for op, file, n1, n2 in res:
        nbcur += 1
        if (nbprint <= 50 or nbcur % 50 == 0) and \
                op not in ("==", '<', '<=', '<+') and \
                (n1 is None or not n1.isdir()):
            fLOG(
                "[synchronize_folder] ... {0}/{1} (current: '{2}' :: {3})".format(
                    nbcur, len(res), file, op))
            nbprint += 1
        if copy_filter is not None and not copy_filter(file):
            continue

        if operations is not None:
            # the caller handles the operation itself
            if operations(op, n1, n2) and status is not None:
                status.update_copied_file(n1.fullname)
                tally(op, nbcur, file)
            continue

        if op in (">", ">+"):
            # presumably: the file must go from p1 to p2
            # (see FileTreeNode.difference)
            if not n1.isdir() and (
                    file_date is not None or not size_different or
                    n2 is None or n1._size != n2._size):
                if not avoid_copy:
                    n1.copy_to(p2, copy_1to2)
                action.append((">+", n1, p2))
                if status is not None:
                    status.update_copied_file(n1.fullname)
                    tally(op, nbcur, file)
            continue

        if op == "<+":
            if copy_1to2:
                continue

            if n2 is not None:
                if not n2.isdir() and not no_deletion:
                    if not avoid_copy:
                        n2.remove()
                    action.append((">-", None, n2))
                    if status is not None:
                        status.update_copied_file(
                            n1.fullname, delete=True)
                        tally(">-", nbcur, file)
                continue

            if no_deletion:
                continue

            # this case happens when it does not know sideB (sideA is stored in a file)
            # it needs to remove file, file refers to this side
            filerel = os.path.relpath(file, start=p1)
            filerem = os.path.join(p2, filerel)
            try:
                ft = FileTreeNode(p2, filerel)
            except PQHException:
                ft = None  # probably already removed

            if ft is None:
                fLOG(  # pragma: no cover
                    "[synchronize_folder] - skip (probably already removed) ",
                    filerem)
                continue

            action.append((">-", None, ft))
            if not avoid_copy:
                fLOG(  # pragma: no cover
                    "[synchronize_folder] - remove ", filerem)
                os.remove(filerem)
            if status is not None:
                status.update_copied_file(file, delete=True)
                tally(op, nbcur, file)
            continue

        if n2 is not None and n1._size != n2._size and not n1.isdir():
            fLOG(  # pragma: no cover
                "[synchronize_folder] problem: size are different for "
                "file %s (%d != %d) dates (%s,%s) (op %s)" % (
                    file, n1._size, n2._size, n1._date, n2._date, op))
            report["issue"] += 1

    if status is not None:
        status.save_dates(file_date)

    # only the counters which changed are logged
    summary = ["{}={}".format(k, v)
               for k, v in sorted(report.items()) if v > 0]
    if summary:
        fLOG("[synchronize_folder] END: {}".format(summary))
    else:
        fLOG("[synchronize_folder] END: no copy")

    return action

455 

456 

def remove_folder(top, remove_also_top=True, raise_exception=True):
    """
    Removes everything in folder *top*.

    @param      top                 path to remove
    @param      remove_also_top     remove also root
    @param      raise_exception     raise an exception if a file cannot be remove
    @return                         list of removed files and folders
                                     --> list of tuple ( (name, "file" or "dir") )

    The function refuses to work on an empty path or on a root
    (``/``, ``c:``, ``d:``) and raises an exception in that case.
    A symbolic link to a folder is removed (not its target)
    and reported as ``"dir"``.
    If something cannot be removed and *raise_exception* is False,
    the entry is skipped and *top* is not removed.
    """
    if top in {"", "/", "C:", "c:", "C:\\", "c:\\", "d:", "D:", "D:\\", "d:\\"}:
        raise Exception(  # pragma: no cover
            "top is a root (c: for example), this is not safe")

    res = []
    # bottom-up: the content of a folder is removed before the folder itself
    for root, dirs, files in os.walk(top, topdown=False):
        for name in files:
            t = os.path.join(root, name)
            try:
                os.remove(t)
            except PermissionError as e:  # pragma: no cover
                if raise_exception:
                    raise PermissionError(
                        "unable to remove file {0}".format(t)) from e
                remove_also_top = False
                continue
            res.append((t, "file"))
        for name in dirs:
            t = os.path.join(root, name)
            try:
                if os.path.islink(t):
                    # os.walk lists a link to a folder among the folders
                    # but os.rmdir cannot remove it, the link is unlinked
                    os.remove(t)
                else:
                    os.rmdir(t)
            except OSError as e:
                if raise_exception:
                    raise OSError(
                        "unable to remove folder {0}".format(t)) from e
                remove_also_top = False  # pragma: no cover
                continue  # pragma: no cover
            res.append((t, "dir"))

    if top is not None and remove_also_top:
        os.rmdir(top)
        res.append((top, "dir"))

    return res

504 

505 

def has_been_updated(source, dest):
    """
    Tells if *dest*, assumed to be a copy of *source*,
    is still up to date.

    @param      source      filename
    @param      dest        copy
    @return                 True,reason or False,None

    The checks are done from the cheapest to the most expensive,
    the reason is one of ``"new"`` (*dest* does not exist),
    ``"size"`` (sizes are different), ``"date"`` (*source* was
    modified after *dest*), ``"md5"`` (checksums are different).
    """
    if not os.path.exists(dest):
        return True, "new"

    src_stat, dst_stat = os.stat(source), os.stat(dest)

    if src_stat.st_size != dst_stat.st_size:
        return True, "size"

    if src_stat.st_mtime > dst_stat.st_mtime:
        return True, "date"

    # the content is only read when size and date cannot tell
    if checksum_md5(source) != checksum_md5(dest):
        return True, "md5"

    return False, None

535 

536 

def walk(top, onerror=None, followlinks=False, neg_filter=None):
    """
    Does the same as :epkg:`*py:os:walk`
    but is able to skip some sub-folders.
    Folders such *build* or *Debug* or *Release*
    may not need to be dug into.

    @param      top             folder
    @param      onerror         see :epkg:`*py:os:walk`
    @param      followlinks     see :epkg:`*py:os:walk`
    @param      neg_filter      filtering, a string, every folder verifying the filter will be excluded
                                (file pattern, not a regular expression pattern),
                                or a function which receives the folder name
                                and returns True to exclude it
    @return                     see :epkg:`*py:os:walk`
    """
    if neg_filter is None:
        yield from os.walk(top=top, onerror=onerror, followlinks=followlinks)
        return

    # the filter becomes a predicate on the folder name
    if isinstance(neg_filter, str):
        def excluded(name):
            "file pattern"
            return fnmatch.fnmatch(name, neg_filter)
    else:
        excluded = neg_filter

    for root, dirs, files in os.walk(top, onerror=onerror, followlinks=followlinks):
        # the list is modified in place: os.walk does not go
        # into the folders which are removed from it
        dirs[:] = [d for d in dirs if not excluded(d)]
        yield root, dirs, files

568 

569 

570def download_urls_iterfile(folder, pattern=None, neg_pattern=None, 

571 fullname=False, recursive=True): 

572 """ 

573 Same as @see fn explore_folder but iterates on files 

574 included in a folder and its subfolders. 

575 

576 :param folder: folder 

577 :param pattern: if None, get all files, otherwise, it is a regular expression, 

578 the filename must verify (with the folder is fullname is True) 

579 :param neg_pattern: negative pattern to exclude files 

580 :param fullname: if True, include the subfolder while checking the regex 

581 :param recursive: look into subfolders 

582 :return: iterator on files 

583 """