Coverage for src/ensae_teaching_cs/homeblog/table_formula.py: 77%

1570 statements  

coverage.py v7.1.0, created at 2023-04-28 06:23 +0200

1# -*- coding: utf8 -*- 

2""" 

3@file 

4@brief Implements TableFormula. 

5""" 

6import copy 

7import os 

8import sys 

9import datetime 

10import random 

11import math 

12import json 

13import numpy 

14import pandas 

15from xlwt import Formatting as EXf 

16from xlwt import Style as EXs 

17import xlwt as EXw 

18from xlsxwriter import workbook as EXxw 

19from xlrd import open_workbook 

20from pyquickhelper.loghelper import fLOG, noLOG 

21from pyquickhelper.loghelper.convert_helper import str2datetime 

22from pyensae.sql import Database 

23from .table_formula_stat import _TableFormulaStat 

24 

25 

26class TableFormula(_TableFormulaStat): 

27 """ 

28 This class aims at representing a table. It provides 

29 some "SQL like" functionalities such as groupby, innerjoin, select, where... 

30 This was a custom implementation of a DataFrame before I discovered 

31 `pandas <http://pandas.pydata.org/>`_. 

32 

33 The class provides an easy way to go through the rows of the table by converting each row 

34 into a dictionary ``{ column_name: value }`` on the fly. Example: 

35 

36 :: 

37 

38 tbl = TableFormula(...) 

39 newtbl = tbl.filter(lambda v: v["criteria"] == 5) 

40 

41 See @see op __init__ for other ways to create a table. 

42 

43 @var header list of column names 

44 @var values list of rows (each row contains as many values as the number of columns) 

45 @var index dictionary { column name: position }, changing ``header`` also means changing ``index``. 

46 

47 Example: 

48 

49 :: 

50 

51 table = TableFormula("name d_a d_b d_c#A 1 2 3#A 1.1 2.1 3.1#B 3 4 5".replace(" ", "\\t").replace("#","\\n")) 

52 print(table) 

53 print("one value 0,1:", table[0,1]) 

54 

55 print("---------") 

56 dist = table.get_distinct_values("name") 

57 for k in sorted(dist): print("*%d: %s"%(int(dist[k]),k)) 

58 

59 print("---------") 

60 table.add_column("has_A", lambda v: 1. if "A" in v["name"] else 0.) 

61 print(table) 

62 

63 x = 1./3 

64 print("------------- smoothing", x) 

65 table.add_column_smooth("has_A_smooth", lambda v: v["has_A"], [-1,0,1], [x,x,x]) 

66 print(table) 

67 

68 print("--------- filter") 

69 fil = table.filter(lambda v: v["d_b"] == 2) 

70 print(fil) 

71 

72 print("--------- random") 

73 rnd = table.random(5) 

74 print(rnd) 

75 

76 print("--------- random unique") 

77 rnd = table.random(1, True) 

78 print(rnd) 

79 

80 print("--------- filter quantile") 

81 fil = table.filter_quantile(lambda v: v["d_b"], 0, 0.4) 

82 print(fil) 

83 

84 print("--------- aggregate_column") 

85 total = table.aggregate(lambda v: v["d_c"]) 

86 print(total) 

87 

88 print("--------- sort") 

89 table.sort(lambda v: v["d_b"] + v["d_c"]) 

90 print(table) 

91 

92 print("--------- union") 

93 union = table.union(table) 

94 print(union) 

95 

96 print("--------- group sum") 

97 group = table.groupby(lambda v: v["name"], 

98 [lambda v: v["d_a"], 

99 lambda v: v["d_b"]], 

100 ["name", "sum_d_a", "sum_d_b"]) 

101 print(group) 

102 

103 print("--------- group max") 

104 groupmax = table.groupby(lambda v: v["name"], 

105 [lambda v: v["d_a"], 

106 lambda v: v["d_b"]], 

107 ["name", "max_d_a", "max_d_b"], 

108 [max, max]) 

109 print(groupmax) 

110 

111 print("--------- group sum with weights") 

112 group = table.groupby(lambda v: v["name"], 

113 [lambda v: v["d_a"]], 

114 ["name", "weight", "sum_d_a"], 

115 [lambda vec,w: sum(vec) / w], 

116 lambda v: v ["d_b"]) 

117 

118 print("--------- innerjoin") 

119 innerjoin = table.innerjoin(group, lambda v: v["name"], 

120 lambda v: v["name"], "group") 

121 print(innerjoin) 

122 

123 print("------------- extraction") 

124 ext = table.extract_columns(["name", "d_a"]) 

125 print(ext) 

126 

127 print("------------- remove") 

128 ext = table.remove_columns(["d_a"]) 

129 print(ext) 

130 

131 print("------------- todict") 

132 d = table.todict(lambda v: v["name"], lambda v: v["d_b"], True) 

133 print(d) 

134 

135 print("------------- select") 

136 d = table.select(lambda v:(v["name"], v["d_b"])) 

137 print(list(d)) 

138 

139 print("------------- use of an index") 

140 table.create_index(lambda v:(v["name"], v["d_a"])) 

141 row = table.get(('A', 1.1)) 

142 print(row) 

143 value = table.get(('A', 1.1), 2) 

144 print(value) 

145 

146 print("------------- multiply_column_by_row_instance ") 

147 table = TableFormula("name d_a d_b d_c#A 1 2 3#A 1.1 2.1 3.1#B 3 4 5".replace(" ", "\\t").replace("#","\\n")) 

148 table.add_column("key_add", lambda v:"unique") 

149 print(table) 

150 mul = table.multiply_column_by_row_instance( 

151 lambda v: v["key_add"], 

152 lambda v: v["name"]) 

153 print(mul) 

154 

155 if os.path.exists("BNP.PA.txt"): 

156 print("--------------- financial stock") 

157 table = TableFormula("BNP.PA.txt", sep=",") 

158 table.sort(lambda v: v["Date"]) 

159 print(table[:10]) 

160 

161 print("--------------- groupby_implicit") 

162 table = TableFormula("key_name sum_a len_b avg_c#A 1 2 3#A 1.1 2.1 3.1#B 3 4 5".replace(" ", "\\t").replace("#","\\n")) 

163 print(table) 

164 gr = table.groupby_implicit(lambda v: v ["key_name"]) 

165 print(gr) 

166 

167 print("--------------- covariance") 

168 values = [random.random() for i in range(0,100)] 

169 values = [[x, x + random.random()/2] for x in values] 

170 tbl = TableFormula(["x", "y"], values).values_to_float() 

171 cov = tbl.covariance() 

172 print(cov) 

173 

174 print("--------------- histogram") 

175 hist = tbl.histogram(lambda v:(v["x"],1), 10) 

176 print(hist) 

177 

178 print("--------------- histogram") 

179 hist = tbl.values_to_float().histograms(["x", "y"], 10) 

180 print(hist) 

181 

182 print("--------------- unions of columns") 

183 hist = tbl.values_to_float().union_columns(["x", "y"]) 

184 print(hist) 

185 """ 

186 

187 @staticmethod 

188 def add_header_if_not_present(filename, header, encoding=None, logFunction=noLOG): 

189 """ 

190 the function checks whether the first line contains the columns in header, 

191 otherwise it modifies the file and adds them on the first line 

192 

193 @param filename filename 

194 @param header list of column name(all strings) 

195 @param encoding encoding 

196 @param logFunction use this function to log information about what is happening 

197 """ 

198 if encoding is None: 

199 with open(filename, "r") as f: 

200 firstline = f.readline().strip("\n\r ") 

201 su = sum(map(lambda _: 1 if _ in header else 0, firstline.split("\t"))) 

202 if su < len(header) / 2.0: 

203 logFunction( # pragma: no cover 

204 "add_header_if_not_present: adding header({0}<{1}){2} to '{3}'\nfirstline:\n{4}".format( 

205 su, len(header) / 2, header, filename, firstline)) 

206 with open(filename, "r") as f: 

207 text = f.read() 

208 text = "\t".join(header) + "\n" + text 

209 logFunction( # pragma: no cover 

210 "add_header_if_not_present: writing") 

211 with open(filename, "w") as f: 

212 f.write(text) 

213 logFunction( # pragma: no cover 

214 "add_header_if_not_present: complete") 

215 else: 

216 with open(filename, "r", encoding=encoding) as f: 

217 firstline = f.readline().strip("\n\r ") 

218 su = sum(map(lambda _: 1 if _ in header else 0, firstline.split("\t"))) 

219 if su < len(header) / 2.0: 

220 logFunction( # pragma: no cover 

221 "add_header_if_not_present: adding header({0}<{1}){2} to '{3}'\nfirstline:\n{4}".format( 

222 su, len(header) / 2, header, filename, firstline)) 

223 with open(filename, "r", encoding=encoding) as f: 

224 text = f.read() 

225 text = "\t".join(header) + "\n" + text 

226 logFunction( # pragma: no cover 

227 "add_header_if_not_present: writing") 

228 with open(filename, "w", encoding=encoding) as f: 

229 f.write(text) 

230 logFunction( # pragma: no cover 

231 "add_header_if_not_present: complete") 

232 
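# --- illustrative sketch (not part of the module): using add_header_if_not_present
# on a headerless tab-separated file; the file name "no_header.txt" and the import
# path are assumptions based on this file's location in the package.
from ensae_teaching_cs.homeblog.table_formula import TableFormula

with open("no_header.txt", "w") as f:
    f.write("A\t1\t2\nB\t3\t4\n")
TableFormula.add_header_if_not_present("no_header.txt", ["name", "d_a", "d_b"])
with open("no_header.txt", "r") as f:
    print(f.readline().strip())  # name, d_a, d_b separated by tabs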

233 @staticmethod 

234 def random_split_file(filename, outfileprefix, nb, has_header=True, encoding=None, logFunction=noLOG): 

235 """ 

236 splits a file into nb buckets at random (lines are sent to a random file as they come) 

237 

238 @param filename filename to split 

239 @param nb number of buckets 

240 @param outfileprefix output files will start with outfileprefix + '.%04d.txt' % i 

241 @param encoding encoding 

242 @param has_header the header will be replicated in each created file 

243 @param logFunction to display information 

244 @return list of created files 

245 """ 

246 firstline = None 

247 if has_header: 

248 if encoding is None: 

249 with open(filename, "r") as f: 

250 firstline = f.readline().strip("\n\r ") 

251 else: # pragma: no cover 

252 with open(filename, "r", encoding=encoding) as f: 

253 firstline = f.readline().strip("\n\r ") 

254 

255 logFunction( # pragma: no cover 

256 f"random_split_file: file {filename} has header {firstline}") 

257 

258 logFunction("random_split_file: split %s in %d parts" % (filename, nb)) 

259 fileName = [outfileprefix + (".%04d.txt" % n) for n in range(0, nb)] 

260 nbline = 0 

261 

262 if encoding is None: 

263 filesP = [open(_, "w") for _ in fileName] 

264 if firstline is not None: 

265 for _ in filesP: 

266 _.write(firstline + "\n") 

267 with open(filename, "r") as f: 

268 line = f.readline() 

269 if firstline is not None: 

270 line = f.readline() 

271 while line is not None and len(line) > 0: 

272 h = random.randint(0, nb - 1) 

273 filesP[h].write(line) 

274 line = f.readline() 

275 nbline += 1 

276 if nbline % 1000000 == 0: 

277 logFunction( # pragma: no cover 

278 "random_split_file: processed %d lines" % nbline) 

279 else: 

280 filesP = [open(_, "w", encoding=encoding) for _ in fileName] 

281 if firstline is not None: 

282 for _ in filesP: 

283 _.write(firstline + "\n") 

284 with open(filename, "r", encoding=encoding) as f: 

285 line = f.readline() 

286 if firstline is not None: 

287 line = f.readline() 

288 while line is not None and len(line) > 0: 

289 h = random.randint(0, nb - 1) 

290 filesP[h].write(line) 

291 line = f.readline() 

292 nbline += 1 

293 if nbline % 1000000 == 0: 

294 logFunction( # pragma: no cover 

295 "random_split_file: processed %d lines" % nbline) 

296 

297 for _ in filesP: 

298 _.close() 

299 logFunction("random_split_file: end") 

300 return fileName 

301 
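# --- illustrative sketch (not part of the module): splitting a file with a header
# into three random buckets; output names follow the '.%04d.txt' pattern used above.
# The file names are only for illustration.
from ensae_teaching_cs.homeblog.table_formula import TableFormula

with open("big.txt", "w") as f:
    f.write("name\td_a\n" + "".join("A\t%d\n" % i for i in range(100)))
parts = TableFormula.random_split_file("big.txt", "bucket", 3, has_header=True)
print(parts)  # ['bucket.0000.txt', 'bucket.0001.txt', 'bucket.0002.txt']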

302 @staticmethod 

303 def ratio(x, y): 

304 """ 

305 returns the ratio between two real values or an empty string if the denominator is null 

306 @return a real or an empty string 

307 """ 

308 return x * 1.0 / y if y != 0 else (0 if x == 0 else "") 

309 
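# --- illustrative sketch (not part of the module): the three branches of ratio.
from ensae_teaching_cs.homeblog.table_formula import TableFormula

print(TableFormula.ratio(3.0, 4.0))  # 0.75
print(TableFormula.ratio(0.0, 0.0))  # 0, numerator and denominator are both null
print(TableFormula.ratio(3.0, 0.0))  # "", denominator is null but not the numerator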

310 @staticmethod 

311 def bootstrap(values, function, nbdraws=-1, alpha=0.05): 

312 """ 

313 returns a confidence interval for a statistic 

314 @param values values 

315 @param function produces the statistics over a random set of observations chosen in values 

316 @param nbdraws number of draws, if it is equal to -1, it is set to len(values) 

317 @param alpha confidence level 

318 @return average, min, lower bound, higher bound, max 

319 """ 

320 stat = [] 

321 N = len(values) - 1 

322 if nbdraws == - 1: 

323 nbdraws = len(values) 

324 for i in range(nbdraws): 

325 randset = [values[random.randint(0, N)] for i in range(N + 1)] 

326 s = function(randset) 

327 stat.append(s) 

328 stat.sort() 

329 lv = len(stat) 

330 alpha = alpha / 2 

331 i1 = int(lv * alpha + 0.5) 

332 i2 = int(lv * (1 - alpha) + 0.5) 

333 i2 = min(i2, len(stat) - 1) 

334 av = sum(stat) / len(stat) 

335 return av, min(stat), stat[i1], stat[i2], max(stat) 

336 
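# --- illustrative sketch (not part of the module): a bootstrap confidence
# interval for the mean of a sample; the data below are made up for the example.
import random
from ensae_teaching_cs.homeblog.table_formula import TableFormula

random.seed(0)
obs = [random.gauss(0.0, 1.0) for _ in range(200)]

def mean(vec):
    return sum(vec) / len(vec)

avg, mn, low, high, mx = TableFormula.bootstrap(obs, mean, nbdraws=500, alpha=0.05)
print(avg, low, high)  # bootstrap average and the 95% bounds around the mean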

337 @staticmethod 

338 def correlation_bicolumn(values, deviations=False, noCenter=False): 

339 """ 

340 assumes values is a matrix with two columns 

341 @param values 2 column matrix 

342 @param deviations if True, returns cor, sigma1, sigma2 

343 @param noCenter if True, do not remove the average before computing the covariance, 

344 it means we assume variables are already centered 

345 @return correlation factor or correlation, sigma1, sigma2 if deviations is True 

346 """ 

347 if len(values) <= 1: 

348 raise ValueError( # pragma: no cover 

349 f"expecting more than one observation, not {len(values)}") 

350 

351 mx = 0. 

352 my = 0. 

353 vx = 0. 

354 vy = 0. 

355 co = 0. 

356 nb = 0. 

357 for a, b in values: 

358 nb += 1 

359 mx += a 

360 my += b 

361 vx += a ** 2 

362 vy += b ** 2 

363 co += a * b 

364 mx /= nb 

365 my /= nb 

366 vx /= nb 

367 vy /= nb 

368 co /= nb 

369 if not noCenter: 

370 vx -= mx ** 2 

371 vy -= my ** 2 

372 co -= mx * my 

373 vx = vx ** 0.5 

374 vy = vy ** 0.5 

375 v = vx * vy 

376 if v != 0: 

377 co /= v 

378 if deviations: 

379 return co, vx, vy # pragma: no cover 

380 return co 

381 
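# --- illustrative sketch (not part of the module): correlation of two noisy,
# linearly related series; the coefficient should be close to 1.
import random
from ensae_teaching_cs.homeblog.table_formula import TableFormula

random.seed(1)
pairs = [(x, 2.0 * x + random.gauss(0.0, 0.1))
         for x in (random.random() for _ in range(100))]
print(TableFormula.correlation_bicolumn(pairs))
print(TableFormula.correlation_bicolumn(pairs, deviations=True))  # cor, sigma1, sigma2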

382 def _private_getclass(self): 

383 """ 

384 the class often creates another instance of the same type, 

385 this function returns the class object 

386 """ 

387 return self.__class__ 

388 

389 def __init__(self, file, numeric_column=None, sep="\t", encoding=None, 

390 read_n_lines=-1, sheet=0, **options): 

391 """ 

392 It can take a filename, a TableFormula object, 

393 or a list of columns and values. 

394 

395 :param file: filename or a list of column names or a dictionary, 

396 file can also be a `pandas DataFrame 

397 <http://pandas.pydata.org/pandas-docs/dev/dsintro.html#dataframe>`_. 

398 :param numeric_column: depends on file types(see below examples) 

399 :param sep: column separator if file is a filename 

400 :param read_n_lines: read the first n lines(or all if it is -1) 

401 :param sheet: in case the file is an Excel file, this parameter precises the sheet number or name 

402 :param suffix_nb: if True, adds an integer to the column name if it is a duplicate 

403 

404 Example: 

405 

406 :: 

407 

408 table = TableFormula("name d_a d_b d_c#A 1 2 3#A 1.1 2.1 3.1#B 3 4 5".replace(" ", "\\t").replace("#","\\n")) 

409 

410 or 

411 

412 :: 

413 

414 table = TableFormula("file.txt", ["nb"]) 

415 

416 or 

417 

418 :: 

419 

420 table = TableFormula(["date", "Y", "Y2", "xl"], values) 

421 

422 or 

423 

424 :: 

425 

426 data = [{ "one":1, "two":2 }, {"two":2.1, "three":3 }] 

427 table = TableFormula(data) 

428 

429 or 

430 

431 :: 

432 

433 data = { 1:{ "one":2.3, "two":2.2 }, 2:{"one":2.1, "two":3 } } 

434 table = TableFormula("__byrow__", data) 

435 

436 or 

437 

438 :: 

439 

440 table = TableFormula(numpy.matrix(...)) 

441 

442 or 

443 

444 :: 

445 

446 table = TableFormula(numpy.array(...)) 

447 

448 

449 @warning In this second case, rows and header are not copied. 

450 """ 

451 if numeric_column is None: 

452 numeric_column = [] 

453 if isinstance(file, str): 

454 if os.path.exists(file): 

455 self._read_file(file, numeric_column, sep, 

456 encoding, read_n_lines, sheet=sheet) 

457 elif file == "__byrow__" and isinstance(numeric_column, dict): 

458 self._fill_by_row(numeric_column) 

459 else: 

460 lines = file.split("\n") 

461 if len(lines) == 1: 

462 raise FileNotFoundError( # pragma: no cover 

463 f"A file was probably expected but was not found: '{file}'.") 

464 self._readlines(lines, numeric_column, sep) 

465 

466 elif isinstance(file, list): 

467 if len(file) == 0: 

468 raise ValueError( # pragma: no cover 

469 "Empty data and columns are not allowed.") 

470 

471 if isinstance(file[0], dict): 

472 self.index = {} 

473 self.values = [] 

474 for row in file: 

475 for k, v in row.items(): 

476 if k not in self.index: 

477 self.index[k] = len(self.index) 

478 

479 # we sort the labels to avoid instabilities 

480 labels = [k for k, v in self.index.items()] 

481 labels.sort() 

482 self.index = {} 

483 for la in labels: 

484 self.index[la] = len(self.index) 

485 

486 for row in file: 

487 line = [None for k in self.index] 

488 for k, v in row.items(): 

489 line[self.index[k]] = v 

490 self.values.append(line) 

491 

492 self.header = [None for k in self.index] 

493 for k, v in self.index.items(): 

494 self.header[v] = k 

495 

496 n = len(self.index) 

497 for row in self.values: 

498 while len(row) < n: 

499 row.append(None) 

500 

501 elif isinstance(numeric_column, numpy.matrix): 

502 self.header = file 

503 self.index = {} 

504 for i, h in enumerate(self.header): 

505 self.index[h] = i 

506 self.values = [[float(numeric_column[i, j]) for j in range( 

507 numeric_column.shape[1])] for i in range(numeric_column.shape[0])] 

508 elif isinstance(numeric_column, numpy.ndarray): 

509 self.header = file 

510 self.index = {} 

511 for i, h in enumerate(self.header): 

512 self.index[h] = i 

513 self.values = [[float(numeric_column[i, j]) for j in range( 

514 numeric_column.shape[1])] for i in range(numeric_column.shape[0])] 

515 elif isinstance(file[0], list): 

516 if len(file) == 1: 

517 self.header = file[0] 

518 self.values = file[1:] + numeric_column 

519 self.index = {} 

520 for i, h in enumerate(self.header): 

521 self.index[h] = i 

522 else: 

523 self.header = file[0] 

524 self.values = file[1:] 

525 self.index = {} 

526 for i, h in enumerate(self.header): 

527 self.index[h] = i 

528 elif isinstance(file[0], str): 

529 self.header = file 

530 self.values = numeric_column 

531 self.index = {} 

532 for i, h in enumerate(self.header): 

533 self.index[h] = i 

534 else: 

535 raise RuntimeError( # pragma: no cover 

536 "This case should not happen: " + str(type(file[0]))) 

537 

538 elif isinstance(file, numpy.matrix): # pragma: no cover 

539 self.header = ["c%d" % d for d in range(file.shape[1])] 

540 self.index = {} 

541 for i, h in enumerate(self.header): 

542 self.index[h] = i 

543 self.values = [[float(file[i, j]) for j in range( 

544 file.shape[1])] for i in range(file.shape[0])] 

545 

546 elif isinstance(file, numpy.ndarray): 

547 self.header = ["c%d" % d for d in range(file.shape[1])] 

548 self.index = {} 

549 for i, h in enumerate(self.header): 

550 self.index[h] = i 

551 self.values = [[float(file[i, j]) for j in range( 

552 file.shape[1])] for i in range(file.shape[0])] 

553 

554 else: 

555 if isinstance(file, pandas.DataFrame): 

556 def convert(x): 

557 return None if isinstance(x, float) and numpy.isnan(x) else x 

558 df = file 

559 self.header = [_ for _ in df.columns] 

560 hi = 'index' 

561 while hi in self.header: 

562 hi += "_" 

563 self.header.insert(0, hi) 

564 self.values = [] 

565 for i, row in enumerate(df.values): 

566 row = [df.index[i]] + [convert(x) for x in row] 

567 self.values.append(row) 

568 

569 self.index = {} 

570 for i, h in enumerate(self.header): 

571 self.index[h] = i 

572 else: 

573 raise TypeError( # pragma: no cover 

574 "File has an unexpected type: " + str(type(file))) 

575 

576 unique = {} 

577 for i, c in enumerate(self.header): 

578 if c in unique: 

579 if options.get("suffix_nb", False): 

580 c = "%s_%d" % (c, i) 

581 self.header[i] = c 

582 else: 

583 raise KeyError( # pragma: no cover 

584 f"column '{c}' already exists in '{self.header}'") 

585 unique[c] = True 

586 
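# --- illustrative sketch (not part of the module): a few of the construction
# paths documented above; the import path is an assumption based on this file's
# location in the package.
from ensae_teaching_cs.homeblog.table_formula import TableFormula

# from a tab-separated string (first line is the header)
tbl = TableFormula("name\td_a\nA\t1\nB\t3")
# from a header and a list of rows
tbl2 = TableFormula(["x", "y"], [[0.0, 1.0], [1.0, 2.0]])
# from a list of dictionaries, missing keys are filled with None
tbl3 = TableFormula([{"one": 1, "two": 2}, {"two": 2.1, "three": 3}])
print(tbl2.shape)  # (2, 2)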

587 def __add__(self, other): 

588 """ 

589 does an addition, adds values if types match 

590 :param other: matrix or float or string 

591 :return: new matrix, keep the header of the first matrix 

592 """ 

593 if len(self) != len(other): 

594 raise ValueError( # pragma: no cover 

595 "both matrices should have the same number of rows") 

596 if len(self.header) != len(other.header): 

597 raise ValueError( # pragma: no cover 

598 "both matrices should have the same number of columns") 

599 values = [] 

600 for row, rowo in zip(self.values, other.values): 

601 r = [] 

602 for a, b in zip(row, rowo): 

603 if type(a) == type(b): 

604 x = a + b 

605 else: 

606 x = None 

607 r.append(x) 

608 values.append(r) 

609 return self._private_getclass()(self.header, values) 

610 

611 def __mul__(self, other): 

612 """ 

613 does a multiplication (by a number) 

614 :param other: matrix or float or string 

615 :return: new matrix, keep the header of the first matrix 

616 """ 

617 if not isinstance(other, float) and not isinstance(other, int): 

618 raise TypeError( # pragma: no cover 

619 "other should be a number") 

620 values = [] 

621 for row in self.values: 

622 r = [] 

623 for a in row: 

624 if a is not None: 

625 x = a * other 

626 else: 

627 x = None 

628 r.append(x) 

629 values.append(r) 

630 return self._private_getclass()(self.header, values) 

631 

632 def multiplication_term_term(self, other): 

633 """ 

634 does a multiplication term by term (similar to an addition), 

635 multiplies values if types match 

636 

637 :param other: matrix or float or string 

638 :return: new matrix, keep the header of the first matrix 

639 """ 

640 if len(self) != len(other): 

641 raise ValueError( # pragma: no cover 

642 "both matrices should have the same number of rows") 

643 if len(self.header) != len(other.header): 

644 raise ValueError( # pragma: no cover 

645 "both matrices should have the same number of columns") 

646 values = [] 

647 for row, rowo in zip(self.values, other.values): 

648 r = [] 

649 for a, b in zip(row, rowo): 

650 if type(a) == type(b) and not isinstance(a, str): 

651 x = a * b 

652 else: 

653 x = None 

654 r.append(x) 

655 values.append(r) 

656 return self._private_getclass()(self.header, values) 

657 
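# --- illustrative sketch (not part of the module): cell-wise arithmetic on two
# tables with the same shape and scaling by a number.
from ensae_teaching_cs.homeblog.table_formula import TableFormula

a = TableFormula(["x", "y"], [[1.0, 2.0], [3.0, 4.0]])
b = TableFormula(["x", "y"], [[10.0, 20.0], [30.0, 40.0]])
print(a + b)                          # cell-wise sums
print(a * 2)                          # every cell multiplied by 2
print(a.multiplication_term_term(b))  # cell-wise products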

658 def replicate(self, times): 

659 """replicates all rows a given number of times 

660 :param times: number of replications 

661 :return: new matrix, keep the header of the first matrix 

662 """ 

663 values = [] 

664 for i in range(0, times): 

665 values.extend(copy.copy(self.values)) 

666 return self._private_getclass()(self.header, values) 

667 

668 @property 

669 def size(self): 

670 """ 

671 returns the size (nb rows, nb columns) 

672 """ 

673 return len(self), len(self.header) 

674 

675 @property 

676 def shape(self): 

677 """ 

678 returns the size (nb rows, nb columns) 

679 """ 

680 return self.size 

681 

682 def _fill_by_row(self, values): 

683 """ 

684 fill the table 

685 :param values: dictionary { <int_row_index>: { <column name>: value} } 

686 """ 

687 mx = max(values.keys()) + 1 

688 self.index = {} 

689 self.header = [] 

690 for k, v in values.items(): 

691 for col in v: 

692 if col not in self.index: 

693 self.index[col] = len(self.index) 

694 self.header.append(col) 

695 self.values = [[None for h in self.header] for k in range(mx)] 

696 for k, v in values.items(): 

697 for col, to in v.items(): 

698 self.values[k][self.index[col]] = to 

699 

700 def __getitem__(self, irow): 

701 """ 

702 operator [], accepts slices 

703 :param irow: integer, tuple, slice or list 

704 :return: depends on irow 

705 - int --> a table with one row 

706 - slice --> a table with several rows 

707 - list --> a table with the selected rows 

708 - tuple --> a value 

709 """ 

710 if isinstance(irow, int): 

711 return self._private_getclass()( 

712 self.header, [self.values[irow]]) 

713 if isinstance(irow, slice): 

714 return self._private_getclass()( 

715 self.header, [self.values[ii] for ii in range(*irow.indices(len(self)))]) 

716 if isinstance(irow, list): 

717 return self._private_getclass()( 

718 self.header, [self.values[ii] for ii in irow]) 

719 if isinstance(irow, tuple): 

720 if isinstance(irow[1], str): 

721 row = self.values[irow[0]] 

722 v = self._interpret_row(row) 

723 return v[irow[1]] 

724 return self.values[irow[0]][irow[1]] 

725 raise TypeError("Invalid argument type: " + str(type(irow))) 

726 
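# --- illustrative sketch (not part of the module): the four kinds of indices
# accepted by the [] operator.
from ensae_teaching_cs.homeblog.table_formula import TableFormula

tbl = TableFormula(["name", "d_a"], [["A", 1.0], ["B", 2.0], ["C", 3.0]])
print(tbl[0])         # table with a single row
print(tbl[0:2])       # table with the first two rows
print(tbl[[0, 2]])    # table with rows 0 and 2
print(tbl[1, "d_a"])  # 2.0, value addressed by (row index, column name)
print(tbl[1, 1])      # 2.0, same value addressed by (row index, column index)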

727 def __setitem__(self, irow, value): 

728 """ 

729 operator [], just accepts tuple(to change a value) 

730 :param irow: 2-uple 

731 :param value: new value 

732 """ 

733 if isinstance(irow, tuple): 

734 if isinstance(irow[1], str): 

735 row = self.values[irow[0]] 

736 v = self._interpret_row(row) 

737 v[irow[1]] = value 

738 else: 

739 self.values[irow[0]][irow[1]] = value 

740 else: 

741 raise TypeError( # pragma: no cover 

742 "Invalid argument type(only tuple accepted): " + str(type(irow))) 

743 

744 def __len__(self): 

745 """ 

746 returns the number of rows 

747 """ 

748 return len(self.values) 

749 

750 def __copy__(self): 

751 """ 

752 operator copy 

753 """ 

754 return self._private_getclass()(self.header, self.values) 

755 

756 def __deepcopy__(self, memo): 

757 """ 

758 operator ``deepcopy`` 

759 """ 

760 return self._private_getclass()(copy.deepcopy(self.header, memo), copy.deepcopy(self.values, memo)) 

761 

762 def copy(self): 

763 """ 

764 call ``copy.deepcopy(self)`` 

765 """ 

766 return copy.deepcopy(self) 

767 

768 def delta(self, other): 

769 """ 

770 returns a list of differences between self and other 

771 

772 :param other: TableFormula 

773 :return: list of differences (only the first difference found) 

774 """ 

775 if other is None: 

776 return False 

777 if not isinstance(other, TableFormula): 

778 raise TypeError("other is not a table: " + str(type(other))) 

779 if len(self.header) != len(other.header): 

780 return ["different number of columns"] 

781 for a, b in zip(self.header, other.header): 

782 if a != b: 

783 return ["different columns"] 

784 if len(self.values) != len(other.values): 

785 return ["different number of rows"] 

786 line = 0 

787 for r, s in zip(self.values, other.values): 

788 if len(r) != len(s): 

789 return ["different number of values on row %d" % line] 

790 col = 0 

791 for a, b in zip(r, s): 

792 if a != b: 

793 return ["different value on cell %d,%d: %s!=%s(type %s, %s)" % (line, col, a, b, str(type(a)), str(type(b)))] 

794 col += 1 

795 line += 1 

796 return [] 

797 

798 def __eq__(self, other): 

799 """ 

800 check if two tables are equal by value 

801 :param other: other table 

802 :return: boolean 

803 """ 

804 if other is None: 

805 return False 

806 if not isinstance(other, TableFormula): 

807 return False 

808 if len(self.header) != len(other.header): 

809 return False 

810 for a, b in zip(self.header, other.header): 

811 if a != b: 

812 return False 

813 if len(self.values) != len(other.values): 

814 return False 

815 for r, s in zip(self.values, other.values): 

816 if len(r) != len(s): 

817 return False 

818 for a, b in zip(r, s): 

819 if a != b: 

820 return False 

821 return True 

822 

823 def __str__(self): 

824 """ 

825 convert the table into a string 

826 :return: string 

827 """ 

828 rows = ["\t".join(self.header)] 

829 for row in self.values: 

830 s = "\t".join([str(_) for _ in row]) 

831 rows.append(s) 

832 return "\n".join(rows) 

833 

834 def __html__(self, class_table=None, class_td=None, class_tr=None, class_th=None): 

835 """ 

836 Converts the table into a :epkg:`html` string. 

837 

838 :param class_table: adds a class to the tag ``table`` (None for none) 

839 :param class_td: adds a class to the tag ``td`` (None for none) 

840 :param class_tr: adds a class to the tag ``tr`` (None for none) 

841 :param class_th: adds a class to the tag ``th`` (None for none) 

842 """ 

843 clta = f' class="{class_table}"' if class_table is not None else "" 

844 cltr = f' class="{class_tr}"' if class_tr is not None else "" 

845 cltd = f' class="{class_td}"' if class_td is not None else "" 

846 clth = f' class="{class_th}"' if class_th is not None else "" 

847 

848 rows = [f"<table{clta}>"] 

849 rows.append("{0}{1}{2}".format(("<tr%s><th%s>" % (cltr, clth)), 

850 ("</th><th%s>" % clth).join(self.header), "</th></tr>")) 

851 septd = f"</td><td{cltd}>" 

852 strtd = f"<tr{cltr}><td{cltd}>" 

853 for row in self.values: 

854 s = septd.join([str(_) for _ in row]) 

855 rows.append(strtd + s + "</td></tr>") 

856 rows.append("</table>") 

857 rows.append("") 

858 return "\n".join(rows) 

859 

860 def __rst__(self, add_line=True): 

861 """ 

862 convert the table into rst format 

863 

864 :: 

865 

866 +------------------------+------------+----------+----------+ 

867 | Header row, column 1 | Header 2 | Header 3 | Header 4 | 

868 | (header rows optional) | | | | 

869 +========================+============+==========+==========+ 

870 | body row 1, column 1 | column 2 | column 3 | column 4 | 

871 +------------------------+------------+----------+----------+ 

872 | body row 2 | ... | ... | | 

873 +------------------------+------------+----------+----------+ 

874 

875 :param add_line: add a line separator between each row 

876 """ 

877 tbl = self.values_to_str() 

878 length = [len(_) for _ in tbl.header] 

879 for row in tbl.values: 

880 for i, v in enumerate(row): 

881 length[i] = max(length[i], len(v)) 

882 length = [_ + 2 for _ in length] 

883 line = ["-" * le for le in length] 

884 lineb = ["=" * le for le in length] 

885 sline = f"+{'+'.join(line)}+" 

886 slineb = f"+{'+'.join(lineb)}+" 

887 res = [sline] 

888 

889 def complete(cool): 

890 s, i = cool 

891 i -= 2 

892 if len(s) < i: 

893 s += " " * (i - len(s)) 

894 return s 

895 

896 res.append(f"| {' | '.join(map(complete, zip(tbl.header, length)))} |") 

897 res.append(slineb) 

898 res.extend([f"| {' | '.join(map(complete, zip(row, length)))} |" 

899 for row in tbl.values]) 

900 if add_line: 

901 t = len(res) 

902 for i in range(t - 1, 3, -1): 

903 res.insert(i, sline) 

904 res.append(sline) 

905 return "\n".join(res) + "\n" 

906 

907 def strtype(self): 

908 """ 

909 displays the type of values(not the values) 

910 """ 

911 rows = ["\t".join(self.header)] 

912 for row in self.values: 

913 s = "\t".join([str(type(_)) for _ in row]) 

914 rows.append(s) 

915 return "\n".join(rows) 

916 

917 def _read_file(self, file, numeric_column, sep, encoding, read_n_lines, sheet=0): 

918 """ 

919 private 

920 """ 

921 ext = os.path.splitext(file)[-1].lower() 

922 if ext in [".xls", ".xlsx"]: 

923 lines = list(open_workbook(file, sheet=sheet)) 

924 # removing empty column(assuming first row is the header) 

925 ind = [i for i, n in enumerate(lines[0]) if len(n) > 0] 

926 if len(ind) < len(lines[0]): 

927 lines = [[line[i] for i in ind] for line in lines] 

928 else: 

929 if sys.version_info.major >= 3 or encoding is None: 

930 if encoding is None: 

931 f = open(file, "r") 

932 else: 

933 f = open(file, "r", encoding=encoding) 

934 else: 

935 f = open(file, "r", encoding=encoding) 

936 

937 if read_n_lines > 0: 

938 lines = [] 

939 for line in f: 

940 if len(lines) >= read_n_lines: 

941 break 

942 lines.append(line) 

943 else: 

944 lines = f.readlines() 

945 f.close() 

946 self._readlines(lines, numeric_column, sep) 

947 

948 def change_header(self, new_header): 

949 """ 

950 change the column names 

951 

952 :param new_header: a list or a function which modifies the header 

953 

954 Example: 

955 

956 :: 

957 

958 tbl.change_header(lambda h: h if h != "column" else "new_name") 

959 

960 .. warning:: Do not do that yourself, the class holds a dictionary up to date with the column index. 

961 """ 

962 if isinstance(new_header, list): 

963 self.header = new_header 

964 self.index = {} 

965 for i, h in enumerate(self.header): 

966 self.index[h] = i 

967 else: 

968 he = [new_header(h) for h in self.header] 

969 self.change_header(he) 

970 

971 def rename_column(self, old_name, new_name): 

972 """ 

973 rename a column 

974 

975 :param old_name: old name 

976 :param new_name: new name 

977 """ 

978 header = [{old_name: new_name}.get(_, _) for _ in self.header] 

979 self.change_header(header) 

980 

981 def save(self, filename, sep="\t", encoding=None, newline="\n"): 

982 """ 

983 saves the table in a text file, the first row contains the column names 

984 

985 :param filename: filename 

986 :param sep: column separator 

987 :param encoding: encoding 

988 :param newline: line separator 

989 """ 

990 if sys.version_info.major >= 3 or encoding is None: 

991 if encoding is None: 

992 f = open(filename, "w", newline=newline) 

993 else: 

994 f = open(filename, "w", encoding=encoding, newline=newline) 

995 else: 

996 f = open(filename, "w", encoding=encoding) 

997 

998 f.write(sep.join(self.header)) 

999 f.write("\n") 

1000 for row in self.values: 

1001 f.write(sep.join([str(_) for _ in row])) 

1002 f.write("\n") 

1003 f.close() 

1004 
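# --- illustrative sketch (not part of the module): saving a table and loading
# it back; the file name "saved.txt" is only for illustration.
from ensae_teaching_cs.homeblog.table_formula import TableFormula

tbl = TableFormula(["name", "d_a"], [["A", 1.0], ["B", 2.0]])
tbl.save("saved.txt", sep="\t")
tbl2 = TableFormula("saved.txt")   # d_a is converted back to float (prefix "d_")
print(tbl2)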

1005 def _readlines(self, lines, numeric_column, sep): 

1006 """private""" 

1007 if isinstance(lines[0], str): 

1008 lines = [_.replace("\ufeff", "").replace("\xef\xbb\xbf", "") 

1009 .strip("\n\r ").split(sep) for _ in lines if len(_) > 0] 

1010 self.header = lines[0] 

1011 self.values = lines[1:] 

1012 self.index = {} 

1013 for i, h in enumerate(self.header): 

1014 self.index[h] = i 

1015 elif isinstance(lines[0], list): 

1016 self.header = lines[0] 

1017 self.values = lines[1:] 

1018 self.index = {} 

1019 for i, h in enumerate(self.header): 

1020 self.index[h] = i 

1021 else: 

1022 raise RuntimeError("unexpected format: " + str(type(lines[0]))) 

1023 

1024 self._auto_conversion(numeric_column) 

1025 

1026 def _auto_conversion(self, others_columns): 

1027 """ 

1028 private 

1029 set up the column type based on the column name 

1030 """ 

1031 def condition(k): 

1032 if k.startswith("sum_") or k.startswith("pos_"): 

1033 return True 

1034 if k.startswith("avg_") or k.startswith("len_"): 

1035 return True 

1036 if k.startswith("nb_") or k.startswith("max_") or k.startswith("min_"): 

1037 return True 

1038 if k.startswith("d_") or k in others_columns: 

1039 return True 

1040 if k in ["Open", "High", "Low", "Close", "Volume", "Adj Close"]: 

1041 return True 

1042 if k in ["distance", "nb"]: 

1043 return True 

1044 return False 

1045 

1046 for i, k in enumerate(self.header): 

1047 if k == "Date": 

1048 for row in self.values: 

1049 if isinstance(row[i], str): 

1050 row[i] = datetime.datetime.strptime(row[i], '%Y-%m-%d') 

1051 elif isinstance(row[i], float): 

1052 row[i] = datetime.datetime.utcfromtimestamp(row[i]) 

1053 else: 

1054 raise RuntimeError( 

1055 f"unable to extract a date from type {type(row[i])}") 

1056 elif condition(k): 

1057 for row in self.values: 

1058 row[i] = float(row[i]) 

1059 else: 

1060 for row in self.values: 

1061 if isinstance(row[i], str) and row[i] == "None": 

1062 row[i] = None 

1063 

1064 def get_column_values(self, col): 

1065 """ 

1066 private 

1067 returns all values for one column 

1068 """ 

1069 i = self.index[col] 

1070 return [row[i] for row in self.values] 

1071 

1072 def get_distinct_values(self, col): 

1073 """private""" 

1074 row = self.get_column_values(col) 

1075 dis = {} 

1076 for r in row: 

1077 dis[r] = dis.get(r, 0) + 1 

1078 return dis 

1079 

1080 def _interpret_row(self, row): 

1081 """ 

1082 private 

1083 returns each row as a dictionary { column_name:value } 

1084 """ 

1085 values = {} 

1086 for a, b in zip(self.header, row): 

1087 values[a] = b 

1088 return values 

1089 

1090 def __iter__(self): 

1091 """ 

1092 iterator on all rows, it returns a dictionary { column:value } 

1093 @return dictionary 

1094 """ 

1095 for row in self.values: 

1096 yield self._interpret_row(row) 

1097 

1098 def add_column(self, colname, function, position=-1): 

1099 """ 

1100 Adds a column. 

1101 :param colname: column name, or column names if it is a list or a tuple 

1102 :param function: function which gives the values (or a list of functions, or a function which returns a tuple) 

1103 :param position: where to insert the column, -1 for the end 

1104 

1105 Example: 

1106 

1107 :: 

1108 

1109 table.add_column("has_A", lambda v: 1 if "A" in v["name"] else 0, 0) 

1110 

1111 table.add_column(("has_A", "has_B"),(lambda v: 1 if "A" in v["name"] else 0, 

1112 lambda v: 1 if "B" in v["name"] else 0)) 

1113 

1114 table.add_column(("has_A", "has_B"), lambda v: (1 if "A" in v["name"] else 0, 1 if "B" in v["name"] else 0)) 

1115 """ 

1116 if isinstance(colname, str): 

1117 if position == -1: 

1118 self.index[colname] = len(self.index) 

1119 for row in self.values: 

1120 v = self._interpret_row(row) 

1121 x = function(v) 

1122 row.append(x) 

1123 self.header.append(colname) 

1124 else: 

1125 for row in self.values: 

1126 v = self._interpret_row(row) 

1127 x = function(v) 

1128 row.insert(position, x) 

1129 self.header.insert(position, colname) 

1130 self.index = {v: i for i, v in enumerate(self.header)} 

1131 

1132 elif isinstance(function, list): 

1133 if len(colname) != len(function): 

1134 raise ValueError( # pragma: no cover 

1135 "unable to continue, colname and function do not have the same number of elements") 

1136 if position == -1: 

1137 position = [-1] * len(colname) 

1138 elif isinstance(position, int): 

1139 position = [position] * len(colname) 

1140 else: 

1141 if len(position) != len(colname): 

1142 raise RuntimeError( # pragma: no cover 

1143 "Unable to continue, colname and position do not " 

1144 "have the same number of elements.") 

1145 dec = 0 

1146 for a, b, c in zip(colname, function, position): 

1147 self.add_column(a, b, c + dec) 

1148 dec += 1 

1149 

1150 else: 

1151 # we assume here, the function returns a tuple 

1152 if not isinstance(position, int): 

1153 raise TypeError( # pragma: no cover 

1154 "Int expected for position for this case.") 

1155 

1156 if position == -1: 

1157 for row in self.values: 

1158 v = self._interpret_row(row) 

1159 x = function(v) 

1160 row.extend(x) 

1161 self.header.extend(colname) 

1162 

1163 else: 

1164 for row in self.values: 

1165 v = self._interpret_row(row) 

1166 x = function(v) 

1167 for i, _ in enumerate(x): 

1168 row.insert(position + i, _) 

1169 for i, c in enumerate(colname): 

1170 self.header.insert(position + i, c) 

1171 self.index = {v: i for i, v in enumerate(self.header)} 

1172 return self 

1173 

1174 def add_column_index(self, colname="index", start=0): 

1175 """ 

1176 Example: 

1177 

1178 :: 

1179 

1180 table.add_column_index("index_row") 

1181 """ 

1182 self.index[colname] = len(self.index) 

1183 for i, row in enumerate(self.values): 

1184 row.append(i + start) 

1185 self.header.append(colname) 

1186 return self 

1187 

1188 def addc(self, colname, function, position=-1): 

1189 """ 

1190 @see me add_column 

1191 """ 

1192 return self.add_column(colname, function, position) 

1193 

1194 def add_column_recursive(self, colname, functionValue, functionAgg): 

1195 """ 

1196 Example: 

1197 

1198 :: 

1199 

1200 table.add_column_recursive(lambda v: v ["norm_%s" % loi], 

1201 lambda li, v: li[-1] + v) 

1202 """ 

1203 self.index[colname] = len(self.index) 

1204 values = [] 

1205 for row in self.values: 

1206 v = self._interpret_row(row) 

1207 x = functionValue(v) 

1208 y = functionAgg(values, x) 

1209 row.append(y) 

1210 values.append(y) 

1211 self.header.append(colname) 

1212 return self 

1213 

1214 def add_column_recursive_row(self, colname, functionAgg): 

1215 """ 

1216 Example: 

1217 

1218 :: 

1219 

1220 table.add_column_recursive_row("w_%s" % loi, 

1221 lambda li, v: li[-1] + v ["norm_%s" % loi] \ 

1222 if len(li)> 0 else v ["norm_%s" % loi]) 

1223 """ 

1224 self.index[colname] = len(self.index) 

1225 values = [] 

1226 for row in self.values: 

1227 v = self._interpret_row(row) 

1228 y = functionAgg(values, v) 

1229 row.append(y) 

1230 values.append(y) 

1231 self.header.append(colname) 

1232 return self 

1233 

1234 def add_column_vector(self, colname, vector): 

1235 """ 

1236 adds a column defined by vector (list of values for each row) 

1237 

1238 :param colname: column to add 

1239 :param vector: (list) list of values to add to each row 

1240 :return self 

1241 """ 

1242 if len(vector) != len(self): 

1243 raise ValueError( # pragma: no cover 

1244 f"vector and table have different length {len(vector)} != {len(self)}") 

1245 for vec, row in zip(vector, self.values): 

1246 row.append(vec) 

1247 self.index[colname] = len(self.index) 

1248 self.header.append(colname) 

1249 return self 

1250 

1251 def add_column_smooth(self, colname, function, position, weights): 

1252 """ 

1253 Example: 

1254 

1255 :: 

1256 

1257 x = 1./3 

1258 table.add_column_smooth("has_A_smooth", lambda v: v["has_A"], [-1,0,1], [x,x,x]) 

1259 """ 

1260 if len(position) != len(weights): 

1261 raise ValueError( # pragma: no cover 

1262 "position and weights must have the same length") 

1263 self.index[colname] = len(self.index) 

1264 column = [function(self._interpret_row(row)) for row in self.values] 

1265 tw = sum(weights) 

1266 couple = list(zip(position, weights)) 

1267 for p, row in enumerate(self.values): 

1268 sx = 0. 

1269 sw = 0. 

1270 ms = 0 

1271 for i, w in couple: 

1272 pi = p + i 

1273 if 0 <= pi < len(self): 

1274 sx += column[pi] * w 

1275 sw += w 

1276 else: 

1277 ms += 1 

1278 

1279 if ms == 0: 

1280 row.append(sx) 

1281 elif sw != 0: 

1282 row.append(sx * tw / sw) 

1283 else: 

1284 row.append(sx) 

1285 self.header.append(colname) 

1286 return self 

1287 

1288 def aggregate_column(self, colname, aggregated_function=sum): 

1289 """ 

1290 Example: 

1291 

1292 :: 

1293 

1294 total = table.aggregate_column("d_c", sum) 

1295 """ 

1296 def function(v): 

1297 return v[colname] 

1298 return self.aggregate(function, aggregated_function) 

1299 

1300 def aggregate(self, function, aggregated_function=sum): 

1301 """ 

1302 Example: 

1303 

1304 :: 

1305 

1306 total = table.aggregate(lambda v: v["d_c"], len) 

1307 """ 

1308 return aggregated_function([function(self._interpret_row(row)) for row in self.values]) 

1309 

1310 def where(self, condition_function): 

1311 """ 

1312 @see me filter 

1313 """ 

1314 return self.filter(condition_function) 

1315 

1316 def filter(self, condition_function): 

1317 """ 

1318 Example: 

1319 

1320 :: 

1321 

1322 fil = table.filter(lambda v: v["d_b"] == 2) 

1323 

1324 @warning Rows are not copied. 

1325 """ 

1326 newv = [] 

1327 for row in self.values: 

1328 v = self._interpret_row(row) 

1329 x = condition_function(v) 

1330 if x: 

1331 newv.append(row) 

1332 final = self._private_getclass()(self.header, newv) 

1333 return final 

1334 

1335 def groupby_implicit(self, functionKey, functionWeight=None, logging=None): 

1336 """ 

1337 uses the prefix of a column name to know which function to use 

1338 as an aggregation (sum, avg, len, key, none, max, min) 

1339 Example: 

1340 

1341 :: 

1342 

1343 group = table.groupby_implicit(lambda v: v["name"]) 

1344 """ 

1345 def identical(col, v): 

1346 return v[col] 

1347 

1348 def first(vec): 

1349 return vec[0] 

1350 

1351 def avg(vec): 

1352 return TableFormula.ratio(sum(vec), len(vec)) 

1353 

1354 functions = [] 

1355 labels = ["key"] 

1356 functionsAgg = [] 

1357 for col in self.header: 

1358 if col.startswith("key"): 

1359 values = self.select( 

1360 lambda v, col=col: (v[col], functionKey(v))) 

1361 dd = {} 

1362 for v in values: 

1363 if v[1] not in dd: 

1364 dd[v[1]] = {} 

1365 dd[v[1]][v[0]] = 1 

1366 for k in dd: 

1367 dd[k] = len(dd[k]) # pylint: disable=E4702 

1368 keep = [] 

1369 for k, v in dd.items(): 

1370 if v > 1: 

1371 keep.append((k, v)) 

1372 

1373 if len(keep) == 0: 

1374 functions.append(lambda v, col=col: identical(col, v)) 

1375 labels.append(col) 

1376 functionsAgg.append(first) 

1377 elif logging is not None: 

1378 end = min(len(keep), 10) 

1379 mes = ",".join([str(_) for _ in keep[:end]]) 

1380 logging("removing column '{0}' no unique value: {1}: {2}".format( 

1381 col, len(dd), mes)) 

1382 elif col.startswith("sum"): 

1383 functions.append(lambda v, col=col: identical(col, v)) 

1384 labels.append(col) 

1385 functionsAgg.append(sum) 

1386 elif col.startswith("len"): 

1387 functions.append(lambda v, col=col: 1) 

1388 labels.append(col) 

1389 functionsAgg.append(len) 

1390 elif col.startswith("min"): 

1391 functions.append(lambda v, col=col: 1) 

1392 labels.append(col) 

1393 functionsAgg.append(min) 

1394 elif col.startswith("max"): 

1395 functions.append(lambda v, col=col: 1) 

1396 labels.append(col) 

1397 functionsAgg.append(max) 

1398 elif col.startswith("avg"): 

1399 functions.append(lambda v, col=col: identical(col, v)) 

1400 labels.append(col) 

1401 functionsAgg.append(avg) 

1402 elif col.startswith("none"): 

1403 pass 

1404 else: 

1405 raise RuntimeError("unable to aggregate column " + col) 

1406 

1407 return self.groupby(functionKey, functions, labels, functionsAgg, functionWeight) 

1408 

1409 def groupby(self, functionKey, functionsValue, columns=None, functionsAgg=None, functionWeight=None): 

1410 """ 

1411 Example: 

1412 

1413 :: 

1414 

1415 group = table.groupby(lambda v: v["name"], 

1416 [lambda v: v["d_a"], 

1417 lambda v: v["d_b"]], 

1418 ["name", "sum_d_a", "sum_d_b"]) 

1419 

1420 or 

1421 

1422 :: 

1423 

1424 groupmax = table.groupby(lambda v: v["name"], 

1425 [lambda v: v["d_a"], 

1426 lambda v: v["d_b"]], 

1427 ["name", "max_d_a", "max_d_b"], 

1428 [max, max]) 

1429 """ 

1430 if not isinstance(functionsValue, list): 

1431 functionsValue = [functionsValue] 

1432 if functionsAgg is None: 

1433 functionsAgg = [sum for f in functionsValue] 

1434 if functionWeight is None: 

1435 if columns is not None and len(columns) != len(functionsValue) + 1: 

1436 raise RuntimeError("columns should have %d names not(%d)" % ( 

1437 len(functionsValue) + 1, len(columns))) 

1438 else: 

1439 if columns is not None and len(columns) != len(functionsValue) + 2: 

1440 raise RuntimeError("columns should have %d names not(%d)" % ( 

1441 len(functionsValue) + 2, len(columns))) 

1442 if columns is not None and not isinstance(columns[0], str): 

1443 raise TypeError("expecting type str not %s in columns" % 

1444 (str(type(columns[0])))) 

1445 

1446 hist = {} 

1447 if functionWeight is not None: 

1448 histWeight = {} 

1449 

1450 for row in self.values: 

1451 v = self._interpret_row(row) 

1452 key = functionKey(v) 

1453 w = 1. if functionWeight is None else functionWeight(v) 

1454 

1455 if key not in hist: 

1456 histWeight[key] = [w] 

1457 hist[key] = [[f(v) * w] for f in functionsValue] 

1458 else: 

1459 histWeight[key].append(w) 

1460 h = hist[key] 

1461 for i, f in enumerate(functionsValue): 

1462 h[i].append(f(v) * w) 

1463 

1464 for key in hist: # pylint: disable=C0206 

1465 h = hist[key] 

1466 w = sum(histWeight[key]) 

1467 for i in range(0, len(h)): 

1468 h[i] = functionsAgg[i](h[i], w) 

1469 

1470 f = hist.items if sys.version_info.major >= 3 else hist.items 

1471 histValues = [[k, sum(histWeight[k])] + v for k, v in f()] 

1472 

1473 if columns is None: 

1474 columns = ["key", "weight"] + ["val%d" % 

1475 i for i, f in enumerate(functionsValue)] 

1476 ret = self._private_getclass()(columns, histValues) 

1477 return ret 

1478 else: 

1479 for row in self.values: 

1480 v = self._interpret_row(row) 

1481 key = functionKey(v) 

1482 if key not in hist: 

1483 hist[key] = [[f(v)] for f in functionsValue] 

1484 else: 

1485 h = hist[key] 

1486 for i, f in enumerate(functionsValue): 

1487 h[i].append(f(v)) 

1488 

1489 for key in hist: # pylint: disable=C0206 

1490 h = hist[key] 

1491 for i in range(0, len(h)): 

1492 h[i] = functionsAgg[i](h[i]) 

1493 

1494 f = hist.items if sys.version_info.major >= 3 else hist.items 

1495 histValues = [[k] + v for k, v in f()] 

1496 

1497 if columns is None: 

1498 columns = ["key"] + ["val%d" % 

1499 i for i, f in enumerate(functionsValue)] 

1500 ret = self._private_getclass()(columns, histValues) 

1501 return ret 

1502 

1503 def sort(self, functionValue, reverse=False): 

1504 """ 

1505 Example: 

1506 

1507 :: 

1508 

1509 table.sort(lambda v: v["d_b"] + v["d_c"]) 

1510 """ 

1511 values = [(functionValue(self._interpret_row(row)), i) 

1512 for i, row in enumerate(self.values)] 

1513 values.sort(reverse=reverse) 

1514 self.values = [self.values[_[1]] for _ in values] 

1515 return self 

1516 

1517 def extract_columns(self, listColumns): 

1518 """ 

1519 extract some columns 

1520 

1521 @param listColumns list of columns to extract or a function 

1522 which returns True if the column has to be extracted 

1523 based on its name 

1524 @return table 

1525 

1526 Example: 

1527 

1528 :: 

1529 

1530 ext = table.extract_columns(["name", "d_a"]) 

1531 """ 

1532 if isinstance(listColumns, list): 

1533 indexes = [(self.index[col] if isinstance(col, str) else col) 

1534 for col in listColumns] 

1535 header = listColumns 

1536 values = [[row[i] for i in indexes] for row in self.values] 

1537 return self._private_getclass()(header, values) 

1538 else: 

1539 header = [_ for _ in self.header if listColumns(_)] 

1540 return self.extract_columns(header) 

1541 

1542 def remove_columns(self, listColumns): 

1543 """ 

1544 remove some columns 

1545 

1546 @param listColumns list of columns to remove or a function 

1547 which returns True if the column has to be removed 

1548 based on its name 

1549 @return table 

1550 

1551 Example: 

1552 

1553 :: 

1554 

1555 rem = table.remove_columns("d_a") 

1556 """ 

1557 if isinstance(listColumns, list): 

1558 cols = [_ for i, _ in enumerate( 

1559 self.header) if _ not in listColumns and i not in listColumns] 

1560 return self.extract_columns(cols) 

1561 if isinstance(listColumns, str): 

1562 cols = [_ for _ in self.header if _ != listColumns] 

1563 return self.extract_columns(cols) 

1564 cols = [_ for _ in self.header if not listColumns(_)] 

1565 return self.extract_columns(cols) 

1566 

1567 def innerjoin(self, table, functionKey1, functionKey2, nameKey="key", 

1568 addSuffixAnyWay=False, prefixToAdd=None, full=False, 

1569 keepKey=True, putKeyInColumn=None, missingValue=None, 

1570 uniqueKey=False): 

1571 """ 

1572 @param table other table to join with 

1573 @param functionKey1 key for the first table(a function) 

1574 @param functionKey2 key for the second table(a function) innerjoin .... ON ... 

1575 @param addSuffixAnyWay add a suffix to every column from the second table even 

1576 if names are different(suffix is "+") 

1577 @param prefixToAdd prefix to add to the columns of the second table 

1578 @param full add all items even if there are no common keys (``FULL OUTER JOIN``), 

1579 otherwise keep only common keys 

1580 @param keepKey keep the key as a column in the result(column is key), otherwise not 

1581 @param putKeyInColumn private parameter: keepKey has to be true and in this case, 

1582 put the key in this column(integer) 

1583 @param missingValue when there is no key on one side, this default value will be put in place 

1584 @param uniqueKey if True, the function assumes there is a bijection between rows 

1585 and keys(one row <--> one key) on both tables, 

1586 otherwise, it will not. 

1587 @return a table 

1588 

1589 Example: 

1590 

1591 :: 

1592 

1593 innerjoin = table.innerjoin(group, lambda v: v["name"], 

1594 lambda v: v["name"], "group") 

1595 """ 

1596 defaultVal1 = [missingValue for k in self.header] 

1597 defaultVal2 = [missingValue for k in table.header] 

1598 

1599 if uniqueKey: 

1600 keys = {} 

1601 for row in self.values: 

1602 v = self._interpret_row(row) 

1603 key = functionKey1(v) 

1604 keys[key] = (row, None) 

1605 

1606 for row in table.values: 

1607 v = table._interpret_row(row) 

1608 key = functionKey2(v) 

1609 if key in keys: 

1610 keys[key] = (keys[key][0], row) 

1611 elif full: 

1612 keys[key] = (None, row) 

1613 

1614 if not full: 

1615 d = [] 

1616 for k, v in keys.items(): 

1617 if None in v: 

1618 d.append(k) 

1619 for _ in d: 

1620 del keys[_] 

1621 else: 

1622 for k in keys: # pylint: disable=C0206 

1623 v = keys[k] 

1624 if v[0] is None: 

1625 keys[k] = (defaultVal1, v[1]) # pylint: disable=E4702 

1626 elif v[1] is None: 

1627 keys[k] = (v[0], defaultVal2) # pylint: disable=E4702 

1628 

1629 if keepKey: 

1630 columns = [nameKey] 

1631 for x in self.header: 

1632 while x in columns: 

1633 x += "~" 

1634 columns.append(x) 

1635 

1636 for x in table.header: 

1637 if prefixToAdd is not None: 

1638 x = prefixToAdd + x 

1639 elif addSuffixAnyWay: 

1640 x += "+" 

1641 while x in columns: 

1642 x += "+" 

1643 columns.append(x) 

1644 

1645 f = keys.items if sys.version_info.major >= 3 else keys.items 

1646 values = [[k] + v[0] + v[1] for k, v in f() if len(v) == 2] 

1647 return self._private_getclass()(columns, values) 

1648 else: 

1649 columns = [] 

1650 for x in self.header: 

1651 while x in columns: 

1652 x += "~" 

1653 columns.append(x) 

1654 

1655 for x in table.header: 

1656 if prefixToAdd is not None: 

1657 x = prefixToAdd + x 

1658 elif addSuffixAnyWay: 

1659 x += "+" 

1660 while x in columns: 

1661 x += "+" 

1662 columns.append(x) 

1663 

1664 f = keys.items if sys.version_info.major >= 3 else keys.items 

1665 

1666 if putKeyInColumn is None: 

1667 values = [v[0] + v[1] for k, v in f() if len(v) == 2] 

1668 else: 

1669 values = [] 

1670 for k, v in f(): 

1671 if len(v) == 2: 

1672 nr = v[0] + v[1] 

1673 nr[putKeyInColumn] = k 

1674 values.append(nr) 

1675 

1676 return self._private_getclass()(columns, values) 

1677 else: 

1678 keys = {} 

1679 for row in self.values: 

1680 v = self._interpret_row(row) 

1681 key = functionKey1(v) 

1682 if key in keys: 

1683 keys[key][0].append(row) 

1684 else: 

1685 keys[key] = ([row], None) 

1686 

1687 for row in table.values: 

1688 v = table._interpret_row(row) 

1689 key = functionKey2(v) 

1690 if key in keys: 

1691 if keys[key][1] is None: 

1692 keys[key] = (keys[key][0], [row]) 

1693 else: 

1694 keys[key][1].append(row) 

1695 elif full: 

1696 keys[key] = (None, [row]) 

1697 

1698 if not full: 

1699 d = [] 

1700 for k, v in keys.items(): 

1701 if None in v: 

1702 d.append(k) 

1703 for _ in d: 

1704 del keys[_] 

1705 else: 

1706 for k in keys: # pylint: disable=C0206 

1707 v = keys[k] 

1708 if v[0] is None: 

1709 keys[k] = ([defaultVal1], v[1] 

1710 ) # pylint: disable=E4702 

1711 elif v[1] is None: 

1712 keys[k] = (v[0], [defaultVal2] 

1713 ) # pylint: disable=E4702 

1714 

1715 if keepKey: 

1716 columns = [nameKey] 

1717 for x in self.header: 

1718 while x in columns: 

1719 x += "~" 

1720 columns.append(x) 

1721 

1722 for x in table.header: 

1723 if prefixToAdd is not None: 

1724 x = prefixToAdd + x 

1725 elif addSuffixAnyWay: 

1726 x += "+" 

1727 while x in columns: 

1728 x += "+" 

1729 columns.append(x) 

1730 

1731 f = keys.items if sys.version_info.major >= 3 else keys.items 

1732 

1733 values = [] 

1734 for k, v in f(): 

1735 if len(v) == 2: 

1736 for ka in v[0]: 

1737 for kb in v[1]: 

1738 values.append([k] + ka + kb) 

1739 return self._private_getclass()(columns, values) 

1740 else: 

1741 columns = [] 

1742 for x in self.header: 

1743 while x in columns: 

1744 x += "~" 

1745 columns.append(x) 

1746 

1747 for x in table.header: 

1748 if prefixToAdd is not None: 

1749 x = prefixToAdd + x 

1750 elif addSuffixAnyWay: 

1751 x += "+" 

1752 while x in columns: 

1753 x += "+" 

1754 columns.append(x) 

1755 

1756 f = keys.items if sys.version_info.major >= 3 else keys.items 

1757 

1758 if putKeyInColumn is None: 

1759 values = [v[0] + v[1] for k, v in f() if len(v) == 2] 

1760 else: 

1761 values = [] 

1762 for k, v in f(): 

1763 if len(v) == 2: 

1764 for ka in v[0]: 

1765 for kb in v[1]: 

1766 nr = ka + kb 

1767 nr[putKeyInColumn] = k 

1768 values.append(nr) 

1769 

1770 return self._private_getclass()(columns, values) 

1771 

1772 def filter_quantile(self, function, alpha_min=0.025, alpha_max=0.025): 

1773 """ 

1774 sort all rows using criteria defined by function and remove 

1775 rows at the extremes 

1776 

1777 @param function values used to estimate the quantiles 

1778 @param alpha_min lower quantile 

1779 @param alpha_max higher quantile 

1780 @return a table containing all the rows where the criterion 

1781 is within the two quantiles 

1782 

1783 Example: 

1784 

1785 :: 

1786 

1787 fil = table.filter_quantile(lambda v: v["d_b"], 0, 0.4) 

1788 

1789 @warning Rows are not copied. 

1790 """ 

1791 values = [] 

1792 for row in self.values: 

1793 v = self._interpret_row(row) 

1794 val = function(v) 

1795 values.append((val, row)) 

1796 values.sort() 

1797 lv = len(values) 

1798 i1 = int(lv * alpha_min + 0.5) 

1799 i2 = int(lv * (1 - alpha_max) + 0.5) 

1800 i1 = max(i1, 0) 

1801 i1 = min(i1, lv) 

1802 i2 = max(i1, i2) 

1803 i2 = min(i2, lv) 

1804 if i2 == i1: 

1805 raise RuntimeError("unable to extract quantile, the table is either " 

1806 "empty or chosen quantile are not correct") 

1807 values = [_[1] for _ in values[i1:i2]] 

1808 return self._private_getclass()(self.header, values) 

1809 

1810 def union(self, table): 

1811 """ 

1812 @param table table 

1813 @return table(with the same number of columns) 

1814 

1815 concatenates two tables by rows; both tables must have the same header and the rows of both are merged into a single matrix. 

1816 Example: 

1817 

1818 :: 

1819 

1820 union = table.union(table2) 

1821 """ 

1822 if len(self.header) != len(table.header): 

1823 raise ValueError( # pragma: no cover 

1824 "tables do not have the same number of columns\ntbl1: %s\ntbl2: %s" % ( 

1825 ",".join(self.header), ",".join(table.header))) 

1826 for a, b in zip(self.header, table.header): 

1827 if a != b: 

1828 raise ValueError( # pragma: no cover 

1829 "tables do not have the same column names") 

1830 return self._private_getclass()(self.header, self.values + table.values) 

1831 

1832 def concatenate(self, table, addPrefix=""): 

1833 """ 

1834 concatenates two tables by columns 

1835 @param table table 

1836 @param addPrefix add a prefix to each column from table 

1837 @return table (with the same number of rows as the longest one) 
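Example (a minimal sketch, assuming tbl1 and tbl2 are two hypothetical TableFormula instances):

::

    both = tbl1.concatenate(tbl2, addPrefix="right_")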

1838 """ 

1839 maxr = max(len(self), len(table)) 

1840 header = self.header + [addPrefix + h for h in table.header] 

1841 values = [] 

1842 for i in range(0, maxr): 

1843 r1 = self.values[i] if i < len(self) else [None] * len(self.header) 

1844 r2 = table.values[i] if i < len( 

1845 table) else [None] * len(table.header) 

1846 values.append(r1 + r2) 

1847 return self._private_getclass()(header, values) 

1848 

1849 def random(self, n, unique=False): 

1850 """ 

1851 selects n random rows from the table, returns a table 

1852 

1853 @param n number of desired random rows 

1854 @param unique draws unique rows or non unique rows 

1855 (sampling without or with replacement) 

1856 @return a table 

1857 

1858 Example: 

1859 

1860 :: 

1861 

1862 rnd = table.random(10) 

1863 """ 

1864 if unique: 

1865 if n > len(self): 

1866 raise ValueError( # pragma: no cover 

1867 "number of desired random rows is higher " 

1868 "than the number of rows in the table") 

1869 index = {} 

1870 while len(index) < n: 

1871 h = random.randint(0, len(self) - 1) 

1872 index[h] = 0 

1873 values = [self.values[h] for h in index] 

1874 return self._private_getclass()(self.header, values) 

1875 else: 

1876 values = [] 

1877 for i in range(0, n): 

1878 h = random.randint(0, len(self) - 1) 

1879 values.append(self.values[h]) 

1880 return self._private_getclass()(self.header, values) 

1881 

1882 def todict(self, functionKey, functionValue, useList=False): 

1883 """ 

1884 converts the table into a dictionary { key: value }, 

1885 where both key and value are defined by functions. 

1886 

1887 @param functionKey defines the key 

1888 @param functionValue defines the value 

1889 @param useList should be True if several rows can share the same key, 

1890 in that case, all values are stored in a list 

1891 @return a dictionary { key:row } or { key: [row1, row2, ...] } 

1892 

1893 Example: 

1894 

1895 :: 

1896 

1897 d = table.todict(lambda v: v["name"], lambda v: v["d_b"], True) 

1898 """ 

1899 res = {} 

1900 if useList: 

1901 for row in self.values: 

1902 v = self._interpret_row(row) 

1903 key = functionKey(v) 

1904 val = functionValue(v) 

1905 if key in res: 

1906 res[key].append(val) 

1907 else: 

1908 res[key] = [val] 

1909 else: 

1910 for row in self.values: 

1911 v = self._interpret_row(row) 

1912 key = functionKey(v) 

1913 val = functionValue(v) 

1914 res[key] = val 

1915 return res 

1916 

1917 def reduce_dict(self, functionKey, functionValue, uselist=False): 

1918 """ 

1919 @see me todict 

1920 """ 

1921 return self.todict(functionKey, functionValue, uselist) 

1922 

1923 def select(self, functionRow): 

1924 """ 

1925 @param functionRow function applied to each row (given as a dictionary) 

1926 @return iterator on the values produced by functionRow 

1927 

1928 Example: 

1929 

1930 :: 

1931 

1932 d = table.select(lambda v:(v["name"], v["d_b"])) 

1933 print(list(d)) 

1934 """ 

1935 for row in self.values: 

1936 v = self._interpret_row(row) 

1937 nr = functionRow(v) 

1938 yield nr 

1939 

1940 def modify_all(self, modification_function): 

1941 """ 

1942 applies the same modification to every value 

1943 @param modification_function modification to apply to every value 

1944 @return new table 

1945 

1946 The signature of the function is the following one: 

1947 

1948 :: 

1949 

1950 def function(value, column_name): 

1951 # .... 

1952 return new_value 

1953 

1954 Example: 

1955 

1956 :: 

1957 

1958 tbl = tbl.modify_all(lambda v,c: {"string":"", "numerical":0}.get(c,None) if v is None else v) 

1959 """ 

1960 values = [] 

1961 for row in self.values: 

1962 r = [] 

1963 for v, h in zip(row, self.header): 

1964 r.append(modification_function(v, h)) 

1965 values.append(r) 

1966 return self._private_getclass()(self.header, values) 

1967 

1968 def dcast(self, functionKey, functionInstance, full=True): 

1969 """ 

1970 @see me multiply_column_by_row_instance 

1971 """ 

1972 return self.multiply_column_by_row_instance(functionKey, functionInstance, full) 

1973 

1974 def multiply_column_by_row_instance(self, functionKey, functionInstance, full=True): 

1975 """ 

1976 @param functionKey defines a key(function) 

1977 @param functionInstance defines a second key(will be moved to the columns dimension) 

1978 @param full introduces missing values for not found combinations 

1979 @return a table 

1980 

1981 Use it when a column contains a finite set of values. For example, 

1982 assume we have the temperature for several cities organized as 

1983 a database table with columns: city, date, temperature. 

1984 We would like to get another table laid out as: 

1985 date temperature_city1 temperature_city2... 

1986 

1987 Then we would type: 

1988 Example: 

1989 

1990 :: 

1991 

1992 mul = table.multiply_column_by_row_instance( 

1993 lambda v: v["date"], 

1994 lambda v: v["city"]) 

1995 

1996 The input table would be like: 

1997 

1998 :: 

1999 

2000 city date 

2001 A jan 

2002 A feb 

2003 B feb 

2004 

2005 It returns: 

2006 

2007 :: 

2008 

2009 KEY A|city A|date B|city B|date 

2010 feb A feb B feb 

2011 jan A jan None None 

2012 """ 

2013 values = [functionInstance(self._interpret_row(row)) 

2014 for row in self.values] 

2015 distinct = {} 

2016 for v in values: 

2017 distinct[v] = 0 

2018 distinct = [_ for _ in distinct] 

2019 distinct.sort() 

2020 table1 = copy.deepcopy(self) 

2021 table = None 

2022 header = copy.copy(table1.header) 

2023 orig = len(header) 

2024 nameKey = "~KEY~" 

2025 while nameKey in header: 

2026 nameKey += "*" 

2027 nbJoin = 0 

2028 

2029 for val in distinct: 

2030 table2 = table1.filter( 

2031 lambda v, val=val: functionInstance(v) == val) 

2032 if table is None: 

2033 table = table2.copy() 

2034 else: 

2035 colkey = table.header[0] 

2036 table = table.innerjoin(table2, functionKey if nbJoin == 0 else (lambda v, c=colkey: v[c]), 

2037 functionKey, nameKey=nameKey, 

2038 prefixToAdd=str(val) + "|", 

2039 full=full, keepKey=nbJoin == 0, 

2040 putKeyInColumn=None if nbJoin == 0 else 0, 

2041 uniqueKey=True) 

2042 

2043 if nbJoin == 0: 

2044 head = [] 

2045 nb = 0 

2046 for h in table.header: 

2047 if not h.endswith("~") and nb < orig: 

2048 head.append(f"{distinct[0]}|{h}") 

2049 nb += 1 

2050 else: 

2051 head.append(h) 

2052 header = ["KEY"] + head[1:] 

2053 table = self._private_getclass()(header, table.values) 

2054 

2055 nbJoin += 1 

2056 

2057 if nbJoin == 0: 

2058 head = [] 

2059 nb = 0 

2060 for h in table.header: 

2061 if not h.endswith("~") and nb < orig: 

2062 head.append(f"{distinct[0]}|{h}") 

2063 nb += 1 

2064 else: 

2065 head.append(h) 

2066 values = [] 

2067 for row in self.values: 

2068 v = self._interpret_row(row) 

2069 r = [functionKey(v)] + row 

2070 values.append(r) 

2071 header = ["KEY"] + head 

2072 table = self._private_getclass()(header, values) 

2073 

2074 return table 

2075 

2076 def create_index(self, functionIndex): 

2077 """ 

2078 this method creates an index; 

2079 to retrieve an indexed row, use method get 

2080 Example: 

2081 

2082 :: 

2083 

2084 table.create_index(lambda v:(v["name"], v["d_a"])) 

2085 row = table.get(('A', 1.1)) 

2086 value = table.get(('A', 1.1), 2) 

2087 """ 

2088 self.indexspecial = {} 

2089 for row in self.values: 

2090 v = self._interpret_row(row) 

2091 nr = functionIndex(v) 

2092 if nr in self.indexspecial: 

2093 raise KeyError( 

2094 f"unable to add {str(nr)} because it is already present") 

2095 self.indexspecial[nr] = row 

2096 return self 

2097 

2098 def get(self, rowIndex, column=None): 

2099 """ 

2100 use the index created by method create_index 

2101 Example: 

2102 

2103 :: 

2104 

2105 table.create_index(lambda v:(v["name"], v["d_a"])) 

2106 row = table.get(('A', 1.1)) 

2107 value = table.get(('A', 1.1), 2) 

2108 """ 

2109 if "indexspecial" not in self.__dict__: 

2110 raise RuntimeError("no index was created") 

2111 row = self.indexspecial[rowIndex] 

2112 if column is None: 

2113 return row 

2114 elif isinstance(column, int): 

2115 return row[column] 

2116 else: 

2117 return row[self.index[column]] 

2118 

2119 def avg_std(self, functionValue, functionWeight=lambda v: 1): 

2120 """ 

2121 returns the weighted average and standard deviation (functionWeight defaults to a constant weight of 1) 
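Example (a minimal sketch, assuming hypothetical numerical columns "price" and "weight"):

::

    avg, std = table.avg_std(lambda v: v["price"], lambda v: v["weight"])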

2122 """ 

2123 avg = 0. 

2124 std = 0. 

2125 n = 0. 

2126 for i, row in enumerate(self.values): 

2127 v = self._interpret_row(row) 

2128 x = float(functionValue(v)) 

2129 w = functionWeight(v) 

2130 avg += x * w 

2131 std += x * x * w 

2132 n += w 

2133 

2134 if n != 0: 

2135 avg /= n 

2136 std /= n 

2137 std -= avg * avg 

2138 std = math.sqrt(std) 

2139 else: 

2140 avg = 0. 

2141 std = 0. 

2142 return avg, std 

2143 

2144 def add_column_cumulative(self, column_index, column_name, functionIndex, functionValue, 

2145 normalize=False, reverse=False, cumulative=True, functionSort=None): 

2146 """ 

2147 adds two columns with the (optionally cumulative and normalized) index and value, typically used to plot a Gini or Lorenz curve. 

2148 Example: 

2149 

2150 :: 

2151 

2152 table.add_column_cumulative("index_%s" % col, "dist_%s" % col, 

2153 lambda v: v["sum_nbclient"], lambda v: v[col], 

2154 functionSort = lambda v: v[col] / v["sum_nbclient"], 

2155 normalize=True) 

2156 """ 

2157 if functionSort is None: 

2158 functionSort = functionValue 

2159 val = [] 

2160 for row in self.values: 

2161 v = self._interpret_row(row) 

2162 i = functionIndex(v) 

2163 s = functionSort(v) 

2164 v = functionValue(v) 

2165 val.append((s, i, v)) 

2166 val.sort(reverse=reverse) 

2167 

2168 if cumulative: 

2169 res = [(0., 0.)] 

2170 for s, i, v in val: 

2171 res.append((i + res[-1][0], v + res[-1][1])) 

2172 del res[0] 

2173 

2174 if normalize: 

2175 sumi = res[-1][0] 

2176 sumv = res[-1][1] 

2177 if sumi != 0 and sumv != 0: 

2178 res = [(_[0] / sumi, _[1] / sumv) for _ in res] 

2179 else: 

2180 raise ZeroDivisionError( 

2181 "cannot divide by zero, all indexes or all values are null") 

2182 else: 

2183 res = [(i, v) for s, i, v in val] 

2184 

2185 if normalize: 

2186 sumi = sum([_[0] for _ in res]) 

2187 sumv = sum([_[1] for _ in res]) 

2188 if sumi != 0 and sumv != 0: 

2189 res = [(_[0] / sumi, _[1] / sumv) for _ in res] 

2190 else: 

2191 raise ZeroDivisionError( 

2192 "cannot divide by zero, all indexes or all values are null") 

2193 

2194 for row, add in zip(self.values, res): 

2195 row.extend(add) 

2196 self.index[column_index] = len(self.index) 

2197 self.index[column_name] = len(self.index) 

2198 self.header.append(column_index) 

2199 self.header.append(column_name) 

2200 return self 

2201 

2202 def transpose(self, labelC=None, labelAsRow=True): 

2203 """ 

2204 Computes the transpose. 

2205 @param labelC proposes labels for the columns, 

2206 if None, takes "r%d" % i, 

2207 if it is a string, the function assumes it is a column name 

2208 @param labelAsRow add the label as a row 

2209 @return new table 
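Example (a minimal sketch using the default labels "r%d" % i):

::

    tr = table.transpose()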

2210 """ 

2211 if labelC is None: 

2212 label = ["r%d" % i for i in range(0, len(self.values))] 

2213 if labelAsRow: 

2214 label = ["rowheader"] + label 

2215 rem = None 

2216 elif isinstance(labelC, str): 

2217 label = list(self.select(lambda v: v[labelC])) 

2218 rem = label 

2219 else: 

2220 rem = None 

2221 label = labelC 

2222 

2223 values = [] 

2224 for i in range(0, len(self.header)): 

2225 if rem is not None and self.header[i] == labelC: 

2226 continue 

2227 row = [_[i] for _ in self.values] 

2228 if labelAsRow: 

2229 row = [self.header[i]] + row 

2230 values.append(row) 

2231 return self._private_getclass()(label, values) 

2232 

2233 def covariance(self): 

2234 """ 

2235 Computes the covariance matrix, the first column 

2236 will contain the column names. 

2237 @return new table 
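Example (a minimal sketch, assuming every value of the table is a float):

::

    cov = table.covariance()
    print(cov)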

2238 """ 

2239 for i, x in enumerate(self.values[0]): 

2240 if not isinstance(x, float): 

2241 raise TypeError("expecting a float on column %d" % i) 

2242 values = self.np_matrix 

2243 N = values.shape[0] 

2244 sums = numpy.sum(values, axis=0) / N 

2245 for i in range(0, values.shape[1]): 

2246 values[:, i] -= sums[0, i] 

2247 cov = values.transpose() * values 

2248 cov /= N 

2249 head = ["var"] + self.header 

2250 size = cov.shape 

2251 values = [[self.header[ 

2252 i]] + [float(cov[i, j]) for j in range(0, size[1])] for i in range(0, size[0])] 

2253 tbl = self._private_getclass()(head, values) 

2254 return tbl 

2255 

2256 def correlation_col(self, col1, col2, noCenter=False): 

2257 """ 

2258 Computes the correlation between two columns. 

2259 @param col1 column 1 

2260 @param col2 column 2 

2261 @param noCenter does the computation without removing the average 

2262 @return float (correlation) 
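Example (a minimal sketch, assuming hypothetical numerical columns "x" and "y"):

::

    c = table.correlation_col("x", "y")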

2263 """ 

2264 values = [[self._interpret_row(row)[col1], self._interpret_row(row)[ 

2265 col2]] for row in self.values] 

2266 if len(values) <= 1: 

2267 raise ValueError( # pragma: no cover 

2268 f"expecting more than one observation, not {len(values)}") 

2269 mx = 0. 

2270 my = 0. 

2271 vx = 0. 

2272 vy = 0. 

2273 co = 0. 

2274 nb = 0. 

2275 for a, b in values: 

2276 nb += 1 

2277 mx += a 

2278 my += b 

2279 vx += a ** 2 

2280 vy += b ** 2 

2281 co += a * b 

2282 mx /= nb 

2283 my /= nb 

2284 vx /= nb 

2285 vy /= nb 

2286 co /= nb 

2287 if not noCenter: 

2288 vx -= mx ** 2 

2289 vy -= my ** 2 

2290 co -= mx * my 

2291 vx = vx ** 0.5 

2292 vy = vy ** 0.5 

2293 v = vx * vy 

2294 if v != 0: 

2295 co /= v 

2296 return co 

2297 

2298 def covariance_col(self, col1, col2, noCenter=False): 

2299 """ 

2300 Computes the covariance between two columns. 

2301 @param col1 column 1 

2302 @param col2 column 2 

2303 @param noCenter does the computation without removing the average 

2304 @return float(covariance) 

2305 """ 

2306 values = [[self._interpret_row(row)[col1], 

2307 self._interpret_row(row)[col2]] for row in self.values] 

2308 

2309 if len(values) <= 1: 

2310 raise ValueError( # pragma: no cover 

2311 f"expecting more than one observation, not {len(values)}") 

2312 

2313 mx = 0. 

2314 my = 0. 

2315 co = 0. 

2316 nb = 0. 

2317 for a, b in values: 

2318 nb += 1 

2319 mx += a 

2320 my += b 

2321 co += a * b 

2322 mx /= nb 

2323 my /= nb 

2324 co /= nb 

2325 if not noCenter: 

2326 co -= mx * my 

2327 return co 

2328 

2329 def correlation_row(self, row1, row2, noCenter=False): 

2330 """ 

2331 computes the correlation between two rows 

2332 @param row1 row 1(integer) 

2333 @param row2 row 2(integer) 

2334 @param noCenter does the computation without removing the average 

2335 @return float (correlation) 

2336 """ 

2337 values = [[a, b] for a, b in zip(self.values[row1], self.values[row2])] 

2338 if len(values) <= 1: 

2339 raise ValueError( # pragma: no cover 

2340 f"expecting more than one observation, not {len(values)}") 

2341 mx = 0. 

2342 my = 0. 

2343 vx = 0. 

2344 vy = 0. 

2345 co = 0. 

2346 nb = 0. 

2347 for a, b in values: 

2348 nb += 1 

2349 mx += a 

2350 my += b 

2351 vx += a ** 2 

2352 vy += b ** 2 

2353 co += a * b 

2354 mx /= nb 

2355 my /= nb 

2356 vx /= nb 

2357 vy /= nb 

2358 co /= nb 

2359 if not noCenter: 

2360 vx -= mx ** 2 

2361 vy -= my ** 2 

2362 co -= mx * my 

2363 vx = vx ** 0.5 

2364 vy = vy ** 0.5 

2365 v = vx * vy 

2366 if v != 0: 

2367 co /= v 

2368 return co 

2369 

2370 def covariance_row(self, row1, row2, noCenter=False): 

2371 """ 

2372 computes the covariance between two rows 

2373 @param row1 row 1(integer) 

2374 @param row2 row 2(integer) 

2375 @param noCenter does the computation without removing the average 

2376 @return float(covariance) 

2377 """ 

2378 values = [[a, b] for a, b in zip(self.values[row1], self.values[row2])] 

2379 if len(values) <= 1: 

2380 raise ValueError( # pragma: no cover 

2381 f"expecting more than one observation, not {len(values)}") 

2382 mx = 0. 

2383 my = 0. 

2384 co = 0. 

2385 nb = 0. 

2386 for a, b in values: 

2387 nb += 1 

2388 mx += a 

2389 my += b 

2390 co += a * b 

2391 mx /= nb 

2392 my /= nb 

2393 co /= nb 

2394 if not noCenter: 

2395 co -= mx * my 

2396 return co 

2397 

2398 def correlation(self, useBootstrap=False, collapseFormat=True, nbdraws=-1, alpha=0.05, 

2399 functionKeepValue=lambda val, low, high: f"{val:f}|{low:f},{high:f}"): 

2400 """ 

2401 Computes the correlation matrix, the first column 

2402 will contain the column names. 

2403 

2404 @param useBootstrap if True, use a bootstrap method to estimate the correlation 

2405 @param collapseFormat if True and useBootstrap is True, produces a format 

2406 ``average|lower bound|higher bound(at a definite confidence level)`` 

2407 @param nbdraws number of draws(if -1, then it will be equal to the number of observations) 

2408 @param alpha confidence level 

2409 @param functionKeepValue if collapseFormat is True, this function is used to collapse val,low,high in a single string 

2410 @return new table 
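Example (a minimal sketch, assuming every value of the table is a float; the bootstrap parameters are arbitrary):

::

    cor = table.correlation()
    cor_boot = table.correlation(useBootstrap=True, nbdraws=100, alpha=0.05)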

2411 """ 

2412 if useBootstrap: 

2413 head = ["var"] + self.header 

2414 values = [[i] + [0. for r in self.header] for i in self.header] 

2415 for i in range(len(self.header)): 

2416 values[i][0] = self.header[i] 

2417 for j in range(len(self.header)): 

2418 vs = [[row[i], row[j]] for row in self.values] 

2419 bo = TableFormula.bootstrap(vs, function=TableFormula.correlation_bicolumn, 

2420 nbdraws=nbdraws, alpha=alpha) 

2421 if collapseFormat: 

2422 st = functionKeepValue(bo[0], bo[2], bo[3]) 

2423 values[i][j + 1] = st 

2424 else: 

2425 raise NotImplementedError( # pragma: no cover 

2426 "collapseFormat False is not implemented yet") 

2427 tbl = self._private_getclass()(head, values) 

2428 return tbl 

2429 else: 

2430 for i, x in enumerate(self.values[0]): 

2431 if not isinstance(x, float): 

2432 raise TypeError( # pragma: no cover 

2433 "expecting a float on column %d" % i) 

2434 

2435 values = self.np_matrix 

2436 N = values.shape[0] 

2437 sums = [sum(values[:, i]) / N for i in range(0, values.shape[1])] 

2438 

2439 for i in range(0, values.shape[1]): 

2440 values[:, i] -= sums[i] 

2441 

2442 cov = values.transpose() * values 

2443 cov /= N 

2444 diag = [cov[i, i] ** 0.5 for i in range(cov.shape[0])] 

2445 for i in range(cov.shape[0]): 

2446 if diag[i] > 0: 

2447 cov[i, :] /= diag[i] 

2448 cov[:, i] /= diag[i] 

2449 

2450 head = ["var"] + self.header 

2451 size = cov.shape 

2452 values = [[self.header[ 

2453 i]] + [float(cov[i, j]) for j in range(0, size[1])] for i in range(0, size[0])] 

2454 tbl = self._private_getclass()(head, values) 

2455 return tbl 

2456 

2457 def values_to_float(self, only_if_possible=False, subset_columns=None): 

2458 """ 

2459 converts all values into float 

2460 @param only_if_possible if True, converts all possible values and catches exception, 

2461 if False, converts everything, raises an exception when not possible 

2462 @param subset_columns if None, converts every column, otherwise converts 

2463 only the listed columns 

2464 @return table 
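Example (a minimal sketch, assuming a hypothetical column "price" stored as strings):

::

    tbl2 = table.values_to_float(only_if_possible=True, subset_columns=["price"])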

2465 """ 

2466 tbl = self.copy() 

2467 if subset_columns is not None: 

2468 subset = {i: True for i, v in enumerate( 

2469 self.header) if v in subset_columns} 

2470 if only_if_possible: 

2471 for row in tbl.values: 

2472 for i in range(0, len(row)): 

2473 if subset_columns is None or i in subset: 

2474 try: 

2475 v = float(row[i]) 

2476 row[i] = v 

2477 except (ValueError, TypeError): 

2478 continue 

2479 else: 

2480 for row in tbl.values: 

2481 for i in range(0, len(row)): 

2482 if subset_columns is None or i in subset: 

2483 row[i] = float(row[i]) 

2484 return tbl 

2485 

2486 def values_to_str(self, subset_columns=None, format=None): 

2487 """ 

2488 converts all values into str 

2489 @param subset_columns if None, converts every column, otherwise converts 

2490 only the listed columns 

2491 @param format format used for the conversion, None by default, but it could be for example ``%1.2f``. 

2492 @return table 
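Example (a minimal sketch, the format string is an arbitrary choice):

::

    tbl2 = table.values_to_str(format="%1.2f")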

2493 """ 

2494 tbl = self.copy() 

2495 if subset_columns is not None: 

2496 subset = {i: True for i, v in enumerate( 

2497 self.header) if v in subset_columns} 

2498 

2499 if format is None: 

2500 for row in tbl.values: 

2501 for i in range(0, len(row)): 

2502 if subset_columns is None or i in subset: 

2503 row[i] = str(row[i]) 

2504 else: 

2505 for row in tbl.values: 

2506 for i in range(0, len(row)): 

2507 if (subset_columns is None or i in subset) and isinstance(row[i], float): 

2508 row[i] = format % row[i] 

2509 return tbl 

2510 

2511 def values_to_date(self, format=None, only_if_possible=False, subset_columns=None): 

2512 """ 

2513 converts all values into dates 

2514 @param only_if_possible if True, converts all possible values and catches exception, 

2515 if False, converts everything, raises an exception when not possible 

2516 @param format date format, see @see fn str2datetime 

2517 @param subset_columns if None, converts every column, otherwise converts 

2518 only the listed columns 

2519 @return table 

2520 """ 

2521 tbl = self.copy() 

2522 if subset_columns is not None: 

2523 subset = {i: True for i, v in enumerate( 

2524 self.header) if v in subset_columns} 

2525 if only_if_possible: 

2526 if subset_columns is not None: 

2527 subset = {i: True for i, v in enumerate( 

2528 self.header) if v in subset_columns} 

2529 

2530 for row in tbl.values: 

2531 for i in range(0, len(row)): 

2532 if subset_columns is None or i in subset: 

2533 try: 

2534 v = str2datetime(row[i], format) 

2535 row[i] = v 

2536 except (ValueError, TypeError): 

2537 continue 

2538 else: 

2539 for row in tbl.values: 

2540 for i in range(0, len(row)): 

2541 if subset_columns is None or i in subset: 

2542 row[i] = str2datetime(row[i], format) 

2543 return tbl 

2544 

2545 def histogram(self, functionValue, nbDiv=100, secondColumnIsWeight=False, 

2546 normalize=True, removeExtreme=0.05): 

2547 """ 

2548 computes a histogram of one vector 

2549 @param functionValue function which produces the value to histogram 

2550 @param nbDiv number of divisions for this histogram (boundaries are min and max) 

2551 @param secondColumnIsWeight if True, the second column is the weight 

2552 @param normalize if True, normalize by the sum of weights 

2553 @param removeExtreme remove extreme values at both sides(0.05 means 0.025 on each side) 

2554 @return table with two columns 
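Example (a minimal sketch, assuming a hypothetical numerical column "price"):

::

    hist = table.histogram(lambda v: v["price"], nbDiv=50, removeExtreme=None)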

2555 """ 

2556 values = list([functionValue(self._interpret_row(row)) # pylint: disable=R1728 

2557 for row in self.values]) 

2558 if removeExtreme is not None and removeExtreme > 0: 

2559 values.sort() 

2560 al = int(len(values) * removeExtreme / 2) 

2561 if al == 0: 

2562 raise RuntimeError( # pragma: no cover 

2563 "removeExtreme has no impact(%d,%f)" % ( 

2564 len(values), len(values) * removeExtreme / 2)) 

2565 if al * 2 < len(values): 

2566 values = values[al:len(values) - al] 

2567 

2568 mi = min(values) 

2569 ma = max(values) 

2570 

2571 if isinstance(values[0], (tuple, list)): 

2572 W = 0. 

2573 div = (ma[0] - mi[0]) / nbDiv 

2574 hist = [[mi[0] + n * div, 0.] for n in range(0, nbDiv + 1)] 

2575 for v in values: 

2576 x = int((v[0] - mi[0]) // div) 

2577 hist[x][1] += v[1] 

2578 W += v[1] 

2579 mi = mi[0] 

2580 else: 

2581 W = len(values) 

2582 div = (ma - mi) / nbDiv 

2583 hist = [[mi + n * div, 0.] for n in range(0, nbDiv + 1)] 

2584 for v in values: 

2585 x = int((v - mi) // div) 

2586 if 0 <= x < len(hist): 

2587 hist[x][1] += 1. 

2588 

2589 if normalize and W > 0: 

2590 for i in range(len(hist)): 

2591 hist[i][1] /= W 

2592 

2593 values = [[mi + n * div, hist[n]] for n in range(len(hist))] 

2594 tbl = self._private_getclass()(["x", "hist(x)"], hist) 

2595 return tbl 

2596 

2597 def histograms(self, columnsSet, nbDiv=100, secondColumnIsWeight=False, 

2598 normalize=True, removeExtreme=0.05, histxName="histKey"): 

2599 """ 

2600 computes a common histogram over all the columns in columnsSet 

2601 @param columnsSet set of columns 

2602 @param nbDiv number of divisions for this histogram (boundaries are min and max) 

2603 @param secondColumnIsWeight if True, the second column is the weight 

2604 @param normalize if True, normalize by the sum of weights 

2605 @param removeExtreme remove extreme values at both sides(0.05 means 0.025 on each side) 

2606 @param histxName column name given to the x axis shared by every histogram 

2607 @return table with two columns 

2608 

2609 @warning The function skips any NaN or Inf value. 

2610 """ 

2611 values = [] 

2612 for row in self.values: 

2613 temp = self._interpret_row(row) 

2614 for t in columnsSet: 

2615 values.append(temp[t]) 

2616 

2617 if removeExtreme is not None and removeExtreme > 0: 

2618 values.sort() 

2619 al = int(len(values) * removeExtreme / 2) 

2620 if al == 0: 

2621 raise RuntimeError( # pragma: no cover 

2622 "removeExtreme has no impact(%d,%f)" % ( 

2623 len(values), len(values) * removeExtreme / 2)) 

2624 if al * 2 < len(values): 

2625 values = values[al:len(values) - al] 

2626 

2627 mi = min(values) 

2628 ma = max(values) 

2629 W = len(values) 

2630 div = (ma - mi) / nbDiv 

2631 if div == 0: 

2632 raise RuntimeError( # pragma: no cover 

2633 f"unable to continue since div is null: min,max = {mi:f},{ma:f}") 

2634 hist = [[mi + n * div, 0.] for n in range(0, nbDiv + 1)] 

2635 value = {i: {histxName: hist[i][0]} for i in range(len(hist))} 

2636 su = {} 

2637 for row in self.values: 

2638 for _ in columnsSet: 

2639 temp = self._interpret_row(row) 

2640 if math.isnan(temp[_]) or math.isinf(temp[_]): 

2641 continue 

2642 x = int((temp[_] - mi) // div) 

2643 if x not in value: 

2644 # it means extremes were removed 

2645 continue 

2646 #raise RuntimeError("value %d,%f is not allowed min,max = [%f,%f]" %(x, temp[_], mi, ma)) 

2647 value[x][_] = value[x].get(_, 0.) + 1. 

2648 su[_] = su.get(_, 0.) + 1. 

2649 

2650 if normalize and W > 0: 

2651 for v in value.values(): 

2652 for _ in v: 

2653 if _ != histxName: 

2654 v[_] /= su[_] 

2655 

2656 tbl = self._private_getclass()("__byrow__", value) 

2657 return tbl 

2658 

2659 def union_columns(self, columnsSet): 

2660 """ 

2661 computes the union of all values from all columns present in columnsSet 

2662 @param columnsSet set of columns 

2663 @return table 

2664 """ 

2665 values = [] 

2666 for row in self.values: 

2667 temp = self._interpret_row(row) 

2668 for t in columnsSet: 

2669 values.append(temp[t]) 

2670 tbl = self._private_getclass()(["x"], [[x] for x in values]) 

2671 return tbl 

2672 

2673 def mu_sigma(self, functionValues, removeExtreme=None): 

2674 """ 

2675 computes the average and the standard deviation of a vector of values 

2676 @param functionValues function which produces the vector of values 

2677 @param removeExtreme remove extreme values at both sides(0.05 means 0.025 on each side) 

2678 @return (average, standard deviation) 
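Example (a minimal sketch, assuming a hypothetical numerical column "price"):

::

    mu, sigma = table.mu_sigma(lambda v: v["price"])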

2679 """ 

2680 if removeExtreme is not None and removeExtreme > 0: 

2681 values = [] 

2682 for row in self.values: 

2683 row = self._interpret_row(row) 

2684 val = functionValues(row) 

2685 values.append(val) 

2686 values.sort() 

2687 al = int(len(values) * removeExtreme / 2) 

2688 if al == 0: 

2689 raise RuntimeError("removeExtreme has no impact(%d,%f)" % ( 

2690 len(values), len(values) * removeExtreme / 2)) 

2691 if al * 2 < len(values): 

2692 values = values[al:len(values) - al] 

2693 tbl = TableFormula(["x"], [[_] for _ in values]) 

2694 return tbl.mu_sigma(lambda v: v["x"], 0) 

2695 else: 

2696 mu = 0. 

2697 si = 0. 

2698 nb = 0. 

2699 for row in self.values: 

2700 row = self._interpret_row(row) 

2701 val = functionValues(row) 

2702 mu += val 

2703 si += val ** 2 

2704 nb += 1. 

2705 mu /= nb 

2706 si /= nb 

2707 si -= mu ** 2 

2708 return mu, si ** 0.5 

2709 

2710 def mu_sigma_each_column(self, columnsSet=None, removeExtreme=None): 

2711 """ 

2712 returns a table with the average and the standard deviation for each column 

2713 @param removeExtreme remove extreme values at both sides(0.05 means 0.025 on each side) 

2714 @param columnsSet set of column to deal with 

2715 @return table with two rows: average and standard deviation 

2716 """ 

2717 values = [[], []] 

2718 if columnsSet is None: 

2719 columnsSet = self.header 

2720 for col in columnsSet: 

2721 mu, sigma = self.mu_sigma( 

2722 (lambda v, col=col: v[col]), removeExtreme) 

2723 values[0].append(mu) 

2724 values[1].append(sigma) 

2725 tbl = self._private_getclass()(columnsSet, values) 

2726 return tbl 

2727 

2728 @property 

2729 def np_matrix(self): 

2730 """ 

2731 returns the values as a numpy matrix 

2732 @return numpy matrix 

2733 """ 

2734 return numpy.matrix(self.values) 

2735 

2736 @property 

2737 def np_array(self): 

2738 """ 

2739 returns the values as a numpy array 

2740 @return numpy array 

2741 """ 

2742 return numpy.array(self.values) 

2743 

2744 @property 

2745 def dataframe(self): 

2746 """ 

2747 creates a pandas dataframe 

2748 @return pandas.dataframe 
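Example (a minimal sketch):

::

    df = table.dataframe
    print(df.head())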

2749 """ 

2750 return pandas.DataFrame(self.values, columns=self.header) 

2751 

2752 @property 

2753 def json(self): 

2754 """ 

2755 returns a json format 

2756 @return string 

2757 """ 

2758 rows = [row for row in self] 

2759 return json.dumps(rows) 

2760 

2761 def center_reduce(self, columnsSet=None, op=None, removeExtreme=None, mu_sigma=None): 

2762 """ 

2763 centers and reduces a set of columns (or all columns if columnsSet is None) 

2764 @param removeExtreme remove extreme values at both sides(0.05 means 0.025 on each side) 

2765 @param columnsSet set of columns to deal with 

2766 @param op it can be: 

2767 - None: subtract the mean and normalize, 

2768 - "mean": subtract the mean only, 

2769 - "norm": normalize only 

2770 @param mu_sigma matrix with two rows (one for the mean, one for sigma); if None, 

2771 it is computed from the matrix self; columns must have the same order 

2772 as columnsSet 

2773 @return the same table(with only the considered columns) 
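Example (a minimal sketch, assuming hypothetical numerical columns "x" and "y"):

::

    centered = table.center_reduce(columnsSet=["x", "y"])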

2774 """ 

2775 if op not in [None, "mean", "norm"]: 

2776 raise ValueError( # pragma: no cover 

2777 'expecting a value in [None, "mean", "norm"] for op') 

2778 if columnsSet is None: 

2779 columnsSet = self.header 

2780 mus = self.mu_sigma_each_column( 

2781 columnsSet, removeExtreme) if mu_sigma is None else mu_sigma 

2782 tbl = self.extract_columns(columnsSet) 

2783 n = len(tbl.header) 

2784 for row in tbl.values: 

2785 if op is None or op == "mean": 

2786 for i in range(n): 

2787 row[i] -= mus.values[0][i] 

2788 if op is None or op == "norm": 

2789 for i in range(n): 

2790 row[i] /= mus.values[1][i] 

2791 return tbl 

2792 

2793 @staticmethod 

2794 def save_multiple_as_excel(filename, list_table, font="Calibri", close=True, encoding=None): 

2795 """ 

2796 saves multiple table in one Excel file 

2797 

2798 @param filename filename(can be None) 

2799 @param list_table list of 2-uples ("name", tbl) 

2800 @param font font name 

2801 @param close if True, closes the file, otherwise, the user will have to close it 

2802 @param encoding encoding 

2803 @return object Workbook 
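Example (a minimal sketch, the file and sheet names are arbitrary and tbl1, tbl2 are hypothetical tables):

::

    TableFormula.save_multiple_as_excel("tables.xlsx", [("sheet1", tbl1), ("sheet2", tbl2)])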

2804 """ 

2805 ext = os.path.splitext( 

2806 filename)[-1].lower() if filename is not None else None 

2807 if ext is not None and ext == ".xls": 

2808 font0 = EXf.Font() 

2809 font0.name = font 

2810 font0.bold = True 

2811 style0 = EXs.XFStyle() 

2812 style0.font = font0 

2813 

2814 wb = EXw.Workbook( 

2815 encoding=encoding) if encoding is not None else EXw.Workbook() 

2816 for sheet_name, self in list_table: 

2817 ws0 = wb.add_sheet(sheet_name) 

2818 

2819 for i, l in enumerate(self.header): 

2820 ws0.write(0, i, l, style0) 

2821 

2822 fnt = EXf.Font() 

2823 fnt.name = font 

2824 style = EXs.XFStyle() 

2825 style.font = fnt 

2826 

2827 for irow, row in enumerate(self.values): 

2828 for icol, val in enumerate(row): 

2829 if isinstance(val, (int, float)): 

2830 st = val 

2831 elif isinstance(val, str): 

2832 if encoding is not None: 

2833 st = val.encode(encoding).decode(encoding) 

2834 else: 

2835 st = val 

2836 elif val is not None: 

2837 st = str(val) 

2838 else: 

2839 continue 

2840 ws0.write(irow + 1, icol, st, style) 

2841 

2842 wb.save(filename) 

2843 return wb 

2844 

2845 elif ext is None or ext == ".xlsx": 

2846 wb = EXxw.Workbook( 

2847 filename) if filename is not None else EXxw.Workbook() 

2848 for sheet_name, self in list_table: 

2849 ws0 = wb.add_worksheet(sheet_name) 

2850 

2851 style0 = wb.add_format({'bold': True}) 

2852 style0.set_font_name(font) 

2853 

2854 for i, l in enumerate(self.header): 

2855 ws0.write(0, i, l, style0) 

2856 

2857 style = wb.add_format() 

2858 style.set_font_name(font) 

2859 

2860 for irow, row in enumerate(self.values): 

2861 for icol, val in enumerate(row): 

2862 if isinstance(val, (int, float)): 

2863 st = val 

2864 elif isinstance(val, str): 

2865 if encoding is not None: 

2866 st = val.encode(encoding).decode(encoding) 

2867 else: 

2868 st = val 

2869 elif val is not None: 

2870 st = str(val) 

2871 else: 

2872 continue 

2873 ws0.write(irow + 1, icol, st, style) 

2874 

2875 if filename is not None and close: 

2876 wb.close() 

2877 return wb 

2878 else: 

2879 raise NameError( 

2880 "extension should be .xls or .xlsx for file " + filename) 

2881 

2882 def save_as_excel(self, filename, font="Calibri", sheet_name="sheet0", 

2883 close=True, encoding=None): 

2884 """ 

2885 saves the table as a new Excel file, you can use ``.xls`` or ``.xlsx`` 

2886 if filename is None, the function returns an object(xslx) and does not save it. 

2887 

2888 @param filename Excel filename 

2889 @param sheet_name name of the sheet to add 

2890 @param font font name 

2891 @param close if True, closes the file, otherwise, the user will have to close it 

2892 @param encoding encoding 

2893 @return object Workbook 

2894 """ 

2895 return TableFormula.save_multiple_as_excel(filename, [(sheet_name, self)], 

2896 font=font, close=close, encoding=encoding) 

2897 

2898 def schema_database(self, add_id=True): 

2899 """ 

2900 returns the schema for a database table which would contain this table 

2901 

2902 @param add_id if not None, adds a primary key column named after add_id 

2903 @return dictionary { index_column:(name, type) } 
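Example (a minimal sketch, the column name "idr" is an arbitrary choice):

::

    schema = table.schema_database(add_id="idr")
    # schema[-1] is ("idr", int, "PRIMARYKEY", "AUTOINCREMENT")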

2904 """ 

2905 schema = {i: (l, str) for i, l in enumerate(self.header)} 

2906 if add_id is not None: 

2907 schema[-1] = (add_id, int, "PRIMARYKEY", "AUTOINCREMENT") 

2908 

2909 if len(self) > 0: 

2910 # we use the first row to determine type 

2911 for i, v in enumerate(self.values[0]): 

2912 if not isinstance(v, str): 

2913 schema[i] = (schema[i][0], type(v)) 

2914 return schema 

2915 

2916 def fill_sql_table(self, filename_or_database, tablename, add_id="idr"): 

2917 """ 

2918 returns a Database object, creates the database if it does not exist, 

2919 same for the table 

2920 

2921 @param filename_or_database filename or Database object, 

2922 in that second case, we assume method connect 

2923 was called before 

2924 @param tablename table name 

2925 @param add_id if it is not None, the function adds an id column, it first takes the 

2926 max(id) and goes on incrementing it 

2927 @return Database object(new or the one from the parameters), 

2928 in both case, the database is not disconnected 

2929 """ 

2930 schema = self.schema_database(add_id) 

2931 

2932 if isinstance(filename_or_database, str): 

2933 fLOG("fill_sql_table: creating database ", filename_or_database) 

2934 db = Database(filename_or_database, LOG=fLOG) 

2935 db.connect() 

2936 

2937 fLOG("fill_sql_table ", schema) 

2938 if tablename not in db.get_table_list(): 

2939 fLOG("creationg of table ", schema) 

2940 cursor = db.create_table(tablename, schema) 

2941 db.append_values(self.values, tablename, schema, cursor=cursor) 

2942 else: 

2943 db.append_values(self.values, tablename, schema) 

2944 else: 

2945 db = filename_or_database 

2946 db.append_values(self.values, tablename, schema) 

2947 

2948 return db