Coverage for src/ensae_teaching_cs/homeblog/table

1# -*- coding: utf8 -*-

2"""

3@file

4@brief Implements TableFormula.

5"""

6import copy

7import os

8import sys

9import datetime

10import random

11import math

12import json

13import numpy

14import pandas

15from xlwt import Formatting as EXf

16from xlwt import Style as EXs

17import xlwt as EXw

18from xlsxwriter import workbook as EXxw

19from xlrd import open_workbook

20from pyquickhelper.loghelper import fLOG, noLOG

21from pyquickhelper.loghelper.convert_helper import str2datetime

22from pyensae.sql import Database

23from .table_formula_stat import _TableFormulaStat

26class TableFormula(_TableFormulaStat):

27 """

28 This class aims at representating a table, it provides

29 some "SQL like" functionalities such groupby or innerjoin, select, where...

30 This was a custom implementation of a DataFrame before I discover

31 `pandas <http://pandas.pydata.org/>`_.

33 The class provides an easy to go through the row table by converting each row

34 in a dictionary ``{ column_name: value }`` on the run. Example:

36 ::

38 tbl = TableFormula(...)

39 newtbl = tbl.filter(lambda v: v["criteria"] == 5)

41 See @see op __init__ for others ways to create a table.

43 @var header list of column names

44 @var values list of rows(each row contains as many value as the number of columns)

45 @var index dictionary { column name: position }, changing ``header`` means also changing ``header``.

47 Example:

49 ::

51 table = TableFormula("name d_a d_b d_c#A 1 2 3#A 1.1 2.1 3.1#B 3 4 5".replace(" ", "\\t").replace("#","\\n"))

52 print(table)

53 print("one value 0,1:", table[0,1])

55 print("---------")

56 dist = table.get_distinct_values("name")

57 for k in sorted(dist): print("*%d: %s"%(int(dist[k]),k))

59 print("---------")

60 table.add_column("has_A", lambda v: 1. if "A" in v["name"] else 0.)

61 print(table)

63 x = 1./3

64 print("------------- smoothing", x)

65 table.add_column_smooth("has_A_smooth", lambda v: v["has_A"], [-1,0,1], [x,x,x])

66 print(table)

68 print("--------- filter")

69 fil = table.filter(lambda v: v["d_b"] == 2)

70 print(fil)

72 print("--------- random")

73 rnd = table.random(5)

74 print(rnd)

76 print("--------- random unique")

77 rnd = table.random(1, True)

78 print(rnd)

80 print("--------- filter quantile")

81 fil = table.filter_quantile(lambda v: v["d_b"], 0, 0.4)

82 print(fil)

84 print("--------- aggregate_column")

85 total = table.aggregate(lambda v: v["d_c"])

86 print(total)

88 print("--------- sort")

89 table.sort(lambda v: v["d_b"] + v["d_c"])

90 print(table)

92 print("--------- union")

93 union = table.union(table)

94 print(union)

96 print("--------- group sum")

97 group = table.groupby(lambda v: v["name"],

98 [lambda v: v["d_a"],

99 lambda v: v["d_b"]],

100 ["name", "sum_d_a", "sum_d_b"])

101 print(group)

102

103 print("--------- group max")

104 groupmax = table.groupby(lambda v: v["name"],

105 [lambda v: v["d_a"],

106 lambda v: v["d_b"]],

107 ["name", "max_d_a", "max_d_b"],

108 [max, max])

109 print(groupmax)

110

111 print("--------- group sum with weights")

112 group = table.groupby(lambda v: v["name"],

113 [lambda v: v["d_a"]],

114 ["name", "weight", "sum_d_a"],

115 [lambda vec,w: sum(vec) / w],

116 lambda v: v ["d_b"])

117

118 print("--------- innerjoin")

119 innerjoin = table.innerjoin(group, lambda v: v["name"],

120 lambda v: v["name"], "group")

121 print(innerjoin)

122

123 print("------------- extraction")

124 ext = table.extract_columns(["name", "d_a"])

125 print(ext)

126

127 print("------------- remove")

128 ext = table.remove_columns(["d_a"])

129 print(ext)

130

131 print("------------- todict")

132 d = table.todict(lambda v: v["name"], lambda v: v["d_b"], True)

133 print(d)

134

135 print("------------- select")

136 d = table.select(lambda v:(v["name"], v["d_b"]))

137 print(list(d))

138

139 print("------------- use of an index")

140 table.create_index(lambda v:(v["name"], v["d_a"]))

141 row = table.get(('A', 1.1))

142 print(row)

143 value = table.get(('A', 1.1), 2)

144 print(value)

145

146 print("------------- multiply_column_by_row_instance ")

147 table = TableFormula("name d_a d_b d_c#A 1 2 3#A 1.1 2.1 3.1#B 3 4 5".replace(" ", "\\t").replace("#","\\n"))

148 table.add_column("key_add", lambda v:"unique")

149 print(table)

150 mul = table.multiply_column_by_row_instance(

151 lambda v: v["key_add"],

152 lambda v: v["name"])

153 print(mul)

154

155 if os.path.exists("BNP.PA.txt"):

156 print("--------------- financial stock")

157 table = TableFormula("BNP.PA.txt", sep=",")

158 table.sort(lambda v: v["Date"])

159 print(table[:10])

160

161 print("--------------- groupby_implicit")

162 table = TableFormula("key_name sum_a len_b avg_c#A 1 2 3#A 1.1 2.1 3.1#B 3 4 5".replace(" ", "\\t").replace("#","\\n"))

163 print(table)

164 gr = table.groupby_implicit(lambda v: v ["key_name"])

165 print(gr)

166

167 print("--------------- covariance")

168 values = [random.random() for i in range(0,100)]

169 values = [[x, x + random.random()/2] for x in values]

170 tbl = TableFormula(["x", "y"], values).values_to_float()

171 cov = tbl.covariance()

172 print(cov)

173

174 print("--------------- histogram")

175 hist = tbl.histogram(lambda v:(v["x"],1), 10)

176 print(hist)

177

178 print("--------------- histogram")

179 hist = tbl.values_to_float().histograms(["x", "y"], 10)

180 print(hist)

181

182 print("--------------- unions of columns")

183 hist = tbl.values_to_float().union_columns(["x", "y"])

184 print(hist)

185 """

186

187 @staticmethod

188 def add_header_if_not_present(filename, header, encoding=None, logFunction=noLOG):

189 """

190 the function checks if the first line contains the column in header

191 otherwise, it modifies the file and add them on the first line

192

193 @param filename filename

194 @param header list of column name(all strings)

195 @param encoding encoding

196 @param logFunction use this function to log information about what is happening

197 """

198 if encoding is None:

199 with open(filename, "r") as f:

200 firstline = f.readline().strip("\n\r ")

201 su = sum(map(lambda _: 1 if _ in header else 0, firstline.split("\t")))

202 if su < len(header) / 2.0:

203 logFunction( # pragma: no cover

204 "add_header_if_not_present: adding header({0}<{1}){2} to '{3}'\nfirstline:\n{4}".format(

205 su, len(header) / 2, header, filename, firstline))

206 with open(filename, "r") as f:

207 text = f.read()

208 text = "\t".join(header) + "\n" + text

209 logFunction( # pragma: no cover

210 "add_header_if_not_present: writing")

211 with open(filename, "w") as f:

212 f.write(text)

213 logFunction( # pragma: no cover

214 "add_header_if_not_present: complete")

215 else:

216 with open(filename, "r", encoding=encoding) as f:

217 firstline = f.readline().strip("\n\r ")

218 su = sum(map(lambda _: 1 if _ in header else 0, firstline.split("\t")))

219 if su < len(header) / 2.0:

220 logFunction( # pragma: no cover

221 "add_header_if_not_present: adding header({0}<{1}){2} to '{3}'\nfirstline:\n{4}".format(

222 su, len(header) / 2, header, filename, firstline))

223 with open(filename, "r", encoding=encoding) as f:

224 text = f.read()

225 text = "\t".join(header) + "\n" + text

226 logFunction( # pragma: no cover

227 "add_header_if_not_present: writing")

228 with open(filename, "w", encoding=encoding) as f:

229 f.write(text)

230 logFunction( # pragma: no cover

231 "add_header_if_not_present: complete")

232

233 @staticmethod

234 def random_split_file(filename, outfileprefix, nb, has_header=True, encoding=None, logFunction=noLOG):

235 """

236 split a file in nb buckets by random(lines are sent to a random file as they come)

237

238 @param filename filename to split

239 @param nb number of buckets

240 @param outfileprefix output files will start with outfileprefix + '%04d.txt' % i

241 @param encoding encoding

242 @param has_header the header will be replicated in each created file

243 @param logFunction to display information

244 @return list of created files

245 """

246 firstline = None

247 if has_header:

248 if encoding is None:

249 with open(filename, "r") as f:

250 firstline = f.readline().strip("\n\r ")

251 else: # pragma: no cover

252 with open(filename, "r", encoding=encoding) as f:

253 firstline = f.readline().strip("\n\r ")

254

255 logFunction( # pragma: no cover

256 f"random_split_file: file {filename} has header {firstline}")

257

258 logFunction("random_split_file: split %s in %d parts" % (filename, nb))

259 fileName = [outfileprefix + (".%04d.txt" % n) for n in range(0, nb)]

260 nbline = 0

261

262 if encoding is None:

263 filesP = [open(_, "w") for _ in fileName]

264 if firstline is not None:

265 for _ in filesP:

266 _.write(firstline + "\n")

267 with open(filename, "r") as f:

268 line = f.readline()

269 if firstline is not None:

270 line = f.readline()

271 while line is not None and len(line) > 0:

272 h = random.randint(0, nb - 1)

273 filesP[h].write(line)

274 line = f.readline()

275 nbline += 1

276 if nbline % 1000000 == 0:

277 logFunction( # pragma: no cover

278 "random_split_file: processed %d lines" % nbline)

279 else:

280 filesP = [open(_, "w", encoding=encoding) for _ in fileName]

281 if firstline is not None:

282 for _ in filesP:

283 _.write(firstline + "\n")

284 with open(filename, "r", encoding=encoding) as f:

285 line = f.readline()

286 if firstline is not None:

287 line = f.readline()

288 while line is not None and len(line) > 0:

289 h = random.randint(0, nb - 1)

290 filesP[h].write(line)

291 line = f.readline()

292 nbline += 1

293 if nbline % 1000000 == 0:

294 logFunction( # pragma: no cover

295 "random_split_file: processed %d lines" % nbline)

296

297 for _ in filesP:

298 _.close()

299 logFunction("random_split_file: end")

300 return fileName

301

302 @staticmethod

303 def ratio(x, y):

304 """

305 return a ratio between two real values or an empty string if the denominateur is null

306 @return a real of an empty string

307 """

308 return x * 1.0 / y if y != 0 else (0 if x == 0 else "")

309

310 @staticmethod

311 def bootstrap(values, function, nbdraws=-1, alpha=0.05):

312 """

313 return a confidence interval for a statistics

314 @param values values

315 @param function produces the statistics over a random set of observations chosen in values

316 @param nbdraws number of draws, if it is equal to -1, is equal to len(values)

317 @param alpha confidence level

318 @return average, min, lower bound, higher bound, max

319 """

320 stat = []

321 N = len(values) - 1

322 if nbdraws == - 1:

323 nbdraws = len(values)

324 for i in range(nbdraws):

325 randset = [values[random.randint(0, N)] for i in range(N + 1)]

326 s = function(randset)

327 stat.append(s)

328 stat.sort()

329 lv = len(stat)

330 alpha = alpha / 2

331 i1 = int(lv * alpha + 0.5)

332 i2 = int(lv * (1 - alpha) + 0.5)

333 i2 = min(i2, len(stat) - 1)

334 av = sum(stat) / len(stat)

335 return av, min(stat), stat[i1], stat[i2], max(stat)

336

337 @staticmethod

338 def correlation_bicolumn(values, deviations=False, noCenter=False):

339 """

340 assume values is a matrix with two columns

341 @param values 2 column matrix

342 @param deviations if True, returns cor, sigma1, sigma2

343 @param noCenter if True, do not remove the average before computing the covariance,

344 it means we assume variables are already centered

345 @return correlation factor or correlation, sigma1, sigma2 if deviations is True

346 """

347 if len(values) <= 1:

348 raise ValueError( # pragma: no cover

349 f"expecting more than one observation, not {len(values)}")

350

351 mx = 0.

352 my = 0.

353 vx = 0.

354 vy = 0.

355 co = 0.

356 nb = 0.

357 for a, b in values:

358 nb += 1

359 mx += a

360 my += b

361 vx += a ** 2

362 vy += b ** 2

363 co += a * b

364 mx /= nb

365 my /= nb

366 vx /= nb

367 vy /= nb

368 co /= nb

369 if not noCenter:

370 vx -= mx ** 2

371 vy -= my ** 2

372 co -= mx * my

373 vx = vx ** 0.5

374 vy = vy ** 0.5

375 v = vx * vy

376 if v != 0:

377 co /= v

378 if deviations:

379 return co, vx, vy # pragma: no cover

380 return co

381

382 def _private_getclass(self):

383 """

384 the class often creates another class of the same type,

385 this function returns the class object

386 """

387 return self.__class__

388

389 def __init__(self, file, numeric_column=None, sep="\t", encoding=None,

390 read_n_lines=-1, sheet=0, **options):

391 """

392 It can either take a filename, an object TableFormula,

393 a list of columns and values.

394

395 :param file: filename or a list of column names or a dictionary,

396 file can also be a `pandas DataFrame

397 <http://pandas.pydata.org/pandas-docs/dev/dsintro.html#dataframe>`_.

398 :param numeric_column: depends on file types(see below examples)

399 :param sep: column separator if file is a filename

400 :param read_n_lines: read the first n lines(or all if it is -1)

401 :param sheet: in case the file is an Excel file, this parameter precises the sheet number or name

402 :param suffix_nb: if True, adds an integer to the column name if it is a duplicate

403

404 Example:

405

406 ::

407

408 table = TableFormula("name d_a d_b d_c#A 1 2 3#A 1.1 2.1 3.1#B 3 4 5".replace(" ", "\\t").replace("#","\\n"))

409

410 or

411

412 ::

413

414 table = TableFormula("file.txt", ["nb"])

415

416 or

417

418 ::

419

420 table = TableFormula(["date", "Y", "Y2", "xl"], values)

421

422 or

423

424 ::

425

426 data = [{ "one":1, "two":2 }, {"two":2.1, "three":3 }]

427 table = TableFormula(data)

428

429 or

430

431 ::

432

433 data = { 1:{ "one":2.3, "two":2.2 }, 2:{"one":2.1, "two":3 }

434 table = TableFormula("__byrow__", data)

435

436 or

437

438 ::

439

440 table = TableFormula(numpy.matrix(...))

441

442 or

443

444 ::

445

446 table = TableFormula(numpy.array(...))

447

448

449 @warning In this second case, rows and header are not copied.

450 """

451 if numeric_column is None:

452 numeric_column = []

453 if isinstance(file, str):

454 if os.path.exists(file):

455 self._read_file(file, numeric_column, sep,

456 encoding, read_n_lines, sheet=sheet)

457 elif file == "__byrow__" and isinstance(numeric_column, dict):

458 self._fill_by_row(numeric_column)

459 else:

460 lines = file.split("\n")

461 if len(lines) == 1:

462 raise FileNotFoundError( # pragma: no cover

463 f"A file was probably expected but was not found: '{file}'.")

464 self._readlines(lines, numeric_column, sep)

465

466 elif isinstance(file, list):

467 if len(file) == 0:

468 raise ValueError( # pragma: no cover

469 "Empty data and columns are not allowed.")

470

471 if isinstance(file[0], dict):

472 self.index = {}

473 self.values = []

474 for row in file:

475 for k, v in row.items():

476 if k not in self.index:

477 self.index[k] = len(self.index)

478

479 # we sort the labels to avoid instabilities

480 labels = [k for k, v in self.index.items()]

481 labels.sort()

482 self.index = {}

483 for la in labels:

484 self.index[la] = len(self.index)

485

486 for row in file:

487 line = [None for k in self.index]

488 for k, v in row.items():

489 line[self.index[k]] = v

490 self.values.append(line)

491

492 self.header = [None for k in self.index]

493 for k, v in self.index.items():

494 self.header[v] = k

495

496 n = len(self.index)

497 for row in self.values:

498 while len(row) < n:

499 row.append(None)

500

501 elif isinstance(numeric_column, numpy.matrix):

502 self.header = file

503 self.index = {}

504 for i, h in enumerate(self.header):

505 self.index[h] = i

506 self.values = [[float(numeric_column[i, j]) for j in range(

507 numeric_column.shape[1])] for i in range(numeric_column.shape[0])]

508 elif isinstance(numeric_column, numpy.ndarray):

509 self.header = file

510 self.index = {}

511 for i, h in enumerate(self.header):

512 self.index[h] = i

513 self.values = [[float(numeric_column[i, j]) for j in range(

514 numeric_column.shape[1])] for i in range(numeric_column.shape[0])]

515 elif isinstance(file[0], list):

516 if len(file) == 1:

517 self.header = file[0]

518 self.values = file[1:] + numeric_column

519 self.index = {}

520 for i, h in enumerate(self.header):

521 self.index[h] = i

522 else:

523 self.header = file[0]

524 self.values = file[1:]

525 self.index = {}

526 for i, h in enumerate(self.header):

527 self.index[h] = i

528 elif isinstance(file[0], str):

529 self.header = file

530 self.values = numeric_column

531 self.index = {}

532 for i, h in enumerate(self.header):

533 self.index[h] = i

534 else:

535 raise RuntimeError( # pragma: no cover

536 "This case should not happen: " + str(type(file[0])))

537

538 elif isinstance(file, numpy.matrix): # pragma: no cover

539 self.header = ["c%d" % d for d in range(file.shape[1])]

540 self.index = {}

541 for i, h in enumerate(self.header):

542 self.index[h] = i

543 self.values = [[float(file[i, j]) for j in range(

544 file.shape[1])] for i in range(file.shape[0])]

545

546 elif isinstance(file, numpy.ndarray):

547 self.header = ["c%d" % d for d in range(file.shape[1])]

548 self.index = {}

549 for i, h in enumerate(self.header):

550 self.index[h] = i

551 self.values = [[float(file[i, j]) for j in range(

552 file.shape[1])] for i in range(file.shape[0])]

553

554 else:

555 if isinstance(file, pandas.DataFrame):

556 def convert(x):

557 return None if isinstance(x, float) and numpy.isnan(x) else x

558 df = file

559 self.header = [_ for _ in df.columns]

560 hi = 'index'

561 while hi in self.header:

562 hi += "_"

563 self.header.insert(0, hi)

564 self.values = []

565 for i, row in enumerate(df.values):

566 row = [df.index[i]] + [convert(x) for x in row]

567 self.values.append(row)

568

569 self.index = {}

570 for i, h in enumerate(self.header):

571 self.index[h] = i

572 else:

573 raise TypeError( # pragma: no cover

574 "File has an unexpected type: " + str(type(file)))

575

576 unique = {}

577 for i, c in enumerate(self.header):

578 if c in unique:

579 if options.get("suffix_nb", False):

580 c = "%s_%d" % (c, i)

581 self.header[i] = c

582 else:

583 raise KeyError( # pragma: no cover

584 f"column '{c}' already exists in '{self.header}'")

585 unique[c] = True

586

587 def __add__(self, other):

588 """

589 do an addition, add values if types are matching

590 :param other: matrix or float or string

591 :return: new matrix, keep the header of the first matrix

592 """

593 if len(self) != len(other):

594 raise ValueError( # pragma: no cover

595 "both matrices should have the same number of rows")

596 if len(self.header) != len(other.header):

597 raise ValueError( # pragma: no cover

598 "both matrices should have the same number of columns")

599 values = []

600 for row, rowo in zip(self.values, other.values):

601 r = []

602 for a, b in zip(row, rowo):

603 if type(a) == type(b):

604 x = a + b

605 else:

606 x = None

607 r.append(x)

608 values.append(r)

609 return self._private_getclass()(self.header, values)

610

611 def __mul__(self, other):

612 """

613 do a multiplication(by a number)

614 :param other: matrix or float or string

615 :return: new matrix, keep the header of the first matrix

616 """

617 if not isinstance(other, float) and not isinstance(other, int):

618 raise TypeError( # pragma: no cover

619 "other should be a number")

620 values = []

621 for row in self.values:

622 r = []

623 for a in row:

624 if a is not None:

625 x = a * other

626 else:

627 x = None

628 r.append(x)

629 values.append(r)

630 return self._private_getclass()(self.header, values)

631

632 def multiplication_term_term(self, other):

633 """

634 do a multiplication term by term(similar to an addition),

635 add values if types are matching

636

637 :param other: matrix or float or string

638 :return: new matrix, keep the header of the first matrix

639 """

640 if len(self) != len(other):

641 raise ValueError( # pragma: no cover

642 "both matrices should have the same number of rows")

643 if len(self.header) != len(other.header):

644 raise ValueError( # pragma: no cover

645 "both matrices should have the same number of columns")

646 values = []

647 for row, rowo in zip(self.values, other.values):

648 r = []

649 for a, b in zip(row, rowo):

650 if type(a) == type(b) and not isinstance(a, str):

651 x = a * b

652 else:

653 x = None

654 r.append(x)

655 values.append(r)

656 return self._private_getclass()(self.header, values)

657

658 def replicate(self, times):

659 """replicates all rows a given number of times

660 :param times: number of multiplication

661 :return: new matrix, keep the header of the first matrix

662 """

663 values = []

664 for i in range(0, times):

665 values.extend(copy.copy(self.values))

666 return self._private_getclass()(self.header, values)

667

668 @property

669 def size(self):

670 """

671 returns the size(nb rows, nb columns)

672 """

673 return len(self), len(self.header)

674

675 @property

676 def shape(self):

677 """

678 returns the size(nb rows, nb columns)

679 """

680 return self.size

681

682 def _fill_by_row(self, values):

683 """

684 fill the table

685 :param values: dictionary { <int_row_index>: { <column name>: value} }

686 """

687 mx = max(values.keys()) + 1

688 self.index = {}

689 self.header = []

690 for k, v in values.items():

691 for col in v:

692 if col not in self.index:

693 self.index[col] = len(self.index)

694 self.header.append(col)

695 self.values = [[None for h in self.header] for k in range(mx)]

696 for k, v in values.items():

697 for col, to in v.items():

698 self.values[k][self.index[col]] = to

699

700 def __getitem__(self, irow):

701 """

702 operator [], accepts slices

703 :param irow: integer, tuple, slice or list

704 :return: depends on irow

705 - int --> a table with one row

706 - slice --> a table with several rows

707 - list --> a table with the selected rows

708 - tuple --> a value

709 """

710 if isinstance(irow, int):

711 return self._private_getclass()(

712 self.header, [self.values[irow]])

713 if isinstance(irow, slice):

714 return self._private_getclass()(

715 self.header, [self.values[ii] for ii in range(*irow.indices(len(self)))])

716 if isinstance(irow, list):

717 return self._private_getclass()(

718 self.header, [self.values[ii] for ii in irow])

719 if isinstance(irow, tuple):

720 if isinstance(irow[1], str):

721 row = self.values[irow[0]]

722 v = self._interpret_row(row)

723 return v[irow[1]]

724 return self.values[irow[0]][irow[1]]

725 raise TypeError("Invalid argument type: " + str(type(irow)))

726

727 def __setitem__(self, irow, value):

728 """

729 operator [], just accepts tuple(to change a value)

730 :param irow: 2-uple

731 :param value: new value

732 """

733 if isinstance(irow, tuple):

734 if isinstance(irow[1], str):

735 row = self.values[irow[0]]

736 v = self._interpret_row(row)

737 v[irow[1]] = value

738 else:

739 self.values[irow[0]][irow[1]] = value

740 else:

741 raise TypeError( # pragma: no cover

742 "Invalid argument type(only tuple accepted): " + str(type(irow)))

743

744 def __len__(self):

745 """

746 returns the number of rows

747 """

748 return len(self.values)

749

750 def __copy__(self):

751 """

752 operator copy

753 """

754 return self._private_getclass()(self.header, self.values)

755

756 def __deepcopy__(self, memo):

757 """

758 operator ``deepcopy``

759 """

760 return self._private_getclass()(copy.deepcopy(self.header, memo), copy.deepcopy(self.values, memo))

761

762 def copy(self):

763 """

764 call ``copy.deepcopy(self)``

765 """

766 return copy.deepcopy(self)

767

768 def delta(self, other):

769 """

770 returns a list of differences between self and others

771

772 :param other: TableFormula

773 :return: list of differences(first one)

774 """

775 if other is None:

776 return False

777 if not isinstance(other, TableFormula):

778 raise TypeError("other is not a table: " + str(type(other)))

779 if len(self.header) != len(other.header):

780 return ["different number of columns"]

781 for a, b in zip(self.header, other.header):

782 if a != b:

783 return ["different columns"]

784 if len(self.values) != len(other.values):

785 return ["different number of rows"]

786 line = 0

787 for r, s in zip(self.values, other.values):

788 if len(r) != len(s):

789 return ["different number of values on row %d" % line]

790 col = 0

791 for a, b in zip(r, s):

792 if a != b:

793 return ["different value on cell %d,%d: %s!=%s(type %s, %s)" % (line, col, a, b, str(type(a)), str(type(b)))]

794 col += 1

795 line += 1

796 return []

797

798 def __eq__(self, other):

799 """

800 check if two tables are equal by value

801 :param other: other table

802 :return: boolean

803 """

804 if other is None:

805 return False

806 if not isinstance(other, TableFormula):

807 return False

808 if len(self.header) != len(other.header):

809 return False

810 for a, b in zip(self.header, other.header):

811 if a != b:

812 return False

813 if len(self.values) != len(other.values):

814 return False

815 for r, s in zip(self.values, other.values):

816 if len(r) != len(s):

817 return False

818 for a, b in zip(r, s):

819 if a != b:

820 return False

821 return True

822

823 def __str__(self):

824 """

825 convert the table into a string

826 :return: string

827 """

828 rows = ["\t".join(self.header)]

829 for row in self.values:

830 s = "\t".join([str(_) for _ in row])

831 rows.append(s)

832 return "\n".join(rows)

833

834 def __html__(self, class_table=None, class_td=None, class_tr=None, class_th=None):

835 """

836 Converts the table into a :epkg:`html` string.

837

838 :param class_table: adds a class to the tag ``table`` (None for none)

839 :param class_td: adds a class to the tag ``td`` (None for none)

840 :param class_tr: adds a class to the tag ``tr`` (None for none)

841 :param class_th: adds a class to the tag ``th`` (None for none)

842 """

843 clta = f' class="{class_table}"' if class_table is not None else ""

844 cltr = f' class="{class_tr}"' if class_tr is not None else ""

845 cltd = f' class="{class_td}"' if class_td is not None else ""

846 clth = f' class="{class_th}"' if class_th is not None else ""

847

848 rows = [f"<table{clta}>"]

849 rows.append("{0}{1}{2}".format(("<tr%s><th%s>" % (cltr, clth)),

850 ("</th><th%s>" % clth).join(self.header), "</th></tr>"))

851 septd = f"</td><td{cltd}>"

852 strtd = f"<tr{cltr}><td{cltd}>"

853 for row in self.values:

854 s = septd.join([str(_) for _ in row])

855 rows.append(strtd + s + "</td></tr>")

856 rows.append("</table>")

857 rows.append("")

858 return "\n".join(rows)

859

860 def __rst__(self, add_line=True):

861 """

862 convert the table into rst format

863

864 ::

865

866 +------------------------+------------+----------+----------+

868 | (header rows optional) | | | |

869 +========================+============+==========+==========+

871 +------------------------+------------+----------+----------+

872 | body row 2 | ... | ... | |

873 +------------------------+------------+----------+----------+

874

875 :param add_line: add a line separator between each row

876 """

877 tbl = self.values_to_str()

878 length = [len(_) for _ in tbl.header]

879 for row in tbl.values:

880 for i, v in enumerate(row):

881 length[i] = max(length[i], len(v))

882 length = [_ + 2 for _ in length]

883 line = ["-" * le for le in length]

884 lineb = ["=" * le for le in length]

885 sline = f"+{'+'.join(line)}+"

886 slineb = f"+{'+'.join(lineb)}+"

887 res = [sline]

888

889 def complete(cool):

890 s, i = cool

891 i -= 2

892 if len(s) < i:

893 s += " " * (i - len(s))

894 return s

895

896 res.append(f"| {' | '.join(map(complete, zip(tbl.header, length)))} |")

897 res.append(slineb)

898 res.extend([f"| {' | '.join(map(complete, zip(row, length)))} |"

899 for row in tbl.values])

900 if add_line:

901 t = len(res)

902 for i in range(t - 1, 3, -1):

903 res.insert(i, sline)

904 res.append(sline)

905 return "\n".join(res) + "\n"

906

907 def strtype(self):

908 """

909 displays the type of values(not the values)

910 """

911 rows = ["\t".join(self.header)]

912 for row in self.values:

913 s = "\t".join([str(type(_)) for _ in row])

914 rows.append(s)

915 return "\n".join(rows)

916

917 def _read_file(self, file, numeric_column, sep, encoding, read_n_lines, sheet=0):

918 """

919 private

920 """

921 ext = os.path.splitext(file)[-1].lower()

922 if ext in [".xls", ".xlsx"]:

923 lines = list(open_workbook(file, sheet=sheet))

924 # removing empty column(assuming first row is the header)

925 ind = [i for i, n in enumerate(lines[0]) if len(n) > 0]

926 if len(ind) < len(lines[0]):

927 lines = [[line[i] for i in ind] for line in lines]

928 else:

929 if sys.version_info.major >= 3 or encoding is None:

930 if encoding is None:

931 f = open(file, "r")

932 else:

933 f = open(file, "r", encoding=encoding)

934 else:

935 f = open(file, "r", encoding=encoding)

936

937 if read_n_lines > 0:

938 lines = []

939 for line in f:

940 if len(lines) >= read_n_lines:

941 break

942 lines.append(line)

943 else:

944 lines = f.readlines()

945 f.close()

946 self._readlines(lines, numeric_column, sep)

947

948 def change_header(self, new_header):

949 """

950 change the column names

951

952 :param new_header: a list or a function which modifies the header

953

954 Example:

955

956 ::

957

958 tbl.change_header(lambda h: h if h != "column" else "new_name")

959

960 .. warning:: Do not do that yourself, the class holds a dictionary up to date with the column index.

961 """

962 if isinstance(new_header, list):

963 self.header = new_header

964 self.index = {}

965 for i, h in enumerate(self.header):

966 self.index[h] = i

967 else:

968 he = [new_header(h) for h in self.header]

969 self.change_header(he)

970

971 def rename_column(self, old_name, new_name):

972 """

973 rename a column

974

975 :param old_name: old name

976 :param new_name: new name

977 """

978 header = [{old_name: new_name}.get(_, _) for _ in self.header]

979 self.change_header(header)

980

981 def save(self, filename, sep="\t", encoding=None, newline="\n"):

982 """

983 saves the tables in a text file, first row is the column names

984

985 :param filename: filename

986 :param sep: column separator

987 :param encoding: encoding

988 :param newline: line separator

989 """

990 if sys.version_info.major >= 3 or encoding is None:

991 if encoding is None:

992 f = open(filename, "w", newline=newline)

993 else:

994 f = open(filename, "w", encoding=encoding, newline=newline)

995 else:

996 f = open(filename, "w", encoding=encoding)

997

998 f.write(sep.join(self.header))

999 f.write("\n")

1000 for row in self.values:

1001 f.write(sep.join([str(_) for _ in row]))

1002 f.write("\n")

1003 f.close()

1004

1005 def _readlines(self, lines, numeric_column, sep):

1006 """private"""

1007 if isinstance(lines[0], str):

1008 lines = [_.replace("\ufeff", "").replace("\xef\xbb\xbf", "")

1009 .strip("\n\r ").split(sep) for _ in lines if len(_) > 0]

1010 self.header = lines[0]

1011 self.values = lines[1:]

1012 self.index = {}

1013 for i, h in enumerate(self.header):

1014 self.index[h] = i

1015 elif isinstance(lines[0], list):

1016 self.header = lines[0]

1017 self.values = lines[1:]

1018 self.index = {}

1019 for i, h in enumerate(self.header):

1020 self.index[h] = i

1021 else:

1022 raise RuntimeError("unexpected format: " + str(type(lines[0])))

1023

1024 self._auto_conversion(numeric_column)

1025

1026 def _auto_conversion(self, others_columns):

1027 """

1028 private

1029 set up the column type based on the column name

1030 """

1031 def condition(k):

1032 if k.startswith("sum_") or k.startswith("pos_"):

1033 return True

1034 if k.startswith("avg_") or k.startswith("len_"):

1035 return True

1036 if k.startswith("nb_") or k.startswith("max_") or k.startswith("min_"):

1037 return True

1038 if k.startswith("d_") or k in others_columns:

1039 return True

1040 if k in ["Open", "High", "Low", "Close", "Volume", "Adj Close"]:

1041 return True

1042 if k in ["distance", "nb"]:

1043 return True

1044 return False

1045

1046 for i, k in enumerate(self.header):

1047 if k == "Date":

1048 for row in self.values:

1049 if isinstance(row[i], str):

1050 row[i] = datetime.datetime.strptime(row[i], '%Y-%m-%d')

1051 elif isinstance(row[i], float):

1052 row[i] = datetime.datetime.utcfromtimestamp(row[i])

1053 else:

1054 raise RuntimeError(

1055 f"unable to extract a date from type {type(row[i])}")

1056 elif condition(k):

1057 for row in self.values:

1058 row[i] = float(row[i])

1059 else:

1060 for row in self.values:

1061 if isinstance(row[i], str) and row[i] == "None":

1062 row[i] = None

1063

1064 def get_column_values(self, col):

1065 """

1066 private

1067 returns all values for one column

1068 """

1069 i = self.index[col]

1070 return [row[i] for row in self.values]

1071

1072 def get_distinct_values(self, col):

1073 """private"""

1074 row = self.get_column_values(col)

1075 dis = {}

1076 for r in row:

1077 dis[r] = dis.get(r, 0) + 1

1078 return dis

1079

1080 def _interpret_row(self, row):

1081 """

1082 private

1083 returns each row as a dictionary { column_name:value }

1084 """

1085 values = {}

1086 for a, b in zip(self.header, row):

1087 values[a] = b

1088 return values

1089

1090 def __iter__(self):

1091 """

1092 iterator on all rows, it returns a dictionary { column:value }

1093 @return dictionary

1094 """

1095 for row in self.values:

1096 yield self._interpret_row(row)

1097

1098 def add_column(self, colname, function, position=-1):

1099 """

1100 Adds a column.

1101 :param colname: column name or columns name if it is a list or a tuple

1102 :param function: function which will gives the values(or a list of functions, or a function which return a tuple)

1103 :param position: where to insert the column, -1 for the end

1104

1105 Example:

1106

1107 ::

1108

1109 table.add_column("has_A", lambda v: 1 if "A" in v["name"] else 0, 0)

1110

1111 table.add_column(("has_A", "has_B"),(lambda v: 1 if "A" in v["name"] else 0,

1112 lambda v: 1 if "B" in v["name"] else 0))

1113

1114 table.add_column(("has_A", "has_B"),(lambda v:(1 if "A" in v["name"] else 0, 1 if "B" in v["name"] else 0))

1115 """

1116 if isinstance(colname, str):

1117 if position == -1:

1118 self.index[colname] = len(self.index)

1119 for row in self.values:

1120 v = self._interpret_row(row)

1121 x = function(v)

1122 row.append(x)

1123 self.header.append(colname)

1124 else:

1125 for row in self.values:

1126 v = self._interpret_row(row)

1127 x = function(v)

1128 row.insert(position, x)

1129 self.header.insert(position, colname)

1130 self.index = {v: i for i, v in enumerate(self.header)}

1131

1132 elif isinstance(function, list):

1133 if len(colname) != len(function):

1134 raise ValueError( # pragma: no cover

1135 "unable to continue, colname and function do not have the same number of elements")

1136 if position == -1:

1137 position = [-1] * len(colname)

1138 elif isinstance(position, int):

1139 position = [position] * len(colname)

1140 else:

1141 if len(position) != len(colname):

1142 raise RuntimeError( # pragma: no cover

1143 "Unable to continue, colname and position do not "

1144 "have the same number of elements.")

1145 dec = 0

1146 for a, b, c in zip(colname, function, position):

1147 self.add_column(a, b, c + dec)

1148 dec += 1

1149

1150 else:

1151 # we assume here, the function returns a tuple

1152 if not isinstance(position, int):

1153 raise TypeError( # pragma: no cover

1154 "Int expected for position for this case.")

1155

1156 if position == -1:

1157 for row in self.values:

1158 v = self._interpret_row(row)

1159 x = function(v)

1160 row.extend(x)

1161 self.header.extend(colname)

1162

1163 else:

1164 for row in self.values:

1165 v = self._interpret_row(row)

1166 x = function(v)

1167 for i, _ in enumerate(x):

1168 row.insert(position + i, _)

1169 for i, c in enumerate(colname):

1170 self.header.insert(position + i, c)

1171 self.index = {v: i for i, v in enumerate(self.header)}

1172 return self

1173

1174 def add_column_index(self, colname="index", start=0):

1175 """

1176 Example:

1177

1178 ::

1179

1180 table.add_column("index_row")

1181 """

1182 self.index[colname] = len(self.index)

1183 for i, row in enumerate(self.values):

1184 row.append(i + start)

1185 self.header.append(colname)

1186 return self

1187

1188 def addc(self, colname, function, position=-1):

1189 """

1190 @see me add_column

1191 """

1192 return self.add_column(colname, function, position)

1193

1194 def add_column_recursive(self, colname, functionValue, functionAgg):

1195 """

1196 Example:

1197

1198 ::

1199

1200 table.add_column_recursive(lambda v: v ["norm_%s" % loi],

1201 lambda li, v: li[-1] + v)

1202 """

1203 self.index[colname] = len(self.index)

1204 values = []

1205 for row in self.values:

1206 v = self._interpret_row(row)

1207 x = functionValue(v)

1208 y = functionAgg(values, x)

1209 row.append(y)

1210 values.append(y)

1211 self.header.append(colname)

1212 return self

1213

1214 def add_column_recursive_row(self, colname, functionAgg):

1215 """

1216 Example:

1217

1218 ::

1219

1220 table.add_column_recursive_row("w_%s" % loi,

1221 lambda li, v: li[-1] + v ["norm_%s" % loi] \

1222 if len(li)> 0 else v ["norm_%s" % loi])

1223 """

1224 self.index[colname] = len(self.index)

1225 values = []

1226 for row in self.values:

1227 v = self._interpret_row(row)

1228 y = functionAgg(values, v)

1229 row.append(y)

1230 values.append(y)

1231 self.header.append(colname)

1232 return self

1233

1234 def add_column_vector(self, colname, vector):

1235 """

1236 add a column defined by vector(list of values for each row)

1237

1238 :param colname: column to add

1239 :param vector: (list) list of values to add to each row

1240 :return self

1241 """

1242 if len(vector) != len(self):

1243 raise ValueError( # pragma: no cover

1244 f"vector and table have different length {len(vector)} != {len(self)}")

1245 for vec, row in zip(vector, self.values):

1246 row.append(vec)

1247 self.index[colname] = len(self.index)

1248 self.header.append(colname)

1249 return self

1250

1251 def add_column_smooth(self, colname, function, position, weights):

1252 """

1253 Example:

1254

1255 ::

1256

1257 x = 1./3

1258 table.add_column_smooth("has_A_smooth", lambda v: v["has_A"], [-1,0,1], [x,x,x])

1259 """

1260 if len(position) != len(weights):

1261 raise ValueError( # pragma: no cover

1262 "position and weights must have the same length")

1263 self.index[colname] = len(self.index)

1264 column = [function(self._interpret_row(row)) for row in self.values]

1265 tw = sum(weights)

1266 couple = list(zip(position, weights))

1267 for p, row in enumerate(self.values):

1268 sx = 0.

1269 sw = 0.

1270 ms = 0

1271 for i, w in couple:

1272 pi = p + i

1273 if 0 <= pi < len(self):

1274 sx += column[pi] * w

1275 sw += w

1276 else:

1277 ms += 1

1278

1279 if ms == 0:

1280 row.append(sx)

1281 elif sw != 0:

1282 row.append(sx * tw / sw)

1283 else:

1284 row.append(sx)

1285 self.header.append(colname)

1286 return self

1287

1288 def aggregate_column(self, colname, aggregated_function=sum):

1289 """

1290 Example:

1291

1292 ::

1293

1294 total = table.aggregate_column("d_c", sum)

1295 """

1296 def function(v):

1297 return v[colname]

1298 return self.aggregate(function, aggregated_function)

1299

1300 def aggregate(self, function, aggregated_function=sum):

1301 """

1302 Example:

1303

1304 ::

1305

1306 total = table.aggregate_column(lambda v: v["d_c"], len)

1307 """

1308 return aggregated_function([function(self._interpret_row(row)) for row in self.values])

1309

1310 def where(self, condition_function):

1311 """

1312 @see me filter

1313 """

1314 return self.filter(condition_function)

1315

1316 def filter(self, condition_function):

1317 """

1318 Example:

1319

1320 ::

1321

1322 fil = table.filter(lambda v: v["d_b"] == 2)

1323

1324 @warning Rows are not copied.

1325 """

1326 newv = []

1327 for row in self.values:

1328 v = self._interpret_row(row)

1329 x = condition_function(v)

1330 if x:

1331 newv.append(row)

1332 final = self._private_getclass()(self.header, newv)

1333 return final

1334

1335 def groupby_implicit(self, functionKey, functionWeight=None, logging=None):

1336 """

1337 use prefix of a column name to know which function to use

1338 as an aggregated(sum, avg, len, key, none, max, min)

1339 Example:

1340

1341 ::

1342

1343 group = table.groupby_implicit(lambda v: v["name"])

1344 """

1345 def identical(col, v):

1346 return v[col]

1347

1348 def first(vec):

1349 return vec[0]

1350

1351 def avg(vec):

1352 return TableFormula.ratio(sum(vec), len(vec))

1353

1354 functions = []

1355 labels = ["key"]

1356 functionsAgg = []

1357 for col in self.header:

1358 if col.startswith("key"):

1359 values = self.select(

1360 lambda v, col=col: (v[col], functionKey(v)))

1361 dd = {}

1362 for v in values:

1363 if v[1] not in dd:

1364 dd[v[1]] = {}

1365 dd[v[1]][v[0]] = 1

1366 for k in dd:

1367 dd[k] = len(dd[k]) # pylint: disable=E4702

1368 keep = []

1369 for k, v in dd.items():

1370 if v > 1:

1371 keep.append((k, v))

1372

1373 if len(keep) == 0:

1374 functions.append(lambda v, col=col: identical(col, v))

1375 labels.append(col)

1376 functionsAgg.append(first)

1377 elif logging is not None:

1378 end = min(len(keep), 10)

1379 mes = ",".join([str(_) for _ in keep[:end]])

1380 logging("removing column '{0}' no unique value: {1}: {2}".format(

1381 col, len(dd), mes))

1382 elif col.startswith("sum"):

1383 functions.append(lambda v, col=col: identical(col, v))

1384 labels.append(col)

1385 functionsAgg.append(sum)

1386 elif col.startswith("len"):

1387 functions.append(lambda v, col=col: 1)

1388 labels.append(col)

1389 functionsAgg.append(len)

1390 elif col.startswith("min"):

1391 functions.append(lambda v, col=col: 1)

1392 labels.append(col)

1393 functionsAgg.append(min)

1394 elif col.startswith("max"):

1395 functions.append(lambda v, col=col: 1)

1396 labels.append(col)

1397 functionsAgg.append(max)

1398 elif col.startswith("avg"):

1399 functions.append(lambda v, col=col: identical(col, v))

1400 labels.append(col)

1401 functionsAgg.append(avg)

1402 elif col.startswith("none"):

1403 pass

1404 else:

1405 raise RuntimeError("unable to aggregate column " + col)

1406

1407 return self.groupby(functionKey, functions, labels, functionsAgg, functionWeight)

1408

1409 def groupby(self, functionKey, functionsValue, columns=None, functionsAgg=None, functionWeight=None):

1410 """

1411 Example:

1412

1413 ::

1414

1415 group = table.groupby(lambda v: v["name"],

1416 [lambda v: v["d_a"],

1417 lambda v: v["d_b"]],

1418 ["name", "sum_d_a", "sum_d_b"])

1419

1420 or

1421

1422 ::

1423

1424 groupmax = table.groupby(lambda v: v["name"],

1425 [lambda v: v["d_a"],

1426 lambda v: v["d_b"]],

1427 ["name", "max_d_a", "max_d_b"],

1428 [max, max])

1429 """

1430 if not isinstance(functionsValue, list):

1431 functionsValue = [functionsValue]

1432 if functionsAgg is None:

1433 functionsAgg = [sum for f in functionsValue]

1434 if functionWeight is None:

1435 if columns is not None and len(columns) != len(functionsValue) + 1:

1436 raise RuntimeError("columns should have %d names not(%d)" % (

1437 len(functionsValue) + 1, len(columns)))

1438 else:

1439 if columns is not None and len(columns) != len(functionsValue) + 2:

1440 raise RuntimeError("columns should have %d names not(%d)" % (

1441 len(functionsValue) + 2, len(columns)))

1442 if columns is not None and not isinstance(columns[0], str):

1443 raise TypeError("expecting type str not %s in columns" %

1444 (str(type(columns[0]))))

1445

1446 hist = {}

1447 if functionWeight is not None:

1448 histWeight = {}

1449

1450 for row in self.values:

1451 v = self._interpret_row(row)

1452 key = functionKey(v)

1453 w = 1. if functionWeight is None else functionWeight(v)

1454

1455 if key not in hist:

1456 histWeight[key] = [w]

1457 hist[key] = [[f(v) * w] for f in functionsValue]

1458 else:

1459 histWeight[key].append(w)

1460 h = hist[key]

1461 for i, f in enumerate(functionsValue):

1462 h[i].append(f(v) * w)

1463

1464 for key in hist: # pylint: disable=C0206

1465 h = hist[key]

1466 w = sum(histWeight[key])

1467 for i in range(0, len(h)):

1468 h[i] = functionsAgg[i](h[i], w)

1469

1470 f = hist.items if sys.version_info.major >= 3 else hist.items

1471 histValues = [[k, sum(histWeight[k])] + v for k, v in f()]

1472

1473 if columns is None:

1474 columns = ["key", "weight"] + ["val%d" %

1475 i for i, f in enumerate(functionsValue)]

1476 ret = self._private_getclass()(columns, histValues)

1477 return ret

1478 else:

1479 for row in self.values:

1480 v = self._interpret_row(row)

1481 key = functionKey(v)

1482 if key not in hist:

1483 hist[key] = [[f(v)] for f in functionsValue]

1484 else:

1485 h = hist[key]

1486 for i, f in enumerate(functionsValue):

1487 h[i].append(f(v))

1488

1489 for key in hist: # pylint: disable=C0206

1490 h = hist[key]

1491 for i in range(0, len(h)):

1492 h[i] = functionsAgg[i](h[i])

1493

1494 f = hist.items if sys.version_info.major >= 3 else hist.items

1495 histValues = [[k] + v for k, v in f()]

1496

1497 if columns is None:

1498 columns = ["key"] + ["val%d" %

1499 i for i, f in enumerate(functionsValue)]

1500 ret = self._private_getclass()(columns, histValues)

1501 return ret

1502

1503 def sort(self, functionValue, reverse=False):

1504 """

1505 Example:

1506

1507 ::

1508

1509 table.sort(lambda v: v["d_b"] + v["d_c"])

1510 """

1511 values = [(functionValue(self._interpret_row(row)), i)

1512 for i, row in enumerate(self.values)]

1513 values.sort(reverse=reverse)

1514 self.values = [self.values[_[1]] for _ in values]

1515 return self

1516

1517 def extract_columns(self, listColumns):

1518 """

1519 extract some columns

1520

1521 @param listColumns list of columns to remove or a function

1522 which returns True if the column has to be extracted

1523 based on its name

1524 @return table

1525

1526 Example:

1527

1528 ::

1529

1530 ext = table.extract_columns(["name", "d_a"])

1531 """

1532 if isinstance(listColumns, list):

1533 indexes = [(self.index[col] if isinstance(col, str) else col)

1534 for col in listColumns]

1535 header = listColumns

1536 values = [[row[i] for i in indexes] for row in self.values]

1537 return self._private_getclass()(header, values)

1538 else:

1539 header = [_ for _ in self.header if listColumns(_)]

1540 return self.extract_columns(header)

1541

1542 def remove_columns(self, listColumns):

1543 """

1544 remove some columns

1545

1546 @param listColumns list of columns to remove or a function

1547 which returns True if the column has to be removed

1548 based on its name

1549 @return table

1550

1551 Example:

1552

1553 ::

1554

1555 rem = table.remove("d_a")

1556 """

1557 if isinstance(listColumns, list):

1558 cols = [_ for i, _ in enumerate(

1559 self.header) if _ not in listColumns and i not in listColumns]

1560 return self.extract_columns(cols)

1561 if isinstance(listColumns, str):

1562 cols = [_ for _ in self.header if _ != listColumns]

1563 return self.extract_columns(cols)

1564 cols = [_ for _ in self.header if not listColumns(_)]

1565 return self.extract_columns(cols)

1566

1567 def innerjoin(self, table, functionKey1, functionKey2, nameKey="key",

1568 addSuffixAnyWay=False, prefixToAdd=None, full=False,

1569 keepKey=True, putKeyInColumn=None, missingValue=None,

1570 uniqueKey=False):

1571 """

1572 @param table other table to join with

1573 @param functionKey1 key for the first table(a function)

1574 @param functionKey2 key for the second table(a function) innerjoin .... ON ...

1575 @param addSuffixAnyWay add a suffix to every column from the second table even

1576 if names are different(suffix is "+")

1577 @param prefixToAdd prefix to add the the columns of the second table

1578 @param full add all items even if there is no common keys(``FULL OUTER JOIN``),

1579 otherwise keep only common keys

1580 @param keepKey keep the key as a column in the result(column is key), otherwise not

1581 @param putKeyInColumn private parameter: keepKey has to be true and in this case,

1582 put the key in this column(integer)

1583 @param missingValue when there is not key on one side, this default value will be put in place

1584 @param uniqueKey if True, the function assumes there is a bijection between rows

1585 and keys(one row <--> one key) on both tables,

1586 otherwise, it will not.

1587 @return a table

1588

1589 Example:

1590

1591 ::

1592

1593 innerjoin = table.innerjoin(group, lambda v: v["name"],

1594 lambda v: v["name"], "group")

1595 """

1596 defaultVal1 = [missingValue for k in self.header]

1597 defaultVal2 = [missingValue for k in table.header]

1598

1599 if uniqueKey:

1600 keys = {}

1601 for row in self.values:

1602 v = self._interpret_row(row)

1603 key = functionKey1(v)

1604 keys[key] = (row, None)

1605

1606 for row in table.values:

1607 v = table._interpret_row(row)

1608 key = functionKey2(v)

1609 if key in keys:

1610 keys[key] = (keys[key][0], row)

1611 elif full:

1612 keys[key] = (None, row)

1613

1614 if not full:

1615 d = []

1616 for k, v in keys.items():

1617 if None in v:

1618 d.append(k)

1619 for _ in d:

1620 del keys[_]

1621 else:

1622 for k in keys: # pylint: disable=C0206

1623 v = keys[k]

1624 if v[0] is None:

1625 keys[k] = (defaultVal1, v[1]) # pylint: disable=E4702

1626 elif v[1] is None:

1627 keys[k] = (v[0], defaultVal2) # pylint: disable=E4702

1628

1629 if keepKey:

1630 columns = [nameKey]

1631 for x in self.header:

1632 while x in columns:

1633 x += "~"

1634 columns.append(x)

1635

1636 for x in table.header:

1637 if prefixToAdd is not None:

1638 x = prefixToAdd + x

1639 elif addSuffixAnyWay:

1640 x += "+"

1641 while x in columns:

1642 x += "+"

1643 columns.append(x)

1644

1645 f = keys.items if sys.version_info.major >= 3 else keys.items

1646 values = [[k] + v[0] + v[1] for k, v in f() if len(v) == 2]

1647 return self._private_getclass()(columns, values)

1648 else:

1649 columns = []

1650 for x in self.header:

1651 while x in columns:

1652 x += "~"

1653 columns.append(x)

1654

1655 for x in table.header:

1656 if prefixToAdd is not None:

1657 x = prefixToAdd + x

1658 elif addSuffixAnyWay:

1659 x += "+"

1660 while x in columns:

1661 x += "+"

1662 columns.append(x)

1663

1664 f = keys.items if sys.version_info.major >= 3 else keys.items

1665

1666 if putKeyInColumn is None:

1667 values = [v[0] + v[1] for k, v in f() if len(v) == 2]

1668 else:

1669 values = []

1670 for k, v in f():

1671 if len(v) == 2:

1672 nr = v[0] + v[1]

1673 nr[putKeyInColumn] = k

1674 values.append(nr)

1675

1676 return self._private_getclass()(columns, values)

1677 else:

1678 keys = {}

1679 for row in self.values:

1680 v = self._interpret_row(row)

1681 key = functionKey1(v)

1682 if key in keys:

1683 keys[key][0].append(row)

1684 else:

1685 keys[key] = ([row], None)

1686

1687 for row in table.values:

1688 v = table._interpret_row(row)

1689 key = functionKey2(v)

1690 if key in keys:

1691 if keys[key][1] is None:

1692 keys[key] = (keys[key][0], [row])

1693 else:

1694 keys[key][1].append(row)

1695 elif full:

1696 keys[key] = (None, [row])

1697

1698 if not full:

1699 d = []

1700 for k, v in keys.items():

1701 if None in v:

1702 d.append(k)

1703 for _ in d:

1704 del keys[_]

1705 else:

1706 for k in keys: # pylint: disable=C0206

1707 v = keys[k]

1708 if v[0] is None:

1709 keys[k] = ([defaultVal1], v[1]

1710 ) # pylint: disable=E4702

1711 elif v[1] is None:

1712 keys[k] = (v[0], [defaultVal2]

1713 ) # pylint: disable=E4702

1714

1715 if keepKey:

1716 columns = [nameKey]

1717 for x in self.header:

1718 while x in columns:

1719 x += "~"

1720 columns.append(x)

1721

1722 for x in table.header:

1723 if prefixToAdd is not None:

1724 x = prefixToAdd + x

1725 elif addSuffixAnyWay:

1726 x += "+"

1727 while x in columns:

1728 x += "+"

1729 columns.append(x)

1730

1731 f = keys.items if sys.version_info.major >= 3 else keys.items

1732

1733 values = []

1734 for k, v in f():

1735 if len(v) == 2:

1736 for ka in v[0]:

1737 for kb in v[1]:

1738 values.append([k] + ka + kb)

1739 return self._private_getclass()(columns, values)

1740 else:

1741 columns = []

1742 for x in self.header:

1743 while x in columns:

1744 x += "~"

1745 columns.append(x)

1746

1747 for x in table.header:

1748 if prefixToAdd is not None:

1749 x = prefixToAdd + x

1750 elif addSuffixAnyWay:

1751 x += "+"

1752 while x in columns:

1753 x += "+"

1754 columns.append(x)

1755

1756 f = keys.items if sys.version_info.major >= 3 else keys.items

1757

1758 if putKeyInColumn is None:

1759 values = [v[0] + v[1] for k, v in f() if len(v) == 2]

1760 else:

1761 values = []

1762 for k, v in f():

1763 if len(v) == 2:

1764 for ka in v[0]:

1765 for kb in v[1]:

1766 nr = ka + kb

1767 nr[putKeyInColumn] = k

1768 values.append(nr)

1769

1770 return self._private_getclass()(columns, values)

1771

1772 def filter_quantile(self, function, alpha_min=0.025, alpha_max=0.025):

1773 """

1774 sort all rows using criteria defined by function and remove

1775 rows at the extremes

1776

1777 @param function values used to estimate the quantiles

1778 @param alpha_min lower quantile

1779 @param alpha_max higher quantile

1780 @return a table containing all the rows where the criterium

1781 is within the two quantiles

1782

1783 Example:

1784

1785 ::

1786

1787 fil = table.filter_quantile(lambda v: v["d_b"], 0, 0.4)

1788

1789 @warning Rows are not copied.

1790 """

1791 values = []

1792 for row in self.values:

1793 v = self._interpret_row(row)

1794 val = function(v)

1795 values.append((val, row))

1796 values.sort()

1797 lv = len(values)

1798 i1 = int(lv * alpha_min + 0.5)

1799 i2 = int(lv * (1 - alpha_max) + 0.5)

1800 i1 = max(i1, 0)

1801 i1 = min(i1, lv)

1802 i2 = max(i1, i2)

1803 i2 = min(i2, lv)

1804 if i2 == i1:

1805 raise RuntimeError("unable to extract quantile, the table is either "

1806 "empty or chosen quantile are not correct")

1807 values = [_[1] for _ in values[i1:i2]]

1808 return self._private_getclass()(self.header, values)

1809

1810 def union(self, table):

1811 """

1812 @param table table

1813 @return table(with the same number of columns)

1814

1815 concatenates two tables by rows, they must have the same header, rows of both tables are merged into a single matrix

1816 Example:

1817

1818 ::

1819

1820 union = table.union(table2)

1821 """

1822 if len(self.header) != len(table.header):

1823 raise ValueError( # pragma: no cover

1824 "tables do not have the same number of columns\ntbl1: %s\ntbl2: %s" % (

1825 ",".join(self.header), ",".join(table.header)))

1826 for a, b in zip(self.header, table.header):

1827 if a != b:

1828 raise ValueError( # pragma: no cover

1829 "tables do not have the same column names")

1830 return self._private_getclass()(self.header, self.values + table.values)

1831

1832 def concatenate(self, table, addPrefix=""):

1833 """

1834 concatenates two tables by columns

1835 @param table table

1836 @param addPrefix add a prefix to each column from table

1837 @return table (with the same number of rows as the longest one)

1838 """

1839 maxr = max(len(self), len(table))

1840 header = self.header + [addPrefix + h for h in table.header]

1841 values = []

1842 for i in range(0, maxr):

1843 r1 = self.values[i] if i < len(self) else [None] * len(self.header)

1844 r2 = table.values[i] if i < len(

1845 table) else [None] * len(self.table)

1846 values.append(r1 + r2)

1847 return self._private_getclass()(header, values)

1848

1849 def random(self, n, unique=False):

1850 """

1851 select n random row from the table, returns a table

1852

1853 @param n number of desired random rows

1854 @param unique draws unique rows or non unique rows

1855 (tirage sans remise ou avec remise)

1856 @return a table

1857

1858 Example:

1859

1860 ::

1861

1862 rnd = table.random(10)

1863 """

1864 if unique:

1865 if n > len(self):

1866 raise ValueError( # pragma: no cover

1867 "number of desired random rows is higher "

1868 "than the number of rows in the table")

1869 index = {}

1870 while len(index) < n:

1871 h = random.randint(0, len(self) - 1)

1872 index[h] = 0

1873 values = [self.values[h] for h in index]

1874 return self._private_getclass()(self.header, values)

1875 else:

1876 values = []

1877 for i in range(0, n):

1878 h = random.randint(0, len(self) - 1)

1879 values.append(self.values[h])

1880 return self._private_getclass()(self.header, values)

1881

1882 def todict(self, functionKey, functionValue, useList=False):

1883 """

1884 convert the table as a dictionary { key:value }

1885 each of them is defined by functions.

1886

1887 @param functionKey defines the key

1888 @param functionValue defines the value

1889 @param useList if there are multiple rows sharing the same key, it should be true,

1890 all values are stored in a list

1891 @return a dictionary { key:row } or { key: [row1, row2, ...] }

1892

1893 Example:

1894

1895 ::

1896

1897 d = table.todict(lambda v: v["name"], lambda v: v["d_b"], True)

1898 """

1899 res = {}

1900 if useList:

1901 for row in self.values:

1902 v = self._interpret_row(row)

1903 key = functionKey(v)

1904 val = functionValue(v)

1905 if key in res:

1906 res[key].append(val)

1907 else:

1908 res[key] = [val]

1909 else:

1910 for row in self.values:

1911 v = self._interpret_row(row)

1912 key = functionKey(v)

1913 val = functionValue(v)

1914 res[key] = val

1915 return res

1916

1917 def reduce_dict(self, functionKey, functionValue, uselist=False):

1918 """

1919 @see me todict

1920 """

1921 return self.todict(functionKey, functionValue, uselist)

1922

1923 def select(self, functionRow):

1924 """

1925 @param functionRow fonction

1926 @return table

1927

1928 Example:

1929

1930 ::

1931

1932 d = table.select(lambda v:(v["name"], v["d_b"]))

1933 print(list(d))

1934 """

1935 for row in self.values:

1936 v = self._interpret_row(row)

1937 nr = functionRow(v)

1938 yield nr

1939

1940 def modify_all(self, modification_function):

1941 """

1942 apply the same modification to every number

1943 @param modification_function modification to apply to every number

1944 @return new table

1945

1946 The signature of the function is the following one:

1947

1948 ::

1949

1950 def function(value, column_name):

1951 # ....

1952 return new_value

1953

1954 Example:

1955

1956 ::

1957

1958 tbl = tbl.modify_all(lambda v,c: {"string":"", "numerical":0}.get(c,None) if v is None else v)

1959 """

1960 values = []

1961 for row in self.values:

1962 r = []

1963 for v, h in zip(row, self.header):

1964 r.append(modification_function(v, h))

1965 values.append(r)

1966 return self._private_getclass()(self.header, values)

1967

1968 def dcast(self, functionKey, functionInstance, full=True):

1969 """

1970 @see me multiply_column_by_row_instance

1971 """

1972 return self.multiply_column_by_row_instance(functionKey, functionInstance, full)

1973

1974 def multiply_column_by_row_instance(self, functionKey, functionInstance, full=True):

1975 """

1976 @param functionKey defines a key(function)

1977 @param functionInstance defines a second key(will be moved to the columns dimension)

1978 @param full introduces missing values for not found combinations

1979 @return a table

1980

1981 If a column contains a finite set of value, for example,

1982 we have the temperature for several cities organized like if

1983 it were a table from a database: city, date, temperatue.

1984 We would like to get another table where we have:

1985 date temparature_city1 temperature_city2...

1986

1987 Then we would type:

1988 Example:

1989

1990 ::

1991

1992 mul = table.multiply_column_by_row_instance(

1993 lambda v: v["date"],

1994 lambda v: v["city"])

1995

1996 The input table would be like:

1997

1998 ::

1999

2000 city date

2001 A jan

2002 A feb

2003 B feb

2004

2005 It returns:

2006

2007 ::

2008

2009 KEY A|city A|date B|city B|date

2010 feb A feb B feb

2011 jan A jan None None

2012 """

2013 values = [functionInstance(self._interpret_row(row))

2014 for row in self.values]

2015 distinct = {}

2016 for v in values:

2017 distinct[v] = 0

2018 distinct = [_ for _ in distinct]

2019 distinct.sort()

2020 table1 = copy.deepcopy(self)

2021 table = None

2022 header = copy.copy(table1.header)

2023 orig = len(header)

2024 nameKey = "~KEY~"

2025 while nameKey in header:

2026 nameKey += "*"

2027 nbJoin = 0

2028

2029 for val in distinct:

2030 table2 = table1.filter(

2031 lambda v, val=val: functionInstance(v) == val)

2032 if table is None:

2033 table = table2.copy()

2034 else:

2035 colkey = table.header[0]

2036 table = table.innerjoin(table2, functionKey if nbJoin == 0 else (lambda v, c=colkey: v[c]),

2037 functionKey, nameKey=nameKey,

2038 prefixToAdd=str(val) + "|",

2039 full=full, keepKey=nbJoin == 0,

2040 putKeyInColumn=None if nbJoin == 0 else 0,

2041 uniqueKey=True)

2042

2043 if nbJoin == 0:

2044 head = []

2045 nb = 0

2046 for h in table.header:

2047 if not h.endswith("~") and nb < orig:

2048 head.append(f"{distinct[0]}|{h}")

2049 nb += 1

2050 else:

2051 head.append(h)

2052 header = ["KEY"] + head[1:]

2053 table = self._private_getclass()(header, table.values)

2054

2055 nbJoin += 1

2056

2057 if nbJoin == 0:

2058 head = []

2059 nb = 0

2060 for h in table.header:

2061 if not h.endswith("~") and nb < orig:

2062 head.append(f"{distinct[0]}|{h}")

2063 nb += 1

2064 else:

2065 head.append(h)

2066 values = []

2067 for row in self.values:

2068 v = self._interpret_row(row)

2069 r = [functionKey(v)] + row

2070 values.append(r)

2071 header = ["KEY"] + head

2072 table = self._private_getclass()(header, values)

2073

2074 return table

2075

2076 def create_index(self, functionIndex):

2077 """

2078 this method creates an index,

2079 to get an indexes row, use method get

2080 Example:

2081

2082 ::

2083

2084 table.create_index(lambda v:(v["name"], v["d_a"]))

2085 row = table.get(('A', 1.1))

2086 value = table.get(('A', 1.1), 2)

2087 """

2088 self.indexspecial = {}

2089 for row in self.values:

2090 v = self._interpret_row(row)

2091 nr = functionIndex(v)

2092 if nr in self.indexspecial:

2093 raise KeyError(

2094 f"unable to add {str(nr)} because it is already present")

2095 self.indexspecial[nr] = row

2096 return self

2097

2098 def get(self, rowIndex, column=None):

2099 """

2100 use the index created by method create_index

2101 Example:

2102

2103 ::

2104

2105 table.create_index(lambda v:(v["name"], v["d_a"]))

2106 row = table.get(('A', 1.1))

2107 value = table.get(('A', 1.1), 2)

2108 """

2109 if "indexspecial" not in self.__dict__:

2110 raise RuntimeError("no index was created")

2111 row = self.indexspecial[rowIndex]

2112 if column is None:

2113 return row

2114 elif isinstance(column, int):

2115 return row[column]

2116 else:

2117 return row[self.index[column]]

2118

2119 def avg_std(self, functionValue, functionWeight=lambda v: 1):

2120 """

2121 returns the average and standard deviation

2122 """

2123 avg = 0.

2124 std = 0.

2125 n = 0.

2126 for i, row in enumerate(self.values):

2127 v = self._interpret_row(row)

2128 x = float(functionValue(v))

2129 w = functionWeight(v)

2130 avg += x * w

2131 std += x * x * w

2132 n += w

2133

2134 if n != 0:

2135 avg /= n

2136 std /= n

2137 std -= avg * avg

2138 std = math.sqrt(std)

2139 else:

2140 avg = 0.

2141 std = 0.

2142 return avg, std

2143

2144 def add_column_cumulative(self, column_index, column_name, functionIndex, functionValue,

2145 normalize=False, reverse=False, cumulative=True, functionSort=None):

2146 """

2147 also called the Gini function

2148 Example:

2149

2150 ::

2151

2152 table.add_column_cumulative("index_%s" % col, "dist_%s" % col,

2153 lambda v: v["sum_nbclient"], lambda v: v[col],

2154 functionSort = lambda v: v [col] / v["sum_nbclient"],

2155 normalize=True)

2156 """

2157 if functionSort is None:

2158 functionSort = functionValue

2159 val = []

2160 for row in self.values:

2161 v = self._interpret_row(row)

2162 i = functionIndex(v)

2163 s = functionSort(v)

2164 v = functionValue(v)

2165 val.append((s, i, v))

2166 val.sort(reverse=reverse)

2167

2168 if cumulative:

2169 res = [(0., 0.)]

2170 for s, i, v in val:

2171 res.append((i + res[-1][0], v + res[-1][1]))

2172 del res[0]

2173

2174 if normalize:

2175 sumi = res[-1][0]

2176 sumv = res[-1][1]

2177 if sumi != 0 and sumv != 0:

2178 res = [(_[0] / sumi, _[1] / sumv) for _ in res]

2179 else:

2180 raise ZeroDivisionError(

2181 "cannot divide by zero, all indexes or all values are null")

2182 else:

2183 res = [(i, v) for s, i, v in val]

2184

2185 if normalize:

2186 sumi = sum([_[0] for _ in res])

2187 sumv = sum([_[1] for _ in res])

2188 if sumi != 0 and sumv != 0:

2189 res = [(_[0] / sumi, _[1] / sumv) for _ in res]

2190 else:

2191 raise ZeroDivisionError(

2192 "cannot divide by zero, all indexes or all values are null")

2193

2194 for row, add in zip(self.values, res):

2195 row.extend(add)

2196 self.index[column_index] = len(self.index)

2197 self.index[column_name] = len(self.index)

2198 self.header.append(column_index)

2199 self.header.append(column_name)

2200 return self

2201

2202 def transpose(self, labelC=None, labelAsRow=True):

2203 """

2204 Computes the transpose.

2205 @param labelC proposes labels for the column,

2206 if None, take "r%d" % i,

2207 if it is a string, the function assumes it is a column name

2208 @param labelAsRow add the label as a row

2209 @return new table

2210 """

2211 if labelC is None:

2212 label = ["r%d" % i for i in range(0, len(self.values))]

2213 if labelAsRow:

2214 label = ["rowheader"] + label

2215 rem = None

2216 elif isinstance(labelC, str):

2217 label = list(self.select(lambda v: v[labelC]))

2218 rem = label

2219 else:

2220 rem = None

2221 label = labelC

2222

2223 values = []

2224 for i in range(0, len(self.header)):

2225 if rem is not None and self.header[i] == labelC:

2226 continue

2227 row = [_[i] for _ in self.values]

2228 if labelAsRow:

2229 row = [self.header[i]] + row

2230 values.append(row)

2231 return self._private_getclass()(label, values)

2232

2233 def covariance(self):

2234 """

2235 Computes the covariance matrix, the first column

2236 will contains the column names.

2237 @return new table

2238 """

2239 for i, x in enumerate(self.values[0]):

2240 if not isinstance(x, float):

2241 raise TypeError("expecting a float on column %d" % i)

2242 values = self.np_matrix

2243 N = values.shape[0]

2244 sums = numpy.sum(values, axis=0) / N

2245 for i in range(0, values.shape[1]):

2246 values[:, i] -= sums[0, i]

2247 cov = values.transpose() * values

2248 cov /= N

2249 head = ["var"] + self.header

2250 size = cov.shape

2251 values = [[self.header[

2252 i]] + [float(cov[i, j]) for j in range(0, size[1])] for i in range(0, size[0])]

2253 tbl = self._private_getclass()(head, values)

2254 return tbl

2255

2256 def correlation_col(self, col1, col2, noCenter=False):

2257 """

2258 Computes the correlation between two columns.

2259 @param col1 column 1

2260 @param col2 column 2

2261 @param noCenter does the computation without removing the average

2262 @return float(covariance)

2263 """

2264 values = [[self._interpret_row(row)[col1], self._interpret_row(row)[

2265 col2]] for row in self.values]

2266 if len(values) <= 1:

2267 raise ValueError( # pragma: no cover

2268 f"expecting more than one observation, not {len(values)}")

2269 mx = 0.

2270 my = 0.

2271 vx = 0.

2272 vy = 0.

2273 co = 0.

2274 nb = 0.

2275 for a, b in values:

2276 nb += 1

2277 mx += a

2278 my += b

2279 vx += a ** 2

2280 vy += b ** 2

2281 co += a * b

2282 mx /= nb

2283 my /= nb

2284 vx /= nb

2285 vy /= nb

2286 co /= nb

2287 if not noCenter:

2288 vx -= mx ** 2

2289 vy -= my ** 2

2290 co -= mx * my

2291 vx = vx ** 0.5

2292 vy = vy ** 0.5

2293 v = vx * vy

2294 if v != 0:

2295 co /= v

2296 return co

2297

2298 def covariance_col(self, col1, col2, noCenter=False):

2299 """

2300 Computes the correlation between two columns.

2301 @param col1 column 1

2302 @param col2 column 2

2303 @param noCenter does the computation without removing the average

2304 @return float(covariance)

2305 """

2306 values = [[self._interpret_row(row)[col1],

2307 self._interpret_row(row)[col2]] for row in self.values]

2308

2309 if len(values) <= 1:

2310 raise ValueError( # pragma: no cover

2311 f"expecting more than one observation, not {len(values)}")

2312

2313 mx = 0.

2314 my = 0.

2315 co = 0.

2316 nb = 0.

2317 for a, b in values:

2318 nb += 1

2319 mx += a

2320 my += b

2321 co += a * b

2322 mx /= nb

2323 my /= nb

2324 co /= nb

2325 if not noCenter:

2326 co -= mx * my

2327 return co

2328

2329 def correlation_row(self, row1, row2, noCenter=False):

2330 """

2331 computes the correlation between two columns

2332 @param row1 row 1(integer)

2333 @param row2 row 2(integer)

2334 @param noCenter does the computation without removing the average

2335 @return float(covariance)

2336 """

2337 values = [[a, b] for a, b in zip(self.values[row1], self.values[row2])]

2338 if len(values) <= 1:

2339 raise ValueError( # pragma: no cover

2340 f"expecting more than one observation, not {len(values)}")

2341 mx = 0.

2342 my = 0.

2343 vx = 0.

2344 vy = 0.

2345 co = 0.

2346 nb = 0.

2347 for a, b in values:

2348 nb += 1

2349 mx += a

2350 my += b

2351 vx += a ** 2

2352 vy += b ** 2

2353 co += a * b

2354 mx /= nb

2355 my /= nb

2356 vx /= nb

2357 vy /= nb

2358 co /= nb

2359 if not noCenter:

2360 vx -= mx ** 2

2361 vy -= my ** 2

2362 co -= mx * my

2363 vx = vx ** 0.5

2364 vy = vy ** 0.5

2365 v = vx * vy

2366 if v != 0:

2367 co /= v

2368 return co

2369

2370 def covariance_row(self, row1, row2, noCenter=False):

2371 """

2372 computes the correlation between two columns

2373 @param row1 row 1(integer)

2374 @param row2 row 2(integer)

2375 @param noCenter does the computation without removing the average

2376 @return float(covariance)

2377 """

2378 values = [[a, b] for a, b in zip(self.values[row1], self.values[row2])]

2379 if len(values) <= 1:

2380 raise ValueError( # pragma: no cover

2381 f"expecting more than one observation, not {len(values)}")

2382 mx = 0.

2383 my = 0.

2384 co = 0.

2385 nb = 0.

2386 for a, b in values:

2387 nb += 1

2388 mx += a

2389 my += b

2390 co += a * b

2391 mx /= nb

2392 my /= nb

2393 co /= nb

2394 if not noCenter:

2395 co -= mx * my

2396 return co

2397

2398 def correlation(self, useBootstrap=False, collapseFormat=True, nbdraws=-1, alpha=0.05,

2399 functionKeepValue=lambda val, low, high: f"{val:f}|{low:f},{high:f}"):

2400 """

2401 Computes the correlation matrix, the first column

2402 will contains the column names.

2403

2404 @param useBootstrap if True, use a bootstrap method to estimate the correlation

2405 @param collapseFormat if True and useBootstrap is True, produces a format

2406 ``average|lower bound|higher bound(at a definite confidence level)``

2407 @param nbdraws number of draws(if -1, then it will be equal to the number of observations)

2408 @param alpha confidence level

2409 @param functionKeepValue if collapseFormat is True, this function is used to collapse val,low,high in a single string

2410 @return new table

2411 """

2412 if useBootstrap:

2413 head = ["var"] + self.header

2414 values = [[i] + [0. for r in self.header] for i in self.header]

2415 for i in range(len(self.header)):

2416 values[i][0] = self.header[i]

2417 for j in range(len(self.header)):

2418 vs = [[row[i], row[j]] for row in self.values]

2419 bo = TableFormula.bootstrap(vs, function=TableFormula.correlation_bicolumn,

2420 nbdraws=nbdraws, alpha=alpha)

2421 if collapseFormat:

2422 st = functionKeepValue(bo[0], bo[2], bo[3])

2423 values[i][j + 1] = st

2424 else:

2425 raise NotImplementedError( # pragma: no cover

2426 "collapseFormat False is not implemented yet")

2427 tbl = self._private_getclass()(head, values)

2428 return tbl

2429 else:

2430 for i, x in enumerate(self.values[0]):

2431 if not isinstance(x, float):

2432 raise TypeError( # pragma: no cover

2433 "expecting a float on column %d" % i)

2434

2435 values = self.np_matrix

2436 N = values.shape[0]

2437 sums = [sum(values[:, i]) / N for i in range(0, values.shape[1])]

2438

2439 for i in range(0, values.shape[1]):

2440 values[:, i] -= sums[i]

2441

2442 cov = values.transpose() * values

2443 cov /= N

2444 diag = [cov[i, i] ** 0.5 for i in range(cov.shape[0])]

2445 for i in range(cov.shape[0]):

2446 if diag[i] > 0:

2447 cov[i, :] /= diag[i]

2448 cov[:, i] /= diag[i]

2449

2450 head = ["var"] + self.header

2451 size = cov.shape

2452 values = [[self.header[

2453 i]] + [float(cov[i, j]) for j in range(0, size[1])] for i in range(0, size[0])]

2454 tbl = self._private_getclass()(head, values)

2455 return tbl

2456

2457 def values_to_float(self, only_if_possible=False, subset_columns=None):

2458 """

2459 converts all values into float

2460 @param only_if_possible if True, converts all possible values and catches exception,

2461 if False, converts everything, raises an exception when not possible

2462 @param subset_columns if None, takes all of them, otherwise, try to convert

2463 only for the listed columns

2464 @return table

2465 """

2466 tbl = self.copy()

2467 if subset_columns is not None:

2468 subset = {i: True for i, v in enumerate(

2469 self.header) if v in subset_columns}

2470 if only_if_possible:

2471 for row in tbl.values:

2472 for i in range(0, len(row)):

2473 if subset_columns is None or i in subset:

2474 try:

2475 v = float(row[i])

2476 row[i] = v

2477 except (ValueError, TypeError):

2478 continue

2479 else:

2480 for row in tbl.values:

2481 for i in range(0, len(row)):

2482 if subset_columns is None or i in subset:

2483 row[i] = float(row[i])

2484 return tbl

2485

2486 def values_to_str(self, subset_columns=None, format=None):

2487 """

2488 converts all values into str

2489 @param subset_columns if None, takes all of them, otherwise, try to convert

2490 only for the listed columns

2491 @param format format for the conversion, by None by default but it could be for exemple %1.2f.

2492 @return table

2493 """

2494 tbl = self.copy()

2495 if subset_columns is not None:

2496 subset = {i: True for i, v in enumerate(

2497 self.header) if v in subset_columns}

2498

2499 if format is None:

2500 for row in tbl.values:

2501 for i in range(0, len(row)):

2502 if subset_columns is None or i in subset:

2503 row[i] = str(row[i])

2504 else:

2505 for row in tbl.values:

2506 for i in range(0, len(row)):

2507 if (subset_columns is None or i in subset) and isinstance(row[i], float):

2508 row[i] = format % row[i]

2509 return tbl

2510

2511 def values_to_date(self, format=None, only_if_possible=False, subset_columns=None):

2512 """

2513 converts all values into dates

2514 @param only_if_possible if True, converts all possible values and catches exception,

2515 if False, converts everything, raises an exception when not possible

2516 @param format date format see fn str_to_datetime

2517 @param subset_columns if None, takes all of them, otherwise, try to convert

2518 only for the listed columns

2519 @return table

2520 """

2521 tbl = self.copy()

2522 if subset_columns is not None:

2523 subset = {i: True for i, v in enumerate(

2524 self.header) if v in subset_columns}

2525 if only_if_possible:

2526 if subset_columns is not None:

2527 subset = {i: True for i, v in enumerate(

2528 self.header) if v in subset_columns}

2529

2530 for row in tbl.values:

2531 for i in range(0, len(row)):

2532 if subset_columns is None or i in subset:

2533 try:

2534 v = str2datetime(row[i], format)

2535 row[i] = v

2536 except (ValueError, TypeError):

2537 continue

2538 else:

2539 for row in tbl.values:

2540 for i in range(0, len(row)):

2541 if subset_columns is None or i in subset:

2542 row[i] = float(row[i])

2543 return tbl

2544

2545 def histogram(self, functionValue, nbDiv=100, secondColumnIsWeight=False,

2546 normalize=True, removeExtreme=0.05):

2547 """

2548 computes an histograms on one vector

2549 @param functionValue function which produces the value to histogram

2550 @param nbDiv number of divisions for this histograms(boundaries are min and max)

2551 @param secondColumnIsWeight if True, the second column is the weight

2552 @param normalize if True, normalize by the sum of weights

2553 @param removeExtreme remove extreme values at both sides(0.05 means 0.025 on each side)

2554 @return table with two columns

2555 """

2556 values = list([functionValue(self._interpret_row(row)) # pylint: disable=R1728

2557 for row in self.values])

2558 if removeExtreme is not None and removeExtreme > 0:

2559 values.sort()

2560 al = int(len(values) * removeExtreme / 2)

2561 if al == 0:

2562 raise RuntimeError( # pragma: no cover

2563 "removeExtreme has no impact(%d,%f)" % (

2564 len(values), len(values) * removeExtreme / 2))

2565 if al * 2 < len(values):

2566 values = values[al:len(values) - al]

2567

2568 mi = min(values)

2569 ma = max(values)

2570

2571 if isinstance(values[0], (tuple, list)):

2572 W = 0.

2573 div = (ma[0] - mi[0]) / nbDiv

2574 hist = [[mi[0] + n * div, 0.] for n in range(0, nbDiv + 1)]

2575 for v in values:

2576 x = int((v[0] - mi[0]) // div)

2577 hist[x][1] += v[1]

2578 W += v[1]

2579 mi = mi[0]

2580 else:

2581 W = len(values)

2582 div = (ma - mi) / nbDiv

2583 hist = [[mi + n * div, 0.] for n in range(0, nbDiv + 1)]

2584 for v in values:

2585 x = int((v - mi) // div)

2586 if 0 <= x < len(hist):

2587 hist[x][1] += 1.

2588

2589 if normalize and W > 0:

2590 for i in range(len(hist)):

2591 hist[i][1] /= W

2592

2593 values = [[mi + n * div, hist[n]] for n in range(len(hist))]

2594 tbl = self._private_getclass()(["x", "hist(x)"], hist)

2595 return tbl

2596

2597 def histograms(self, columnsSet, nbDiv=100, secondColumnIsWeight=False,

2598 normalize=True, removeExtreme=0.05, histxName="histKey"):

2599 """

2600 computes a common histograms on all columns

2601 @param columnsSet set of columns

2602 @param nbDiv number of divisions for this histograms(boundaries are min and max)

2603 @param secondColumnIsWeight if True, the second column is the weight

2604 @param normalize if True, normalize by the sum of weights

2605 @param removeExtreme remove extreme values at both sides(0.05 means 0.025 on each side)

2606 @param histxName column name given to the x axis shared by every histogram

2607 @return table with two columns

2608

2609 @warning The function skips any NaN of Inf value.

2610 """

2611 values = []

2612 for row in self.values:

2613 temp = self._interpret_row(row)

2614 for t in columnsSet:

2615 values.append(temp[t])

2616

2617 if removeExtreme is not None and removeExtreme > 0:

2618 values.sort()

2619 al = int(len(values) * removeExtreme / 2)

2620 if al == 0:

2621 raise RuntimeError( # pragma: no cover

2622 "removeExtreme has no impact(%d,%f)" % (

2623 len(values), len(values) * removeExtreme / 2))

2624 if al * 2 < len(values):

2625 values = values[al:len(values) - al]

2626

2627 mi = min(values)

2628 ma = max(values)

2629 W = len(values)

2630 div = (ma - mi) / nbDiv

2631 if div == 0:

2632 raise RuntimeError( # pragma: no cover

2633 f"unable to continue since div is null: min,max = {mi:f},{ma:f}")

2634 hist = [[mi + n * div, 0.] for n in range(0, nbDiv + 1)]

2635 value = {i: {histxName: hist[i][0]} for i in range(len(hist))}

2636 su = {}

2637 for row in self.values:

2638 for _ in columnsSet:

2639 temp = self._interpret_row(row)

2640 if math.isnan(temp[_]) or math.isinf(temp[_]):

2641 continue

2642 x = int((temp[_] - mi) // div)

2643 if x not in value:

2644 # it means extremes were removed

2645 continue

2646 #raise RuntimeError("value %d,%f is not allowed min,max = [%f,%f]" %(x, temp[_], mi, ma))

2647 value[x][_] = value[x].get(_, 0.) + 1.

2648 su[_] = su.get(_, 0.) + 1.

2649

2650 if normalize and W > 0:

2651 for v in value.values():

2652 for _ in v:

2653 if _ != histxName:

2654 v[_] /= su[_]

2655

2656 tbl = self._private_getclass()("__byrow__", value)

2657 return tbl

2658

2659 def union_columns(self, columnsSet):

2660 """

2661 computes the union of all values from all columns present in columnSet

2662 @param columnsSet set of columns

2663 @return table

2664 """

2665 values = []

2666 for row in self.values:

2667 temp = self._interpret_row(row)

2668 for t in columnsSet:

2669 values.append(temp[t])

2670 tbl = self._private_getclass()(["x"], [[x] for x in values])

2671 return tbl

2672

2673 def mu_sigma(self, functionValues, removeExtreme=None):

2674 """

2675 computes the average and the standard deviation a vector of values

2676 @param functionValues function produces the vector of values

2677 @param removeExtreme remove extreme values at both sides(0.05 means 0.025 on each side)

2678 @return (average, standard deviation)

2679 """

2680 if removeExtreme is not None and removeExtreme > 0:

2681 values = []

2682 for row in self.values:

2683 row = self._interpret_row(row)

2684 val = functionValues(row)

2685 values.append(val)

2686 values.sort()

2687 al = int(len(values) * removeExtreme / 2)

2688 if al == 0:

2689 raise RuntimeError("removeExtreme has no impact(%d,%f)" % (

2690 len(values), len(values) * removeExtreme / 2))

2691 if al * 2 < len(values):

2692 values = values[al:len(values) - al]

2693 tbl = TableFormula(["x"], [[_] for _ in values])

2694 return tbl.mu_sigma(lambda v: v["x"], 0)

2695 else:

2696 mu = 0.

2697 si = 0.

2698 nb = 0.

2699 for row in self.values:

2700 row = self._interpret_row(row)

2701 val = functionValues(row)

2702 mu += val

2703 si += val ** 2

2704 nb += 1.

2705 mu /= nb

2706 si /= nb

2707 si -= mu ** 2

2708 return mu, si ** 0.5

2709

2710 def mu_sigma_each_column(self, columnsSet=None, removeExtreme=None):

2711 """

2712 returns a table with the average and the standard deviation for each columns

2713 @param removeExtreme remove extreme values at both sides(0.05 means 0.025 on each side)

2714 @param columnsSet set of column to deal with

2715 @return table with two rows: average and standard deviation

2716 """

2717 values = [[], []]

2718 if columnsSet is None:

2719 columnsSet = self.header

2720 for col in columnsSet:

2721 mu, sigma = self.mu_sigma(

2722 (lambda v, col=col: v[col]), removeExtreme)

2723 values[0].append(mu)

2724 values[1].append(sigma)

2725 tbl = self._private_getclass()(columnsSet, values)

2726 return tbl

2727

2728 @property

2729 def np_matrix(self):

2730 """

2731 returns the values as a numpy matrix

2732 @return numpy matrix

2733 """

2734 return numpy.matrix(self.values)

2735

2736 @property

2737 def np_array(self):

2738 """

2739 returns the values as a numpy array

2740 @return numpy array

2741 """

2742 return numpy.array(self.values)

2743

2744 @property

2745 def dataframe(self):

2746 """

2747 creates a pandas dataframe

2748 @return pandas.dataframe

2749 """

2750 return pandas.DataFrame(self.values, columns=self.header)

2751

2752 @property

2753 def json(self):

2754 """

2755 returns a json format

2756 @return string

2757 """

2758 rows = [row for row in self]

2759 return json.dumps(rows)

2760

2761 def center_reduce(self, columnsSet=None, op=None, removeExtreme=None, mu_sigma=None):

2762 """

2763 center and reduce a set of columns(or all if columnsSet is None)

2764 @param removeExtreme remove extreme values at both sides(0.05 means 0.025 on each side)

2765 @param columnsSet set of column to deal with

2766 @param op if can be:

2767 - None: substract mean and normalize,

2768 - "mean": substract mean only,

2769 - "norm": normalize only

2770 @param mu_sigma matrix with two rows(one for mean, second for sigma), if None,

2771 if computes that from the matrix self, columns must have the same order

2772 that columnSet

2773 @return the same table(with only the considered columns)

2774 """

2775 if op not in [None, "mean", "norm"]:

2776 raise ValueError( # pragma: no cover

2777 'expecting a value in [None, "mean", "norm"] for op')

2778 if columnsSet is None:

2779 columnsSet = self.header

2780 mus = self.mu_sigma_each_column(

2781 columnsSet, removeExtreme) if mu_sigma is None else mu_sigma

2782 tbl = self.extract_columns(columnsSet)

2783 n = len(self.header)

2784 for row in tbl.values:

2785 if op is None or op == "mean":

2786 for i in range(n):

2787 row[i] -= mus.values[0][i]

2788 if op is None or op == "norm":

2789 for i in range(n):

2790 row[i] /= mus.values[1][i]

2791 return tbl

2792

2793 @staticmethod

2794 def save_multiple_as_excel(filename, list_table, font="Calibri", close=True, encoding=None):

2795 """

2796 saves multiple table in one Excel file

2797

2798 @param filename filename(can be None)

2799 @param list_table list of 2uple("name", tbl)

2800 @param font font name

2801 @param close if True, close the file, otherwise, the user will have to

2802 @param encoding encoding

2803 @return object Workbook

2804 """

2805 ext = os.path.splitext(

2806 filename)[-1].lower() if filename is not None else None

2807 if ext is not None and ext == ".xls":

2808 font0 = EXf.Font()

2809 font0.name = font

2810 font0.bold = True

2811 style0 = EXs.XFStyle()

2812 style0.font = font0

2813

2814 wb = EXw.Workbook(

2815 encoding=encoding) if encoding is not None else EXw.Workbook()

2816 for sheet_name, self in list_table:

2817 ws0 = wb.add_sheet(sheet_name)

2818

2819 for i, l in enumerate(self.header):

2820 ws0.write(0, i, l, style0)

2821

2822 fnt = EXf.Font()

2823 fnt.name = font

2824 style = EXs.XFStyle()

2825 style.font = fnt

2826

2827 for irow, row in enumerate(self.values):

2828 for icol, val in enumerate(row):

2829 if isinstance(val, (int, float)):

2830 st = val

2831 elif isinstance(val, str):

2832 if encoding is not None:

2833 st = val.encode(encoding).decode(encoding)

2834 else:

2835 st = val

2836 elif val is not None:

2837 st = str(val)

2838 else:

2839 continue

2840 ws0.write(irow + 1, icol, st, style)

2841

2842 wb.save(filename)

2843 return wb

2844

2845 elif ext is None or ext == ".xlsx":

2846 wb = EXxw.Workbook(

2847 filename) if filename is not None else EXxw.Workbook()

2848 for sheet_name, self in list_table:

2849 ws0 = wb.add_worksheet(sheet_name)

2850

2851 style0 = wb.add_format({'bold': True})

2852 style0.set_font_name(font)

2853

2854 for i, l in enumerate(self.header):

2855 ws0.write(0, i, l, style0)

2856

2857 style = wb.add_format()

2858 style.set_font_name(font)

2859

2860 for irow, row in enumerate(self.values):

2861 for icol, val in enumerate(row):

2862 if isinstance(val, (int, float)):

2863 st = val

2864 elif isinstance(val, str):

2865 if encoding is not None:

2866 st = val.encode(encoding).decode(encoding)

2867 else:

2868 st = val

2869 elif val is not None:

2870 st = str(val)

2871 else:

2872 continue

2873 ws0.write(irow + 1, icol, st, style)

2874

2875 if filename is not None and close:

2876 wb.close()

2877 return wb

2878 else:

2879 raise NameError(

2880 "extension should be .xls or .xlsx for file " + filename)

2881

2882 def save_as_excel(self, filename, font="Calibri", sheet_name="sheet0",

2883 close=True, encoding=None):

2884 """

2885 saves the table as a new Excel file, you can use ``.xls`` or ``.xlsx``

2886 if filename is None, the function returns an object(xslx) and does not save it.

2887

2888 @param filename Excel filename

2889 @param sheet_name name of the sheet to add

2890 @param font font name

2891 @param close if True, close the file, otherwise, the user will have to

2892 @param encoding encoding

2893 @return object Workbook

2894 """

2895 return TableFormula.save_multiple_as_excel(filename, [(sheet_name, self)],

2896 font=font, close=close, encoding=encoding)

2897

2898 def schema_database(self, add_id=True):

2899 """

2900 returns the schema for a database which would contains this database

2901

2902 @param add_id if True, adds an index "PRIMARYKEY"

2903 @return dictionary { index_column:(name, type) }

2904 """

2905 schema = {i: (l, str) for i, l in enumerate(self.header)}

2906 if add_id is not None:

2907 schema[-1] = (add_id, int, "PRIMARYKEY", "AUTOINCREMENT")

2908

2909 if len(self) > 0:

2910 # we use the first row to determine type

2911 for i, v in enumerate(self.values[0]):

2912 if not isinstance(v, str):

2913 schema[i] = (schema[i][0], type(v))

2914 return schema

2915

2916 def fill_sql_table(self, filename_or_database, tablename, add_id="idr"):

2917 """

2918 returns a Database object, creates the database if it does not exists,

2919 same for the table

2920

2921 @param filename_or_database filename or Database object,

2922 in that second case, we assume method connect

2923 was called before

2924 @param tablename table name

2925 @param add_id if is not None, then the function adds an id, it first takes the

2926 max(id) and goes on incrementing it;

2927 @return Database object(new or the one from the parameters),

2928 in both case, the database is not disconnected

2929 """

2930 schema = self.schema_database(add_id)

2931

2932 if isinstance(filename_or_database, str):

2933 fLOG("fill_sql_table: creating database ", filename_or_database)

2934 db = Database(filename_or_database, LOG=fLOG)

2935 db.connect()

2936

2937 fLOG("fill_sql_table ", schema)

2938 if tablename not in db.get_table_list():

2939 fLOG("creationg of table ", schema)

2940 cursor = db.create_table(tablename, schema)

2941 db.append_values(self.values, tablename, schema, cursor=cursor)

2942 else:

2943 db.append_values(self.values, tablename, schema)

2944 else:

2945 db = filename_or_database

2946 db.append_values(self.values, tablename, schema)

2947

2948 return db

Coverage for src/ensae_teaching_cs/homeblog/table_formula.py: 77%

1570 statements