Coverage for src/ensae_teaching_cs/homeblog/table_formula.py: 77%
1570 statements
« prev ^ index » next coverage.py v7.1.0, created at 2023-04-28 06:23 +0200
« prev ^ index » next coverage.py v7.1.0, created at 2023-04-28 06:23 +0200
1# -*- coding: utf8 -*-
2"""
3@file
4@brief Implements TableFormula.
5"""
6import copy
7import os
8import sys
9import datetime
10import random
11import math
12import json
13import numpy
14import pandas
15from xlwt import Formatting as EXf
16from xlwt import Style as EXs
17import xlwt as EXw
18from xlsxwriter import workbook as EXxw
19from xlrd import open_workbook
20from pyquickhelper.loghelper import fLOG, noLOG
21from pyquickhelper.loghelper.convert_helper import str2datetime
22from pyensae.sql import Database
23from .table_formula_stat import _TableFormulaStat
26class TableFormula(_TableFormulaStat):
27 """
28 This class aims at representating a table, it provides
29 some "SQL like" functionalities such groupby or innerjoin, select, where...
30 This was a custom implementation of a DataFrame before I discover
31 `pandas <http://pandas.pydata.org/>`_.
33 The class provides an easy to go through the row table by converting each row
34 in a dictionary ``{ column_name: value }`` on the run. Example:
36 ::
38 tbl = TableFormula(...)
39 newtbl = tbl.filter(lambda v: v["criteria"] == 5)
41 See @see op __init__ for others ways to create a table.
43 @var header list of column names
44 @var values list of rows(each row contains as many value as the number of columns)
45 @var index dictionary { column name: position }, changing ``header`` means also changing ``header``.
47 Example:
49 ::
51 table = TableFormula("name d_a d_b d_c#A 1 2 3#A 1.1 2.1 3.1#B 3 4 5".replace(" ", "\\t").replace("#","\\n"))
52 print(table)
53 print("one value 0,1:", table[0,1])
55 print("---------")
56 dist = table.get_distinct_values("name")
57 for k in sorted(dist): print("*%d: %s"%(int(dist[k]),k))
59 print("---------")
60 table.add_column("has_A", lambda v: 1. if "A" in v["name"] else 0.)
61 print(table)
63 x = 1./3
64 print("------------- smoothing", x)
65 table.add_column_smooth("has_A_smooth", lambda v: v["has_A"], [-1,0,1], [x,x,x])
66 print(table)
68 print("--------- filter")
69 fil = table.filter(lambda v: v["d_b"] == 2)
70 print(fil)
72 print("--------- random")
73 rnd = table.random(5)
74 print(rnd)
76 print("--------- random unique")
77 rnd = table.random(1, True)
78 print(rnd)
80 print("--------- filter quantile")
81 fil = table.filter_quantile(lambda v: v["d_b"], 0, 0.4)
82 print(fil)
84 print("--------- aggregate_column")
85 total = table.aggregate(lambda v: v["d_c"])
86 print(total)
88 print("--------- sort")
89 table.sort(lambda v: v["d_b"] + v["d_c"])
90 print(table)
92 print("--------- union")
93 union = table.union(table)
94 print(union)
96 print("--------- group sum")
97 group = table.groupby(lambda v: v["name"],
98 [lambda v: v["d_a"],
99 lambda v: v["d_b"]],
100 ["name", "sum_d_a", "sum_d_b"])
101 print(group)
103 print("--------- group max")
104 groupmax = table.groupby(lambda v: v["name"],
105 [lambda v: v["d_a"],
106 lambda v: v["d_b"]],
107 ["name", "max_d_a", "max_d_b"],
108 [max, max])
109 print(groupmax)
111 print("--------- group sum with weights")
112 group = table.groupby(lambda v: v["name"],
113 [lambda v: v["d_a"]],
114 ["name", "weight", "sum_d_a"],
115 [lambda vec,w: sum(vec) / w],
116 lambda v: v ["d_b"])
118 print("--------- innerjoin")
119 innerjoin = table.innerjoin(group, lambda v: v["name"],
120 lambda v: v["name"], "group")
121 print(innerjoin)
123 print("------------- extraction")
124 ext = table.extract_columns(["name", "d_a"])
125 print(ext)
127 print("------------- remove")
128 ext = table.remove_columns(["d_a"])
129 print(ext)
131 print("------------- todict")
132 d = table.todict(lambda v: v["name"], lambda v: v["d_b"], True)
133 print(d)
135 print("------------- select")
136 d = table.select(lambda v:(v["name"], v["d_b"]))
137 print(list(d))
139 print("------------- use of an index")
140 table.create_index(lambda v:(v["name"], v["d_a"]))
141 row = table.get(('A', 1.1))
142 print(row)
143 value = table.get(('A', 1.1), 2)
144 print(value)
146 print("------------- multiply_column_by_row_instance ")
147 table = TableFormula("name d_a d_b d_c#A 1 2 3#A 1.1 2.1 3.1#B 3 4 5".replace(" ", "\\t").replace("#","\\n"))
148 table.add_column("key_add", lambda v:"unique")
149 print(table)
150 mul = table.multiply_column_by_row_instance(
151 lambda v: v["key_add"],
152 lambda v: v["name"])
153 print(mul)
155 if os.path.exists("BNP.PA.txt"):
156 print("--------------- financial stock")
157 table = TableFormula("BNP.PA.txt", sep=",")
158 table.sort(lambda v: v["Date"])
159 print(table[:10])
161 print("--------------- groupby_implicit")
162 table = TableFormula("key_name sum_a len_b avg_c#A 1 2 3#A 1.1 2.1 3.1#B 3 4 5".replace(" ", "\\t").replace("#","\\n"))
163 print(table)
164 gr = table.groupby_implicit(lambda v: v ["key_name"])
165 print(gr)
167 print("--------------- covariance")
168 values = [random.random() for i in range(0,100)]
169 values = [[x, x + random.random()/2] for x in values]
170 tbl = TableFormula(["x", "y"], values).values_to_float()
171 cov = tbl.covariance()
172 print(cov)
174 print("--------------- histogram")
175 hist = tbl.histogram(lambda v:(v["x"],1), 10)
176 print(hist)
178 print("--------------- histogram")
179 hist = tbl.values_to_float().histograms(["x", "y"], 10)
180 print(hist)
182 print("--------------- unions of columns")
183 hist = tbl.values_to_float().union_columns(["x", "y"])
184 print(hist)
185 """
187 @staticmethod
188 def add_header_if_not_present(filename, header, encoding=None, logFunction=noLOG):
189 """
190 the function checks if the first line contains the column in header
191 otherwise, it modifies the file and add them on the first line
193 @param filename filename
194 @param header list of column name(all strings)
195 @param encoding encoding
196 @param logFunction use this function to log information about what is happening
197 """
198 if encoding is None:
199 with open(filename, "r") as f:
200 firstline = f.readline().strip("\n\r ")
201 su = sum(map(lambda _: 1 if _ in header else 0, firstline.split("\t")))
202 if su < len(header) / 2.0:
203 logFunction( # pragma: no cover
204 "add_header_if_not_present: adding header({0}<{1}){2} to '{3}'\nfirstline:\n{4}".format(
205 su, len(header) / 2, header, filename, firstline))
206 with open(filename, "r") as f:
207 text = f.read()
208 text = "\t".join(header) + "\n" + text
209 logFunction( # pragma: no cover
210 "add_header_if_not_present: writing")
211 with open(filename, "w") as f:
212 f.write(text)
213 logFunction( # pragma: no cover
214 "add_header_if_not_present: complete")
215 else:
216 with open(filename, "r", encoding=encoding) as f:
217 firstline = f.readline().strip("\n\r ")
218 su = sum(map(lambda _: 1 if _ in header else 0, firstline.split("\t")))
219 if su < len(header) / 2.0:
220 logFunction( # pragma: no cover
221 "add_header_if_not_present: adding header({0}<{1}){2} to '{3}'\nfirstline:\n{4}".format(
222 su, len(header) / 2, header, filename, firstline))
223 with open(filename, "r", encoding=encoding) as f:
224 text = f.read()
225 text = "\t".join(header) + "\n" + text
226 logFunction( # pragma: no cover
227 "add_header_if_not_present: writing")
228 with open(filename, "w", encoding=encoding) as f:
229 f.write(text)
230 logFunction( # pragma: no cover
231 "add_header_if_not_present: complete")
233 @staticmethod
234 def random_split_file(filename, outfileprefix, nb, has_header=True, encoding=None, logFunction=noLOG):
235 """
236 split a file in nb buckets by random(lines are sent to a random file as they come)
238 @param filename filename to split
239 @param nb number of buckets
240 @param outfileprefix output files will start with outfileprefix + '%04d.txt' % i
241 @param encoding encoding
242 @param has_header the header will be replicated in each created file
243 @param logFunction to display information
244 @return list of created files
245 """
246 firstline = None
247 if has_header:
248 if encoding is None:
249 with open(filename, "r") as f:
250 firstline = f.readline().strip("\n\r ")
251 else: # pragma: no cover
252 with open(filename, "r", encoding=encoding) as f:
253 firstline = f.readline().strip("\n\r ")
255 logFunction( # pragma: no cover
256 f"random_split_file: file {filename} has header {firstline}")
258 logFunction("random_split_file: split %s in %d parts" % (filename, nb))
259 fileName = [outfileprefix + (".%04d.txt" % n) for n in range(0, nb)]
260 nbline = 0
262 if encoding is None:
263 filesP = [open(_, "w") for _ in fileName]
264 if firstline is not None:
265 for _ in filesP:
266 _.write(firstline + "\n")
267 with open(filename, "r") as f:
268 line = f.readline()
269 if firstline is not None:
270 line = f.readline()
271 while line is not None and len(line) > 0:
272 h = random.randint(0, nb - 1)
273 filesP[h].write(line)
274 line = f.readline()
275 nbline += 1
276 if nbline % 1000000 == 0:
277 logFunction( # pragma: no cover
278 "random_split_file: processed %d lines" % nbline)
279 else:
280 filesP = [open(_, "w", encoding=encoding) for _ in fileName]
281 if firstline is not None:
282 for _ in filesP:
283 _.write(firstline + "\n")
284 with open(filename, "r", encoding=encoding) as f:
285 line = f.readline()
286 if firstline is not None:
287 line = f.readline()
288 while line is not None and len(line) > 0:
289 h = random.randint(0, nb - 1)
290 filesP[h].write(line)
291 line = f.readline()
292 nbline += 1
293 if nbline % 1000000 == 0:
294 logFunction( # pragma: no cover
295 "random_split_file: processed %d lines" % nbline)
297 for _ in filesP:
298 _.close()
299 logFunction("random_split_file: end")
300 return fileName
302 @staticmethod
303 def ratio(x, y):
304 """
305 return a ratio between two real values or an empty string if the denominateur is null
306 @return a real of an empty string
307 """
308 return x * 1.0 / y if y != 0 else (0 if x == 0 else "")
310 @staticmethod
311 def bootstrap(values, function, nbdraws=-1, alpha=0.05):
312 """
313 return a confidence interval for a statistics
314 @param values values
315 @param function produces the statistics over a random set of observations chosen in values
316 @param nbdraws number of draws, if it is equal to -1, is equal to len(values)
317 @param alpha confidence level
318 @return average, min, lower bound, higher bound, max
319 """
320 stat = []
321 N = len(values) - 1
322 if nbdraws == - 1:
323 nbdraws = len(values)
324 for i in range(nbdraws):
325 randset = [values[random.randint(0, N)] for i in range(N + 1)]
326 s = function(randset)
327 stat.append(s)
328 stat.sort()
329 lv = len(stat)
330 alpha = alpha / 2
331 i1 = int(lv * alpha + 0.5)
332 i2 = int(lv * (1 - alpha) + 0.5)
333 i2 = min(i2, len(stat) - 1)
334 av = sum(stat) / len(stat)
335 return av, min(stat), stat[i1], stat[i2], max(stat)
337 @staticmethod
338 def correlation_bicolumn(values, deviations=False, noCenter=False):
339 """
340 assume values is a matrix with two columns
341 @param values 2 column matrix
342 @param deviations if True, returns cor, sigma1, sigma2
343 @param noCenter if True, do not remove the average before computing the covariance,
344 it means we assume variables are already centered
345 @return correlation factor or correlation, sigma1, sigma2 if deviations is True
346 """
347 if len(values) <= 1:
348 raise ValueError( # pragma: no cover
349 f"expecting more than one observation, not {len(values)}")
351 mx = 0.
352 my = 0.
353 vx = 0.
354 vy = 0.
355 co = 0.
356 nb = 0.
357 for a, b in values:
358 nb += 1
359 mx += a
360 my += b
361 vx += a ** 2
362 vy += b ** 2
363 co += a * b
364 mx /= nb
365 my /= nb
366 vx /= nb
367 vy /= nb
368 co /= nb
369 if not noCenter:
370 vx -= mx ** 2
371 vy -= my ** 2
372 co -= mx * my
373 vx = vx ** 0.5
374 vy = vy ** 0.5
375 v = vx * vy
376 if v != 0:
377 co /= v
378 if deviations:
379 return co, vx, vy # pragma: no cover
380 return co
382 def _private_getclass(self):
383 """
384 the class often creates another class of the same type,
385 this function returns the class object
386 """
387 return self.__class__
389 def __init__(self, file, numeric_column=None, sep="\t", encoding=None,
390 read_n_lines=-1, sheet=0, **options):
391 """
392 It can either take a filename, an object TableFormula,
393 a list of columns and values.
395 :param file: filename or a list of column names or a dictionary,
396 file can also be a `pandas DataFrame
397 <http://pandas.pydata.org/pandas-docs/dev/dsintro.html#dataframe>`_.
398 :param numeric_column: depends on file types(see below examples)
399 :param sep: column separator if file is a filename
400 :param read_n_lines: read the first n lines(or all if it is -1)
401 :param sheet: in case the file is an Excel file, this parameter precises the sheet number or name
402 :param suffix_nb: if True, adds an integer to the column name if it is a duplicate
404 Example:
406 ::
408 table = TableFormula("name d_a d_b d_c#A 1 2 3#A 1.1 2.1 3.1#B 3 4 5".replace(" ", "\\t").replace("#","\\n"))
410 or
412 ::
414 table = TableFormula("file.txt", ["nb"])
416 or
418 ::
420 table = TableFormula(["date", "Y", "Y2", "xl"], values)
422 or
424 ::
426 data = [{ "one":1, "two":2 }, {"two":2.1, "three":3 }]
427 table = TableFormula(data)
429 or
431 ::
433 data = { 1:{ "one":2.3, "two":2.2 }, 2:{"one":2.1, "two":3 }
434 table = TableFormula("__byrow__", data)
436 or
438 ::
440 table = TableFormula(numpy.matrix(...))
442 or
444 ::
446 table = TableFormula(numpy.array(...))
449 @warning In this second case, rows and header are not copied.
450 """
451 if numeric_column is None:
452 numeric_column = []
453 if isinstance(file, str):
454 if os.path.exists(file):
455 self._read_file(file, numeric_column, sep,
456 encoding, read_n_lines, sheet=sheet)
457 elif file == "__byrow__" and isinstance(numeric_column, dict):
458 self._fill_by_row(numeric_column)
459 else:
460 lines = file.split("\n")
461 if len(lines) == 1:
462 raise FileNotFoundError( # pragma: no cover
463 f"A file was probably expected but was not found: '{file}'.")
464 self._readlines(lines, numeric_column, sep)
466 elif isinstance(file, list):
467 if len(file) == 0:
468 raise ValueError( # pragma: no cover
469 "Empty data and columns are not allowed.")
471 if isinstance(file[0], dict):
472 self.index = {}
473 self.values = []
474 for row in file:
475 for k, v in row.items():
476 if k not in self.index:
477 self.index[k] = len(self.index)
479 # we sort the labels to avoid instabilities
480 labels = [k for k, v in self.index.items()]
481 labels.sort()
482 self.index = {}
483 for la in labels:
484 self.index[la] = len(self.index)
486 for row in file:
487 line = [None for k in self.index]
488 for k, v in row.items():
489 line[self.index[k]] = v
490 self.values.append(line)
492 self.header = [None for k in self.index]
493 for k, v in self.index.items():
494 self.header[v] = k
496 n = len(self.index)
497 for row in self.values:
498 while len(row) < n:
499 row.append(None)
501 elif isinstance(numeric_column, numpy.matrix):
502 self.header = file
503 self.index = {}
504 for i, h in enumerate(self.header):
505 self.index[h] = i
506 self.values = [[float(numeric_column[i, j]) for j in range(
507 numeric_column.shape[1])] for i in range(numeric_column.shape[0])]
508 elif isinstance(numeric_column, numpy.ndarray):
509 self.header = file
510 self.index = {}
511 for i, h in enumerate(self.header):
512 self.index[h] = i
513 self.values = [[float(numeric_column[i, j]) for j in range(
514 numeric_column.shape[1])] for i in range(numeric_column.shape[0])]
515 elif isinstance(file[0], list):
516 if len(file) == 1:
517 self.header = file[0]
518 self.values = file[1:] + numeric_column
519 self.index = {}
520 for i, h in enumerate(self.header):
521 self.index[h] = i
522 else:
523 self.header = file[0]
524 self.values = file[1:]
525 self.index = {}
526 for i, h in enumerate(self.header):
527 self.index[h] = i
528 elif isinstance(file[0], str):
529 self.header = file
530 self.values = numeric_column
531 self.index = {}
532 for i, h in enumerate(self.header):
533 self.index[h] = i
534 else:
535 raise RuntimeError( # pragma: no cover
536 "This case should not happen: " + str(type(file[0])))
538 elif isinstance(file, numpy.matrix): # pragma: no cover
539 self.header = ["c%d" % d for d in range(file.shape[1])]
540 self.index = {}
541 for i, h in enumerate(self.header):
542 self.index[h] = i
543 self.values = [[float(file[i, j]) for j in range(
544 file.shape[1])] for i in range(file.shape[0])]
546 elif isinstance(file, numpy.ndarray):
547 self.header = ["c%d" % d for d in range(file.shape[1])]
548 self.index = {}
549 for i, h in enumerate(self.header):
550 self.index[h] = i
551 self.values = [[float(file[i, j]) for j in range(
552 file.shape[1])] for i in range(file.shape[0])]
554 else:
555 if isinstance(file, pandas.DataFrame):
556 def convert(x):
557 return None if isinstance(x, float) and numpy.isnan(x) else x
558 df = file
559 self.header = [_ for _ in df.columns]
560 hi = 'index'
561 while hi in self.header:
562 hi += "_"
563 self.header.insert(0, hi)
564 self.values = []
565 for i, row in enumerate(df.values):
566 row = [df.index[i]] + [convert(x) for x in row]
567 self.values.append(row)
569 self.index = {}
570 for i, h in enumerate(self.header):
571 self.index[h] = i
572 else:
573 raise TypeError( # pragma: no cover
574 "File has an unexpected type: " + str(type(file)))
576 unique = {}
577 for i, c in enumerate(self.header):
578 if c in unique:
579 if options.get("suffix_nb", False):
580 c = "%s_%d" % (c, i)
581 self.header[i] = c
582 else:
583 raise KeyError( # pragma: no cover
584 f"column '{c}' already exists in '{self.header}'")
585 unique[c] = True
587 def __add__(self, other):
588 """
589 do an addition, add values if types are matching
590 :param other: matrix or float or string
591 :return: new matrix, keep the header of the first matrix
592 """
593 if len(self) != len(other):
594 raise ValueError( # pragma: no cover
595 "both matrices should have the same number of rows")
596 if len(self.header) != len(other.header):
597 raise ValueError( # pragma: no cover
598 "both matrices should have the same number of columns")
599 values = []
600 for row, rowo in zip(self.values, other.values):
601 r = []
602 for a, b in zip(row, rowo):
603 if type(a) == type(b):
604 x = a + b
605 else:
606 x = None
607 r.append(x)
608 values.append(r)
609 return self._private_getclass()(self.header, values)
611 def __mul__(self, other):
612 """
613 do a multiplication(by a number)
614 :param other: matrix or float or string
615 :return: new matrix, keep the header of the first matrix
616 """
617 if not isinstance(other, float) and not isinstance(other, int):
618 raise TypeError( # pragma: no cover
619 "other should be a number")
620 values = []
621 for row in self.values:
622 r = []
623 for a in row:
624 if a is not None:
625 x = a * other
626 else:
627 x = None
628 r.append(x)
629 values.append(r)
630 return self._private_getclass()(self.header, values)
632 def multiplication_term_term(self, other):
633 """
634 do a multiplication term by term(similar to an addition),
635 add values if types are matching
637 :param other: matrix or float or string
638 :return: new matrix, keep the header of the first matrix
639 """
640 if len(self) != len(other):
641 raise ValueError( # pragma: no cover
642 "both matrices should have the same number of rows")
643 if len(self.header) != len(other.header):
644 raise ValueError( # pragma: no cover
645 "both matrices should have the same number of columns")
646 values = []
647 for row, rowo in zip(self.values, other.values):
648 r = []
649 for a, b in zip(row, rowo):
650 if type(a) == type(b) and not isinstance(a, str):
651 x = a * b
652 else:
653 x = None
654 r.append(x)
655 values.append(r)
656 return self._private_getclass()(self.header, values)
658 def replicate(self, times):
659 """replicates all rows a given number of times
660 :param times: number of multiplication
661 :return: new matrix, keep the header of the first matrix
662 """
663 values = []
664 for i in range(0, times):
665 values.extend(copy.copy(self.values))
666 return self._private_getclass()(self.header, values)
668 @property
669 def size(self):
670 """
671 returns the size(nb rows, nb columns)
672 """
673 return len(self), len(self.header)
675 @property
676 def shape(self):
677 """
678 returns the size(nb rows, nb columns)
679 """
680 return self.size
682 def _fill_by_row(self, values):
683 """
684 fill the table
685 :param values: dictionary { <int_row_index>: { <column name>: value} }
686 """
687 mx = max(values.keys()) + 1
688 self.index = {}
689 self.header = []
690 for k, v in values.items():
691 for col in v:
692 if col not in self.index:
693 self.index[col] = len(self.index)
694 self.header.append(col)
695 self.values = [[None for h in self.header] for k in range(mx)]
696 for k, v in values.items():
697 for col, to in v.items():
698 self.values[k][self.index[col]] = to
700 def __getitem__(self, irow):
701 """
702 operator [], accepts slices
703 :param irow: integer, tuple, slice or list
704 :return: depends on irow
705 - int --> a table with one row
706 - slice --> a table with several rows
707 - list --> a table with the selected rows
708 - tuple --> a value
709 """
710 if isinstance(irow, int):
711 return self._private_getclass()(
712 self.header, [self.values[irow]])
713 if isinstance(irow, slice):
714 return self._private_getclass()(
715 self.header, [self.values[ii] for ii in range(*irow.indices(len(self)))])
716 if isinstance(irow, list):
717 return self._private_getclass()(
718 self.header, [self.values[ii] for ii in irow])
719 if isinstance(irow, tuple):
720 if isinstance(irow[1], str):
721 row = self.values[irow[0]]
722 v = self._interpret_row(row)
723 return v[irow[1]]
724 return self.values[irow[0]][irow[1]]
725 raise TypeError("Invalid argument type: " + str(type(irow)))
727 def __setitem__(self, irow, value):
728 """
729 operator [], just accepts tuple(to change a value)
730 :param irow: 2-uple
731 :param value: new value
732 """
733 if isinstance(irow, tuple):
734 if isinstance(irow[1], str):
735 row = self.values[irow[0]]
736 v = self._interpret_row(row)
737 v[irow[1]] = value
738 else:
739 self.values[irow[0]][irow[1]] = value
740 else:
741 raise TypeError( # pragma: no cover
742 "Invalid argument type(only tuple accepted): " + str(type(irow)))
744 def __len__(self):
745 """
746 returns the number of rows
747 """
748 return len(self.values)
750 def __copy__(self):
751 """
752 operator copy
753 """
754 return self._private_getclass()(self.header, self.values)
756 def __deepcopy__(self, memo):
757 """
758 operator ``deepcopy``
759 """
760 return self._private_getclass()(copy.deepcopy(self.header, memo), copy.deepcopy(self.values, memo))
762 def copy(self):
763 """
764 call ``copy.deepcopy(self)``
765 """
766 return copy.deepcopy(self)
768 def delta(self, other):
769 """
770 returns a list of differences between self and others
772 :param other: TableFormula
773 :return: list of differences(first one)
774 """
775 if other is None:
776 return False
777 if not isinstance(other, TableFormula):
778 raise TypeError("other is not a table: " + str(type(other)))
779 if len(self.header) != len(other.header):
780 return ["different number of columns"]
781 for a, b in zip(self.header, other.header):
782 if a != b:
783 return ["different columns"]
784 if len(self.values) != len(other.values):
785 return ["different number of rows"]
786 line = 0
787 for r, s in zip(self.values, other.values):
788 if len(r) != len(s):
789 return ["different number of values on row %d" % line]
790 col = 0
791 for a, b in zip(r, s):
792 if a != b:
793 return ["different value on cell %d,%d: %s!=%s(type %s, %s)" % (line, col, a, b, str(type(a)), str(type(b)))]
794 col += 1
795 line += 1
796 return []
798 def __eq__(self, other):
799 """
800 check if two tables are equal by value
801 :param other: other table
802 :return: boolean
803 """
804 if other is None:
805 return False
806 if not isinstance(other, TableFormula):
807 return False
808 if len(self.header) != len(other.header):
809 return False
810 for a, b in zip(self.header, other.header):
811 if a != b:
812 return False
813 if len(self.values) != len(other.values):
814 return False
815 for r, s in zip(self.values, other.values):
816 if len(r) != len(s):
817 return False
818 for a, b in zip(r, s):
819 if a != b:
820 return False
821 return True
823 def __str__(self):
824 """
825 convert the table into a string
826 :return: string
827 """
828 rows = ["\t".join(self.header)]
829 for row in self.values:
830 s = "\t".join([str(_) for _ in row])
831 rows.append(s)
832 return "\n".join(rows)
834 def __html__(self, class_table=None, class_td=None, class_tr=None, class_th=None):
835 """
836 Converts the table into a :epkg:`html` string.
838 :param class_table: adds a class to the tag ``table`` (None for none)
839 :param class_td: adds a class to the tag ``td`` (None for none)
840 :param class_tr: adds a class to the tag ``tr`` (None for none)
841 :param class_th: adds a class to the tag ``th`` (None for none)
842 """
843 clta = f' class="{class_table}"' if class_table is not None else ""
844 cltr = f' class="{class_tr}"' if class_tr is not None else ""
845 cltd = f' class="{class_td}"' if class_td is not None else ""
846 clth = f' class="{class_th}"' if class_th is not None else ""
848 rows = [f"<table{clta}>"]
849 rows.append("{0}{1}{2}".format(("<tr%s><th%s>" % (cltr, clth)),
850 ("</th><th%s>" % clth).join(self.header), "</th></tr>"))
851 septd = f"</td><td{cltd}>"
852 strtd = f"<tr{cltr}><td{cltd}>"
853 for row in self.values:
854 s = septd.join([str(_) for _ in row])
855 rows.append(strtd + s + "</td></tr>")
856 rows.append("</table>")
857 rows.append("")
858 return "\n".join(rows)
860 def __rst__(self, add_line=True):
861 """
862 convert the table into rst format
864 ::
866 +------------------------+------------+----------+----------+
867 | Header row, column 1 | Header 2 | Header 3 | Header 4 |
868 | (header rows optional) | | | |
869 +========================+============+==========+==========+
870 | body row 1, column 1 | column 2 | column 3 | column 4 |
871 +------------------------+------------+----------+----------+
872 | body row 2 | ... | ... | |
873 +------------------------+------------+----------+----------+
875 :param add_line: add a line separator between each row
876 """
877 tbl = self.values_to_str()
878 length = [len(_) for _ in tbl.header]
879 for row in tbl.values:
880 for i, v in enumerate(row):
881 length[i] = max(length[i], len(v))
882 length = [_ + 2 for _ in length]
883 line = ["-" * le for le in length]
884 lineb = ["=" * le for le in length]
885 sline = f"+{'+'.join(line)}+"
886 slineb = f"+{'+'.join(lineb)}+"
887 res = [sline]
889 def complete(cool):
890 s, i = cool
891 i -= 2
892 if len(s) < i:
893 s += " " * (i - len(s))
894 return s
896 res.append(f"| {' | '.join(map(complete, zip(tbl.header, length)))} |")
897 res.append(slineb)
898 res.extend([f"| {' | '.join(map(complete, zip(row, length)))} |"
899 for row in tbl.values])
900 if add_line:
901 t = len(res)
902 for i in range(t - 1, 3, -1):
903 res.insert(i, sline)
904 res.append(sline)
905 return "\n".join(res) + "\n"
907 def strtype(self):
908 """
909 displays the type of values(not the values)
910 """
911 rows = ["\t".join(self.header)]
912 for row in self.values:
913 s = "\t".join([str(type(_)) for _ in row])
914 rows.append(s)
915 return "\n".join(rows)
917 def _read_file(self, file, numeric_column, sep, encoding, read_n_lines, sheet=0):
918 """
919 private
920 """
921 ext = os.path.splitext(file)[-1].lower()
922 if ext in [".xls", ".xlsx"]:
923 lines = list(open_workbook(file, sheet=sheet))
924 # removing empty column(assuming first row is the header)
925 ind = [i for i, n in enumerate(lines[0]) if len(n) > 0]
926 if len(ind) < len(lines[0]):
927 lines = [[line[i] for i in ind] for line in lines]
928 else:
929 if sys.version_info.major >= 3 or encoding is None:
930 if encoding is None:
931 f = open(file, "r")
932 else:
933 f = open(file, "r", encoding=encoding)
934 else:
935 f = open(file, "r", encoding=encoding)
937 if read_n_lines > 0:
938 lines = []
939 for line in f:
940 if len(lines) >= read_n_lines:
941 break
942 lines.append(line)
943 else:
944 lines = f.readlines()
945 f.close()
946 self._readlines(lines, numeric_column, sep)
948 def change_header(self, new_header):
949 """
950 change the column names
952 :param new_header: a list or a function which modifies the header
954 Example:
956 ::
958 tbl.change_header(lambda h: h if h != "column" else "new_name")
960 .. warning:: Do not do that yourself, the class holds a dictionary up to date with the column index.
961 """
962 if isinstance(new_header, list):
963 self.header = new_header
964 self.index = {}
965 for i, h in enumerate(self.header):
966 self.index[h] = i
967 else:
968 he = [new_header(h) for h in self.header]
969 self.change_header(he)
971 def rename_column(self, old_name, new_name):
972 """
973 rename a column
975 :param old_name: old name
976 :param new_name: new name
977 """
978 header = [{old_name: new_name}.get(_, _) for _ in self.header]
979 self.change_header(header)
981 def save(self, filename, sep="\t", encoding=None, newline="\n"):
982 """
983 saves the tables in a text file, first row is the column names
985 :param filename: filename
986 :param sep: column separator
987 :param encoding: encoding
988 :param newline: line separator
989 """
990 if sys.version_info.major >= 3 or encoding is None:
991 if encoding is None:
992 f = open(filename, "w", newline=newline)
993 else:
994 f = open(filename, "w", encoding=encoding, newline=newline)
995 else:
996 f = open(filename, "w", encoding=encoding)
998 f.write(sep.join(self.header))
999 f.write("\n")
1000 for row in self.values:
1001 f.write(sep.join([str(_) for _ in row]))
1002 f.write("\n")
1003 f.close()
1005 def _readlines(self, lines, numeric_column, sep):
1006 """private"""
1007 if isinstance(lines[0], str):
1008 lines = [_.replace("\ufeff", "").replace("\xef\xbb\xbf", "")
1009 .strip("\n\r ").split(sep) for _ in lines if len(_) > 0]
1010 self.header = lines[0]
1011 self.values = lines[1:]
1012 self.index = {}
1013 for i, h in enumerate(self.header):
1014 self.index[h] = i
1015 elif isinstance(lines[0], list):
1016 self.header = lines[0]
1017 self.values = lines[1:]
1018 self.index = {}
1019 for i, h in enumerate(self.header):
1020 self.index[h] = i
1021 else:
1022 raise RuntimeError("unexpected format: " + str(type(lines[0])))
1024 self._auto_conversion(numeric_column)
1026 def _auto_conversion(self, others_columns):
1027 """
1028 private
1029 set up the column type based on the column name
1030 """
1031 def condition(k):
1032 if k.startswith("sum_") or k.startswith("pos_"):
1033 return True
1034 if k.startswith("avg_") or k.startswith("len_"):
1035 return True
1036 if k.startswith("nb_") or k.startswith("max_") or k.startswith("min_"):
1037 return True
1038 if k.startswith("d_") or k in others_columns:
1039 return True
1040 if k in ["Open", "High", "Low", "Close", "Volume", "Adj Close"]:
1041 return True
1042 if k in ["distance", "nb"]:
1043 return True
1044 return False
1046 for i, k in enumerate(self.header):
1047 if k == "Date":
1048 for row in self.values:
1049 if isinstance(row[i], str):
1050 row[i] = datetime.datetime.strptime(row[i], '%Y-%m-%d')
1051 elif isinstance(row[i], float):
1052 row[i] = datetime.datetime.utcfromtimestamp(row[i])
1053 else:
1054 raise RuntimeError(
1055 f"unable to extract a date from type {type(row[i])}")
1056 elif condition(k):
1057 for row in self.values:
1058 row[i] = float(row[i])
1059 else:
1060 for row in self.values:
1061 if isinstance(row[i], str) and row[i] == "None":
1062 row[i] = None
1064 def get_column_values(self, col):
1065 """
1066 private
1067 returns all values for one column
1068 """
1069 i = self.index[col]
1070 return [row[i] for row in self.values]
1072 def get_distinct_values(self, col):
1073 """private"""
1074 row = self.get_column_values(col)
1075 dis = {}
1076 for r in row:
1077 dis[r] = dis.get(r, 0) + 1
1078 return dis
1080 def _interpret_row(self, row):
1081 """
1082 private
1083 returns each row as a dictionary { column_name:value }
1084 """
1085 values = {}
1086 for a, b in zip(self.header, row):
1087 values[a] = b
1088 return values
1090 def __iter__(self):
1091 """
1092 iterator on all rows, it returns a dictionary { column:value }
1093 @return dictionary
1094 """
1095 for row in self.values:
1096 yield self._interpret_row(row)
1098 def add_column(self, colname, function, position=-1):
1099 """
1100 Adds a column.
1101 :param colname: column name or columns name if it is a list or a tuple
1102 :param function: function which will gives the values(or a list of functions, or a function which return a tuple)
1103 :param position: where to insert the column, -1 for the end
1105 Example:
1107 ::
1109 table.add_column("has_A", lambda v: 1 if "A" in v["name"] else 0, 0)
1111 table.add_column(("has_A", "has_B"),(lambda v: 1 if "A" in v["name"] else 0,
1112 lambda v: 1 if "B" in v["name"] else 0))
1114 table.add_column(("has_A", "has_B"),(lambda v:(1 if "A" in v["name"] else 0, 1 if "B" in v["name"] else 0))
1115 """
1116 if isinstance(colname, str):
1117 if position == -1:
1118 self.index[colname] = len(self.index)
1119 for row in self.values:
1120 v = self._interpret_row(row)
1121 x = function(v)
1122 row.append(x)
1123 self.header.append(colname)
1124 else:
1125 for row in self.values:
1126 v = self._interpret_row(row)
1127 x = function(v)
1128 row.insert(position, x)
1129 self.header.insert(position, colname)
1130 self.index = {v: i for i, v in enumerate(self.header)}
1132 elif isinstance(function, list):
1133 if len(colname) != len(function):
1134 raise ValueError( # pragma: no cover
1135 "unable to continue, colname and function do not have the same number of elements")
1136 if position == -1:
1137 position = [-1] * len(colname)
1138 elif isinstance(position, int):
1139 position = [position] * len(colname)
1140 else:
1141 if len(position) != len(colname):
1142 raise RuntimeError( # pragma: no cover
1143 "Unable to continue, colname and position do not "
1144 "have the same number of elements.")
1145 dec = 0
1146 for a, b, c in zip(colname, function, position):
1147 self.add_column(a, b, c + dec)
1148 dec += 1
1150 else:
1151 # we assume here, the function returns a tuple
1152 if not isinstance(position, int):
1153 raise TypeError( # pragma: no cover
1154 "Int expected for position for this case.")
1156 if position == -1:
1157 for row in self.values:
1158 v = self._interpret_row(row)
1159 x = function(v)
1160 row.extend(x)
1161 self.header.extend(colname)
1163 else:
1164 for row in self.values:
1165 v = self._interpret_row(row)
1166 x = function(v)
1167 for i, _ in enumerate(x):
1168 row.insert(position + i, _)
1169 for i, c in enumerate(colname):
1170 self.header.insert(position + i, c)
1171 self.index = {v: i for i, v in enumerate(self.header)}
1172 return self
1174 def add_column_index(self, colname="index", start=0):
1175 """
1176 Example:
1178 ::
1180 table.add_column("index_row")
1181 """
1182 self.index[colname] = len(self.index)
1183 for i, row in enumerate(self.values):
1184 row.append(i + start)
1185 self.header.append(colname)
1186 return self
1188 def addc(self, colname, function, position=-1):
1189 """
1190 @see me add_column
1191 """
1192 return self.add_column(colname, function, position)
1194 def add_column_recursive(self, colname, functionValue, functionAgg):
1195 """
1196 Example:
1198 ::
1200 table.add_column_recursive(lambda v: v ["norm_%s" % loi],
1201 lambda li, v: li[-1] + v)
1202 """
1203 self.index[colname] = len(self.index)
1204 values = []
1205 for row in self.values:
1206 v = self._interpret_row(row)
1207 x = functionValue(v)
1208 y = functionAgg(values, x)
1209 row.append(y)
1210 values.append(y)
1211 self.header.append(colname)
1212 return self
1214 def add_column_recursive_row(self, colname, functionAgg):
1215 """
1216 Example:
1218 ::
1220 table.add_column_recursive_row("w_%s" % loi,
1221 lambda li, v: li[-1] + v ["norm_%s" % loi] \
1222 if len(li)> 0 else v ["norm_%s" % loi])
1223 """
1224 self.index[colname] = len(self.index)
1225 values = []
1226 for row in self.values:
1227 v = self._interpret_row(row)
1228 y = functionAgg(values, v)
1229 row.append(y)
1230 values.append(y)
1231 self.header.append(colname)
1232 return self
1234 def add_column_vector(self, colname, vector):
1235 """
1236 add a column defined by vector(list of values for each row)
1238 :param colname: column to add
1239 :param vector: (list) list of values to add to each row
1240 :return self
1241 """
1242 if len(vector) != len(self):
1243 raise ValueError( # pragma: no cover
1244 f"vector and table have different length {len(vector)} != {len(self)}")
1245 for vec, row in zip(vector, self.values):
1246 row.append(vec)
1247 self.index[colname] = len(self.index)
1248 self.header.append(colname)
1249 return self
1251 def add_column_smooth(self, colname, function, position, weights):
1252 """
1253 Example:
1255 ::
1257 x = 1./3
1258 table.add_column_smooth("has_A_smooth", lambda v: v["has_A"], [-1,0,1], [x,x,x])
1259 """
1260 if len(position) != len(weights):
1261 raise ValueError( # pragma: no cover
1262 "position and weights must have the same length")
1263 self.index[colname] = len(self.index)
1264 column = [function(self._interpret_row(row)) for row in self.values]
1265 tw = sum(weights)
1266 couple = list(zip(position, weights))
1267 for p, row in enumerate(self.values):
1268 sx = 0.
1269 sw = 0.
1270 ms = 0
1271 for i, w in couple:
1272 pi = p + i
1273 if 0 <= pi < len(self):
1274 sx += column[pi] * w
1275 sw += w
1276 else:
1277 ms += 1
1279 if ms == 0:
1280 row.append(sx)
1281 elif sw != 0:
1282 row.append(sx * tw / sw)
1283 else:
1284 row.append(sx)
1285 self.header.append(colname)
1286 return self
1288 def aggregate_column(self, colname, aggregated_function=sum):
1289 """
1290 Example:
1292 ::
1294 total = table.aggregate_column("d_c", sum)
1295 """
1296 def function(v):
1297 return v[colname]
1298 return self.aggregate(function, aggregated_function)
1300 def aggregate(self, function, aggregated_function=sum):
1301 """
1302 Example:
1304 ::
1306 total = table.aggregate_column(lambda v: v["d_c"], len)
1307 """
1308 return aggregated_function([function(self._interpret_row(row)) for row in self.values])
1310 def where(self, condition_function):
1311 """
1312 @see me filter
1313 """
1314 return self.filter(condition_function)
1316 def filter(self, condition_function):
1317 """
1318 Example:
1320 ::
1322 fil = table.filter(lambda v: v["d_b"] == 2)
1324 @warning Rows are not copied.
1325 """
1326 newv = []
1327 for row in self.values:
1328 v = self._interpret_row(row)
1329 x = condition_function(v)
1330 if x:
1331 newv.append(row)
1332 final = self._private_getclass()(self.header, newv)
1333 return final
1335 def groupby_implicit(self, functionKey, functionWeight=None, logging=None):
1336 """
1337 use prefix of a column name to know which function to use
1338 as an aggregated(sum, avg, len, key, none, max, min)
1339 Example:
1341 ::
1343 group = table.groupby_implicit(lambda v: v["name"])
1344 """
1345 def identical(col, v):
1346 return v[col]
1348 def first(vec):
1349 return vec[0]
1351 def avg(vec):
1352 return TableFormula.ratio(sum(vec), len(vec))
1354 functions = []
1355 labels = ["key"]
1356 functionsAgg = []
1357 for col in self.header:
1358 if col.startswith("key"):
1359 values = self.select(
1360 lambda v, col=col: (v[col], functionKey(v)))
1361 dd = {}
1362 for v in values:
1363 if v[1] not in dd:
1364 dd[v[1]] = {}
1365 dd[v[1]][v[0]] = 1
1366 for k in dd:
1367 dd[k] = len(dd[k]) # pylint: disable=E4702
1368 keep = []
1369 for k, v in dd.items():
1370 if v > 1:
1371 keep.append((k, v))
1373 if len(keep) == 0:
1374 functions.append(lambda v, col=col: identical(col, v))
1375 labels.append(col)
1376 functionsAgg.append(first)
1377 elif logging is not None:
1378 end = min(len(keep), 10)
1379 mes = ",".join([str(_) for _ in keep[:end]])
1380 logging("removing column '{0}' no unique value: {1}: {2}".format(
1381 col, len(dd), mes))
1382 elif col.startswith("sum"):
1383 functions.append(lambda v, col=col: identical(col, v))
1384 labels.append(col)
1385 functionsAgg.append(sum)
1386 elif col.startswith("len"):
1387 functions.append(lambda v, col=col: 1)
1388 labels.append(col)
1389 functionsAgg.append(len)
1390 elif col.startswith("min"):
1391 functions.append(lambda v, col=col: 1)
1392 labels.append(col)
1393 functionsAgg.append(min)
1394 elif col.startswith("max"):
1395 functions.append(lambda v, col=col: 1)
1396 labels.append(col)
1397 functionsAgg.append(max)
1398 elif col.startswith("avg"):
1399 functions.append(lambda v, col=col: identical(col, v))
1400 labels.append(col)
1401 functionsAgg.append(avg)
1402 elif col.startswith("none"):
1403 pass
1404 else:
1405 raise RuntimeError("unable to aggregate column " + col)
1407 return self.groupby(functionKey, functions, labels, functionsAgg, functionWeight)
1409 def groupby(self, functionKey, functionsValue, columns=None, functionsAgg=None, functionWeight=None):
1410 """
1411 Example:
1413 ::
1415 group = table.groupby(lambda v: v["name"],
1416 [lambda v: v["d_a"],
1417 lambda v: v["d_b"]],
1418 ["name", "sum_d_a", "sum_d_b"])
1420 or
1422 ::
1424 groupmax = table.groupby(lambda v: v["name"],
1425 [lambda v: v["d_a"],
1426 lambda v: v["d_b"]],
1427 ["name", "max_d_a", "max_d_b"],
1428 [max, max])
1429 """
1430 if not isinstance(functionsValue, list):
1431 functionsValue = [functionsValue]
1432 if functionsAgg is None:
1433 functionsAgg = [sum for f in functionsValue]
1434 if functionWeight is None:
1435 if columns is not None and len(columns) != len(functionsValue) + 1:
1436 raise RuntimeError("columns should have %d names not(%d)" % (
1437 len(functionsValue) + 1, len(columns)))
1438 else:
1439 if columns is not None and len(columns) != len(functionsValue) + 2:
1440 raise RuntimeError("columns should have %d names not(%d)" % (
1441 len(functionsValue) + 2, len(columns)))
1442 if columns is not None and not isinstance(columns[0], str):
1443 raise TypeError("expecting type str not %s in columns" %
1444 (str(type(columns[0]))))
1446 hist = {}
1447 if functionWeight is not None:
1448 histWeight = {}
1450 for row in self.values:
1451 v = self._interpret_row(row)
1452 key = functionKey(v)
1453 w = 1. if functionWeight is None else functionWeight(v)
1455 if key not in hist:
1456 histWeight[key] = [w]
1457 hist[key] = [[f(v) * w] for f in functionsValue]
1458 else:
1459 histWeight[key].append(w)
1460 h = hist[key]
1461 for i, f in enumerate(functionsValue):
1462 h[i].append(f(v) * w)
1464 for key in hist: # pylint: disable=C0206
1465 h = hist[key]
1466 w = sum(histWeight[key])
1467 for i in range(0, len(h)):
1468 h[i] = functionsAgg[i](h[i], w)
1470 f = hist.items if sys.version_info.major >= 3 else hist.items
1471 histValues = [[k, sum(histWeight[k])] + v for k, v in f()]
1473 if columns is None:
1474 columns = ["key", "weight"] + ["val%d" %
1475 i for i, f in enumerate(functionsValue)]
1476 ret = self._private_getclass()(columns, histValues)
1477 return ret
1478 else:
1479 for row in self.values:
1480 v = self._interpret_row(row)
1481 key = functionKey(v)
1482 if key not in hist:
1483 hist[key] = [[f(v)] for f in functionsValue]
1484 else:
1485 h = hist[key]
1486 for i, f in enumerate(functionsValue):
1487 h[i].append(f(v))
1489 for key in hist: # pylint: disable=C0206
1490 h = hist[key]
1491 for i in range(0, len(h)):
1492 h[i] = functionsAgg[i](h[i])
1494 f = hist.items if sys.version_info.major >= 3 else hist.items
1495 histValues = [[k] + v for k, v in f()]
1497 if columns is None:
1498 columns = ["key"] + ["val%d" %
1499 i for i, f in enumerate(functionsValue)]
1500 ret = self._private_getclass()(columns, histValues)
1501 return ret
1503 def sort(self, functionValue, reverse=False):
1504 """
1505 Example:
1507 ::
1509 table.sort(lambda v: v["d_b"] + v["d_c"])
1510 """
1511 values = [(functionValue(self._interpret_row(row)), i)
1512 for i, row in enumerate(self.values)]
1513 values.sort(reverse=reverse)
1514 self.values = [self.values[_[1]] for _ in values]
1515 return self
1517 def extract_columns(self, listColumns):
1518 """
1519 extract some columns
1521 @param listColumns list of columns to remove or a function
1522 which returns True if the column has to be extracted
1523 based on its name
1524 @return table
1526 Example:
1528 ::
1530 ext = table.extract_columns(["name", "d_a"])
1531 """
1532 if isinstance(listColumns, list):
1533 indexes = [(self.index[col] if isinstance(col, str) else col)
1534 for col in listColumns]
1535 header = listColumns
1536 values = [[row[i] for i in indexes] for row in self.values]
1537 return self._private_getclass()(header, values)
1538 else:
1539 header = [_ for _ in self.header if listColumns(_)]
1540 return self.extract_columns(header)
1542 def remove_columns(self, listColumns):
1543 """
1544 remove some columns
1546 @param listColumns list of columns to remove or a function
1547 which returns True if the column has to be removed
1548 based on its name
1549 @return table
1551 Example:
1553 ::
1555 rem = table.remove("d_a")
1556 """
1557 if isinstance(listColumns, list):
1558 cols = [_ for i, _ in enumerate(
1559 self.header) if _ not in listColumns and i not in listColumns]
1560 return self.extract_columns(cols)
1561 if isinstance(listColumns, str):
1562 cols = [_ for _ in self.header if _ != listColumns]
1563 return self.extract_columns(cols)
1564 cols = [_ for _ in self.header if not listColumns(_)]
1565 return self.extract_columns(cols)
1567 def innerjoin(self, table, functionKey1, functionKey2, nameKey="key",
1568 addSuffixAnyWay=False, prefixToAdd=None, full=False,
1569 keepKey=True, putKeyInColumn=None, missingValue=None,
1570 uniqueKey=False):
1571 """
1572 @param table other table to join with
1573 @param functionKey1 key for the first table(a function)
1574 @param functionKey2 key for the second table(a function) innerjoin .... ON ...
1575 @param addSuffixAnyWay add a suffix to every column from the second table even
1576 if names are different(suffix is "+")
1577 @param prefixToAdd prefix to add the the columns of the second table
1578 @param full add all items even if there is no common keys(``FULL OUTER JOIN``),
1579 otherwise keep only common keys
1580 @param keepKey keep the key as a column in the result(column is key), otherwise not
1581 @param putKeyInColumn private parameter: keepKey has to be true and in this case,
1582 put the key in this column(integer)
1583 @param missingValue when there is not key on one side, this default value will be put in place
1584 @param uniqueKey if True, the function assumes there is a bijection between rows
1585 and keys(one row <--> one key) on both tables,
1586 otherwise, it will not.
1587 @return a table
1589 Example:
1591 ::
1593 innerjoin = table.innerjoin(group, lambda v: v["name"],
1594 lambda v: v["name"], "group")
1595 """
1596 defaultVal1 = [missingValue for k in self.header]
1597 defaultVal2 = [missingValue for k in table.header]
1599 if uniqueKey:
1600 keys = {}
1601 for row in self.values:
1602 v = self._interpret_row(row)
1603 key = functionKey1(v)
1604 keys[key] = (row, None)
1606 for row in table.values:
1607 v = table._interpret_row(row)
1608 key = functionKey2(v)
1609 if key in keys:
1610 keys[key] = (keys[key][0], row)
1611 elif full:
1612 keys[key] = (None, row)
1614 if not full:
1615 d = []
1616 for k, v in keys.items():
1617 if None in v:
1618 d.append(k)
1619 for _ in d:
1620 del keys[_]
1621 else:
1622 for k in keys: # pylint: disable=C0206
1623 v = keys[k]
1624 if v[0] is None:
1625 keys[k] = (defaultVal1, v[1]) # pylint: disable=E4702
1626 elif v[1] is None:
1627 keys[k] = (v[0], defaultVal2) # pylint: disable=E4702
1629 if keepKey:
1630 columns = [nameKey]
1631 for x in self.header:
1632 while x in columns:
1633 x += "~"
1634 columns.append(x)
1636 for x in table.header:
1637 if prefixToAdd is not None:
1638 x = prefixToAdd + x
1639 elif addSuffixAnyWay:
1640 x += "+"
1641 while x in columns:
1642 x += "+"
1643 columns.append(x)
1645 f = keys.items if sys.version_info.major >= 3 else keys.items
1646 values = [[k] + v[0] + v[1] for k, v in f() if len(v) == 2]
1647 return self._private_getclass()(columns, values)
1648 else:
1649 columns = []
1650 for x in self.header:
1651 while x in columns:
1652 x += "~"
1653 columns.append(x)
1655 for x in table.header:
1656 if prefixToAdd is not None:
1657 x = prefixToAdd + x
1658 elif addSuffixAnyWay:
1659 x += "+"
1660 while x in columns:
1661 x += "+"
1662 columns.append(x)
1664 f = keys.items if sys.version_info.major >= 3 else keys.items
1666 if putKeyInColumn is None:
1667 values = [v[0] + v[1] for k, v in f() if len(v) == 2]
1668 else:
1669 values = []
1670 for k, v in f():
1671 if len(v) == 2:
1672 nr = v[0] + v[1]
1673 nr[putKeyInColumn] = k
1674 values.append(nr)
1676 return self._private_getclass()(columns, values)
1677 else:
1678 keys = {}
1679 for row in self.values:
1680 v = self._interpret_row(row)
1681 key = functionKey1(v)
1682 if key in keys:
1683 keys[key][0].append(row)
1684 else:
1685 keys[key] = ([row], None)
1687 for row in table.values:
1688 v = table._interpret_row(row)
1689 key = functionKey2(v)
1690 if key in keys:
1691 if keys[key][1] is None:
1692 keys[key] = (keys[key][0], [row])
1693 else:
1694 keys[key][1].append(row)
1695 elif full:
1696 keys[key] = (None, [row])
1698 if not full:
1699 d = []
1700 for k, v in keys.items():
1701 if None in v:
1702 d.append(k)
1703 for _ in d:
1704 del keys[_]
1705 else:
1706 for k in keys: # pylint: disable=C0206
1707 v = keys[k]
1708 if v[0] is None:
1709 keys[k] = ([defaultVal1], v[1]
1710 ) # pylint: disable=E4702
1711 elif v[1] is None:
1712 keys[k] = (v[0], [defaultVal2]
1713 ) # pylint: disable=E4702
1715 if keepKey:
1716 columns = [nameKey]
1717 for x in self.header:
1718 while x in columns:
1719 x += "~"
1720 columns.append(x)
1722 for x in table.header:
1723 if prefixToAdd is not None:
1724 x = prefixToAdd + x
1725 elif addSuffixAnyWay:
1726 x += "+"
1727 while x in columns:
1728 x += "+"
1729 columns.append(x)
1731 f = keys.items if sys.version_info.major >= 3 else keys.items
1733 values = []
1734 for k, v in f():
1735 if len(v) == 2:
1736 for ka in v[0]:
1737 for kb in v[1]:
1738 values.append([k] + ka + kb)
1739 return self._private_getclass()(columns, values)
1740 else:
1741 columns = []
1742 for x in self.header:
1743 while x in columns:
1744 x += "~"
1745 columns.append(x)
1747 for x in table.header:
1748 if prefixToAdd is not None:
1749 x = prefixToAdd + x
1750 elif addSuffixAnyWay:
1751 x += "+"
1752 while x in columns:
1753 x += "+"
1754 columns.append(x)
1756 f = keys.items if sys.version_info.major >= 3 else keys.items
1758 if putKeyInColumn is None:
1759 values = [v[0] + v[1] for k, v in f() if len(v) == 2]
1760 else:
1761 values = []
1762 for k, v in f():
1763 if len(v) == 2:
1764 for ka in v[0]:
1765 for kb in v[1]:
1766 nr = ka + kb
1767 nr[putKeyInColumn] = k
1768 values.append(nr)
1770 return self._private_getclass()(columns, values)
1772 def filter_quantile(self, function, alpha_min=0.025, alpha_max=0.025):
1773 """
1774 sort all rows using criteria defined by function and remove
1775 rows at the extremes
1777 @param function values used to estimate the quantiles
1778 @param alpha_min lower quantile
1779 @param alpha_max higher quantile
1780 @return a table containing all the rows where the criterium
1781 is within the two quantiles
1783 Example:
1785 ::
1787 fil = table.filter_quantile(lambda v: v["d_b"], 0, 0.4)
1789 @warning Rows are not copied.
1790 """
1791 values = []
1792 for row in self.values:
1793 v = self._interpret_row(row)
1794 val = function(v)
1795 values.append((val, row))
1796 values.sort()
1797 lv = len(values)
1798 i1 = int(lv * alpha_min + 0.5)
1799 i2 = int(lv * (1 - alpha_max) + 0.5)
1800 i1 = max(i1, 0)
1801 i1 = min(i1, lv)
1802 i2 = max(i1, i2)
1803 i2 = min(i2, lv)
1804 if i2 == i1:
1805 raise RuntimeError("unable to extract quantile, the table is either "
1806 "empty or chosen quantile are not correct")
1807 values = [_[1] for _ in values[i1:i2]]
1808 return self._private_getclass()(self.header, values)
1810 def union(self, table):
1811 """
1812 @param table table
1813 @return table(with the same number of columns)
1815 concatenates two tables by rows, they must have the same header, rows of both tables are merged into a single matrix
1816 Example:
1818 ::
1820 union = table.union(table2)
1821 """
1822 if len(self.header) != len(table.header):
1823 raise ValueError( # pragma: no cover
1824 "tables do not have the same number of columns\ntbl1: %s\ntbl2: %s" % (
1825 ",".join(self.header), ",".join(table.header)))
1826 for a, b in zip(self.header, table.header):
1827 if a != b:
1828 raise ValueError( # pragma: no cover
1829 "tables do not have the same column names")
1830 return self._private_getclass()(self.header, self.values + table.values)
1832 def concatenate(self, table, addPrefix=""):
1833 """
1834 concatenates two tables by columns
1835 @param table table
1836 @param addPrefix add a prefix to each column from table
1837 @return table (with the same number of rows as the longest one)
1838 """
1839 maxr = max(len(self), len(table))
1840 header = self.header + [addPrefix + h for h in table.header]
1841 values = []
1842 for i in range(0, maxr):
1843 r1 = self.values[i] if i < len(self) else [None] * len(self.header)
1844 r2 = table.values[i] if i < len(
1845 table) else [None] * len(self.table)
1846 values.append(r1 + r2)
1847 return self._private_getclass()(header, values)
1849 def random(self, n, unique=False):
1850 """
1851 select n random row from the table, returns a table
1853 @param n number of desired random rows
1854 @param unique draws unique rows or non unique rows
1855 (tirage sans remise ou avec remise)
1856 @return a table
1858 Example:
1860 ::
1862 rnd = table.random(10)
1863 """
1864 if unique:
1865 if n > len(self):
1866 raise ValueError( # pragma: no cover
1867 "number of desired random rows is higher "
1868 "than the number of rows in the table")
1869 index = {}
1870 while len(index) < n:
1871 h = random.randint(0, len(self) - 1)
1872 index[h] = 0
1873 values = [self.values[h] for h in index]
1874 return self._private_getclass()(self.header, values)
1875 else:
1876 values = []
1877 for i in range(0, n):
1878 h = random.randint(0, len(self) - 1)
1879 values.append(self.values[h])
1880 return self._private_getclass()(self.header, values)
1882 def todict(self, functionKey, functionValue, useList=False):
1883 """
1884 convert the table as a dictionary { key:value }
1885 each of them is defined by functions.
1887 @param functionKey defines the key
1888 @param functionValue defines the value
1889 @param useList if there are multiple rows sharing the same key, it should be true,
1890 all values are stored in a list
1891 @return a dictionary { key:row } or { key: [row1, row2, ...] }
1893 Example:
1895 ::
1897 d = table.todict(lambda v: v["name"], lambda v: v["d_b"], True)
1898 """
1899 res = {}
1900 if useList:
1901 for row in self.values:
1902 v = self._interpret_row(row)
1903 key = functionKey(v)
1904 val = functionValue(v)
1905 if key in res:
1906 res[key].append(val)
1907 else:
1908 res[key] = [val]
1909 else:
1910 for row in self.values:
1911 v = self._interpret_row(row)
1912 key = functionKey(v)
1913 val = functionValue(v)
1914 res[key] = val
1915 return res
1917 def reduce_dict(self, functionKey, functionValue, uselist=False):
1918 """
1919 @see me todict
1920 """
1921 return self.todict(functionKey, functionValue, uselist)
1923 def select(self, functionRow):
1924 """
1925 @param functionRow fonction
1926 @return table
1928 Example:
1930 ::
1932 d = table.select(lambda v:(v["name"], v["d_b"]))
1933 print(list(d))
1934 """
1935 for row in self.values:
1936 v = self._interpret_row(row)
1937 nr = functionRow(v)
1938 yield nr
1940 def modify_all(self, modification_function):
1941 """
1942 apply the same modification to every number
1943 @param modification_function modification to apply to every number
1944 @return new table
1946 The signature of the function is the following one:
1948 ::
1950 def function(value, column_name):
1951 # ....
1952 return new_value
1954 Example:
1956 ::
1958 tbl = tbl.modify_all(lambda v,c: {"string":"", "numerical":0}.get(c,None) if v is None else v)
1959 """
1960 values = []
1961 for row in self.values:
1962 r = []
1963 for v, h in zip(row, self.header):
1964 r.append(modification_function(v, h))
1965 values.append(r)
1966 return self._private_getclass()(self.header, values)
1968 def dcast(self, functionKey, functionInstance, full=True):
1969 """
1970 @see me multiply_column_by_row_instance
1971 """
1972 return self.multiply_column_by_row_instance(functionKey, functionInstance, full)
1974 def multiply_column_by_row_instance(self, functionKey, functionInstance, full=True):
1975 """
1976 @param functionKey defines a key(function)
1977 @param functionInstance defines a second key(will be moved to the columns dimension)
1978 @param full introduces missing values for not found combinations
1979 @return a table
1981 If a column contains a finite set of value, for example,
1982 we have the temperature for several cities organized like if
1983 it were a table from a database: city, date, temperatue.
1984 We would like to get another table where we have:
1985 date temparature_city1 temperature_city2...
1987 Then we would type:
1988 Example:
1990 ::
1992 mul = table.multiply_column_by_row_instance(
1993 lambda v: v["date"],
1994 lambda v: v["city"])
1996 The input table would be like:
1998 ::
2000 city date
2001 A jan
2002 A feb
2003 B feb
2005 It returns:
2007 ::
2009 KEY A|city A|date B|city B|date
2010 feb A feb B feb
2011 jan A jan None None
2012 """
2013 values = [functionInstance(self._interpret_row(row))
2014 for row in self.values]
2015 distinct = {}
2016 for v in values:
2017 distinct[v] = 0
2018 distinct = [_ for _ in distinct]
2019 distinct.sort()
2020 table1 = copy.deepcopy(self)
2021 table = None
2022 header = copy.copy(table1.header)
2023 orig = len(header)
2024 nameKey = "~KEY~"
2025 while nameKey in header:
2026 nameKey += "*"
2027 nbJoin = 0
2029 for val in distinct:
2030 table2 = table1.filter(
2031 lambda v, val=val: functionInstance(v) == val)
2032 if table is None:
2033 table = table2.copy()
2034 else:
2035 colkey = table.header[0]
2036 table = table.innerjoin(table2, functionKey if nbJoin == 0 else (lambda v, c=colkey: v[c]),
2037 functionKey, nameKey=nameKey,
2038 prefixToAdd=str(val) + "|",
2039 full=full, keepKey=nbJoin == 0,
2040 putKeyInColumn=None if nbJoin == 0 else 0,
2041 uniqueKey=True)
2043 if nbJoin == 0:
2044 head = []
2045 nb = 0
2046 for h in table.header:
2047 if not h.endswith("~") and nb < orig:
2048 head.append(f"{distinct[0]}|{h}")
2049 nb += 1
2050 else:
2051 head.append(h)
2052 header = ["KEY"] + head[1:]
2053 table = self._private_getclass()(header, table.values)
2055 nbJoin += 1
2057 if nbJoin == 0:
2058 head = []
2059 nb = 0
2060 for h in table.header:
2061 if not h.endswith("~") and nb < orig:
2062 head.append(f"{distinct[0]}|{h}")
2063 nb += 1
2064 else:
2065 head.append(h)
2066 values = []
2067 for row in self.values:
2068 v = self._interpret_row(row)
2069 r = [functionKey(v)] + row
2070 values.append(r)
2071 header = ["KEY"] + head
2072 table = self._private_getclass()(header, values)
2074 return table
2076 def create_index(self, functionIndex):
2077 """
2078 this method creates an index,
2079 to get an indexes row, use method get
2080 Example:
2082 ::
2084 table.create_index(lambda v:(v["name"], v["d_a"]))
2085 row = table.get(('A', 1.1))
2086 value = table.get(('A', 1.1), 2)
2087 """
2088 self.indexspecial = {}
2089 for row in self.values:
2090 v = self._interpret_row(row)
2091 nr = functionIndex(v)
2092 if nr in self.indexspecial:
2093 raise KeyError(
2094 f"unable to add {str(nr)} because it is already present")
2095 self.indexspecial[nr] = row
2096 return self
2098 def get(self, rowIndex, column=None):
2099 """
2100 use the index created by method create_index
2101 Example:
2103 ::
2105 table.create_index(lambda v:(v["name"], v["d_a"]))
2106 row = table.get(('A', 1.1))
2107 value = table.get(('A', 1.1), 2)
2108 """
2109 if "indexspecial" not in self.__dict__:
2110 raise RuntimeError("no index was created")
2111 row = self.indexspecial[rowIndex]
2112 if column is None:
2113 return row
2114 elif isinstance(column, int):
2115 return row[column]
2116 else:
2117 return row[self.index[column]]
2119 def avg_std(self, functionValue, functionWeight=lambda v: 1):
2120 """
2121 returns the average and standard deviation
2122 """
2123 avg = 0.
2124 std = 0.
2125 n = 0.
2126 for i, row in enumerate(self.values):
2127 v = self._interpret_row(row)
2128 x = float(functionValue(v))
2129 w = functionWeight(v)
2130 avg += x * w
2131 std += x * x * w
2132 n += w
2134 if n != 0:
2135 avg /= n
2136 std /= n
2137 std -= avg * avg
2138 std = math.sqrt(std)
2139 else:
2140 avg = 0.
2141 std = 0.
2142 return avg, std
2144 def add_column_cumulative(self, column_index, column_name, functionIndex, functionValue,
2145 normalize=False, reverse=False, cumulative=True, functionSort=None):
2146 """
2147 also called the Gini function
2148 Example:
2150 ::
2152 table.add_column_cumulative("index_%s" % col, "dist_%s" % col,
2153 lambda v: v["sum_nbclient"], lambda v: v[col],
2154 functionSort = lambda v: v [col] / v["sum_nbclient"],
2155 normalize=True)
2156 """
2157 if functionSort is None:
2158 functionSort = functionValue
2159 val = []
2160 for row in self.values:
2161 v = self._interpret_row(row)
2162 i = functionIndex(v)
2163 s = functionSort(v)
2164 v = functionValue(v)
2165 val.append((s, i, v))
2166 val.sort(reverse=reverse)
2168 if cumulative:
2169 res = [(0., 0.)]
2170 for s, i, v in val:
2171 res.append((i + res[-1][0], v + res[-1][1]))
2172 del res[0]
2174 if normalize:
2175 sumi = res[-1][0]
2176 sumv = res[-1][1]
2177 if sumi != 0 and sumv != 0:
2178 res = [(_[0] / sumi, _[1] / sumv) for _ in res]
2179 else:
2180 raise ZeroDivisionError(
2181 "cannot divide by zero, all indexes or all values are null")
2182 else:
2183 res = [(i, v) for s, i, v in val]
2185 if normalize:
2186 sumi = sum([_[0] for _ in res])
2187 sumv = sum([_[1] for _ in res])
2188 if sumi != 0 and sumv != 0:
2189 res = [(_[0] / sumi, _[1] / sumv) for _ in res]
2190 else:
2191 raise ZeroDivisionError(
2192 "cannot divide by zero, all indexes or all values are null")
2194 for row, add in zip(self.values, res):
2195 row.extend(add)
2196 self.index[column_index] = len(self.index)
2197 self.index[column_name] = len(self.index)
2198 self.header.append(column_index)
2199 self.header.append(column_name)
2200 return self
2202 def transpose(self, labelC=None, labelAsRow=True):
2203 """
2204 Computes the transpose.
2205 @param labelC proposes labels for the column,
2206 if None, take "r%d" % i,
2207 if it is a string, the function assumes it is a column name
2208 @param labelAsRow add the label as a row
2209 @return new table
2210 """
2211 if labelC is None:
2212 label = ["r%d" % i for i in range(0, len(self.values))]
2213 if labelAsRow:
2214 label = ["rowheader"] + label
2215 rem = None
2216 elif isinstance(labelC, str):
2217 label = list(self.select(lambda v: v[labelC]))
2218 rem = label
2219 else:
2220 rem = None
2221 label = labelC
2223 values = []
2224 for i in range(0, len(self.header)):
2225 if rem is not None and self.header[i] == labelC:
2226 continue
2227 row = [_[i] for _ in self.values]
2228 if labelAsRow:
2229 row = [self.header[i]] + row
2230 values.append(row)
2231 return self._private_getclass()(label, values)
2233 def covariance(self):
2234 """
2235 Computes the covariance matrix, the first column
2236 will contains the column names.
2237 @return new table
2238 """
2239 for i, x in enumerate(self.values[0]):
2240 if not isinstance(x, float):
2241 raise TypeError("expecting a float on column %d" % i)
2242 values = self.np_matrix
2243 N = values.shape[0]
2244 sums = numpy.sum(values, axis=0) / N
2245 for i in range(0, values.shape[1]):
2246 values[:, i] -= sums[0, i]
2247 cov = values.transpose() * values
2248 cov /= N
2249 head = ["var"] + self.header
2250 size = cov.shape
2251 values = [[self.header[
2252 i]] + [float(cov[i, j]) for j in range(0, size[1])] for i in range(0, size[0])]
2253 tbl = self._private_getclass()(head, values)
2254 return tbl
2256 def correlation_col(self, col1, col2, noCenter=False):
2257 """
2258 Computes the correlation between two columns.
2259 @param col1 column 1
2260 @param col2 column 2
2261 @param noCenter does the computation without removing the average
2262 @return float(covariance)
2263 """
2264 values = [[self._interpret_row(row)[col1], self._interpret_row(row)[
2265 col2]] for row in self.values]
2266 if len(values) <= 1:
2267 raise ValueError( # pragma: no cover
2268 f"expecting more than one observation, not {len(values)}")
2269 mx = 0.
2270 my = 0.
2271 vx = 0.
2272 vy = 0.
2273 co = 0.
2274 nb = 0.
2275 for a, b in values:
2276 nb += 1
2277 mx += a
2278 my += b
2279 vx += a ** 2
2280 vy += b ** 2
2281 co += a * b
2282 mx /= nb
2283 my /= nb
2284 vx /= nb
2285 vy /= nb
2286 co /= nb
2287 if not noCenter:
2288 vx -= mx ** 2
2289 vy -= my ** 2
2290 co -= mx * my
2291 vx = vx ** 0.5
2292 vy = vy ** 0.5
2293 v = vx * vy
2294 if v != 0:
2295 co /= v
2296 return co
2298 def covariance_col(self, col1, col2, noCenter=False):
2299 """
2300 Computes the correlation between two columns.
2301 @param col1 column 1
2302 @param col2 column 2
2303 @param noCenter does the computation without removing the average
2304 @return float(covariance)
2305 """
2306 values = [[self._interpret_row(row)[col1],
2307 self._interpret_row(row)[col2]] for row in self.values]
2309 if len(values) <= 1:
2310 raise ValueError( # pragma: no cover
2311 f"expecting more than one observation, not {len(values)}")
2313 mx = 0.
2314 my = 0.
2315 co = 0.
2316 nb = 0.
2317 for a, b in values:
2318 nb += 1
2319 mx += a
2320 my += b
2321 co += a * b
2322 mx /= nb
2323 my /= nb
2324 co /= nb
2325 if not noCenter:
2326 co -= mx * my
2327 return co
2329 def correlation_row(self, row1, row2, noCenter=False):
2330 """
2331 computes the correlation between two columns
2332 @param row1 row 1(integer)
2333 @param row2 row 2(integer)
2334 @param noCenter does the computation without removing the average
2335 @return float(covariance)
2336 """
2337 values = [[a, b] for a, b in zip(self.values[row1], self.values[row2])]
2338 if len(values) <= 1:
2339 raise ValueError( # pragma: no cover
2340 f"expecting more than one observation, not {len(values)}")
2341 mx = 0.
2342 my = 0.
2343 vx = 0.
2344 vy = 0.
2345 co = 0.
2346 nb = 0.
2347 for a, b in values:
2348 nb += 1
2349 mx += a
2350 my += b
2351 vx += a ** 2
2352 vy += b ** 2
2353 co += a * b
2354 mx /= nb
2355 my /= nb
2356 vx /= nb
2357 vy /= nb
2358 co /= nb
2359 if not noCenter:
2360 vx -= mx ** 2
2361 vy -= my ** 2
2362 co -= mx * my
2363 vx = vx ** 0.5
2364 vy = vy ** 0.5
2365 v = vx * vy
2366 if v != 0:
2367 co /= v
2368 return co
2370 def covariance_row(self, row1, row2, noCenter=False):
2371 """
2372 computes the correlation between two columns
2373 @param row1 row 1(integer)
2374 @param row2 row 2(integer)
2375 @param noCenter does the computation without removing the average
2376 @return float(covariance)
2377 """
2378 values = [[a, b] for a, b in zip(self.values[row1], self.values[row2])]
2379 if len(values) <= 1:
2380 raise ValueError( # pragma: no cover
2381 f"expecting more than one observation, not {len(values)}")
2382 mx = 0.
2383 my = 0.
2384 co = 0.
2385 nb = 0.
2386 for a, b in values:
2387 nb += 1
2388 mx += a
2389 my += b
2390 co += a * b
2391 mx /= nb
2392 my /= nb
2393 co /= nb
2394 if not noCenter:
2395 co -= mx * my
2396 return co
2398 def correlation(self, useBootstrap=False, collapseFormat=True, nbdraws=-1, alpha=0.05,
2399 functionKeepValue=lambda val, low, high: f"{val:f}|{low:f},{high:f}"):
2400 """
2401 Computes the correlation matrix, the first column
2402 will contains the column names.
2404 @param useBootstrap if True, use a bootstrap method to estimate the correlation
2405 @param collapseFormat if True and useBootstrap is True, produces a format
2406 ``average|lower bound|higher bound(at a definite confidence level)``
2407 @param nbdraws number of draws(if -1, then it will be equal to the number of observations)
2408 @param alpha confidence level
2409 @param functionKeepValue if collapseFormat is True, this function is used to collapse val,low,high in a single string
2410 @return new table
2411 """
2412 if useBootstrap:
2413 head = ["var"] + self.header
2414 values = [[i] + [0. for r in self.header] for i in self.header]
2415 for i in range(len(self.header)):
2416 values[i][0] = self.header[i]
2417 for j in range(len(self.header)):
2418 vs = [[row[i], row[j]] for row in self.values]
2419 bo = TableFormula.bootstrap(vs, function=TableFormula.correlation_bicolumn,
2420 nbdraws=nbdraws, alpha=alpha)
2421 if collapseFormat:
2422 st = functionKeepValue(bo[0], bo[2], bo[3])
2423 values[i][j + 1] = st
2424 else:
2425 raise NotImplementedError( # pragma: no cover
2426 "collapseFormat False is not implemented yet")
2427 tbl = self._private_getclass()(head, values)
2428 return tbl
2429 else:
2430 for i, x in enumerate(self.values[0]):
2431 if not isinstance(x, float):
2432 raise TypeError( # pragma: no cover
2433 "expecting a float on column %d" % i)
2435 values = self.np_matrix
2436 N = values.shape[0]
2437 sums = [sum(values[:, i]) / N for i in range(0, values.shape[1])]
2439 for i in range(0, values.shape[1]):
2440 values[:, i] -= sums[i]
2442 cov = values.transpose() * values
2443 cov /= N
2444 diag = [cov[i, i] ** 0.5 for i in range(cov.shape[0])]
2445 for i in range(cov.shape[0]):
2446 if diag[i] > 0:
2447 cov[i, :] /= diag[i]
2448 cov[:, i] /= diag[i]
2450 head = ["var"] + self.header
2451 size = cov.shape
2452 values = [[self.header[
2453 i]] + [float(cov[i, j]) for j in range(0, size[1])] for i in range(0, size[0])]
2454 tbl = self._private_getclass()(head, values)
2455 return tbl
2457 def values_to_float(self, only_if_possible=False, subset_columns=None):
2458 """
2459 converts all values into float
2460 @param only_if_possible if True, converts all possible values and catches exception,
2461 if False, converts everything, raises an exception when not possible
2462 @param subset_columns if None, takes all of them, otherwise, try to convert
2463 only for the listed columns
2464 @return table
2465 """
2466 tbl = self.copy()
2467 if subset_columns is not None:
2468 subset = {i: True for i, v in enumerate(
2469 self.header) if v in subset_columns}
2470 if only_if_possible:
2471 for row in tbl.values:
2472 for i in range(0, len(row)):
2473 if subset_columns is None or i in subset:
2474 try:
2475 v = float(row[i])
2476 row[i] = v
2477 except (ValueError, TypeError):
2478 continue
2479 else:
2480 for row in tbl.values:
2481 for i in range(0, len(row)):
2482 if subset_columns is None or i in subset:
2483 row[i] = float(row[i])
2484 return tbl
2486 def values_to_str(self, subset_columns=None, format=None):
2487 """
2488 converts all values into str
2489 @param subset_columns if None, takes all of them, otherwise, try to convert
2490 only for the listed columns
2491 @param format format for the conversion, by None by default but it could be for exemple %1.2f.
2492 @return table
2493 """
2494 tbl = self.copy()
2495 if subset_columns is not None:
2496 subset = {i: True for i, v in enumerate(
2497 self.header) if v in subset_columns}
2499 if format is None:
2500 for row in tbl.values:
2501 for i in range(0, len(row)):
2502 if subset_columns is None or i in subset:
2503 row[i] = str(row[i])
2504 else:
2505 for row in tbl.values:
2506 for i in range(0, len(row)):
2507 if (subset_columns is None or i in subset) and isinstance(row[i], float):
2508 row[i] = format % row[i]
2509 return tbl
2511 def values_to_date(self, format=None, only_if_possible=False, subset_columns=None):
2512 """
2513 converts all values into dates
2514 @param only_if_possible if True, converts all possible values and catches exception,
2515 if False, converts everything, raises an exception when not possible
2516 @param format date format see fn str_to_datetime
2517 @param subset_columns if None, takes all of them, otherwise, try to convert
2518 only for the listed columns
2519 @return table
2520 """
2521 tbl = self.copy()
2522 if subset_columns is not None:
2523 subset = {i: True for i, v in enumerate(
2524 self.header) if v in subset_columns}
2525 if only_if_possible:
2526 if subset_columns is not None:
2527 subset = {i: True for i, v in enumerate(
2528 self.header) if v in subset_columns}
2530 for row in tbl.values:
2531 for i in range(0, len(row)):
2532 if subset_columns is None or i in subset:
2533 try:
2534 v = str2datetime(row[i], format)
2535 row[i] = v
2536 except (ValueError, TypeError):
2537 continue
2538 else:
2539 for row in tbl.values:
2540 for i in range(0, len(row)):
2541 if subset_columns is None or i in subset:
2542 row[i] = float(row[i])
2543 return tbl
2545 def histogram(self, functionValue, nbDiv=100, secondColumnIsWeight=False,
2546 normalize=True, removeExtreme=0.05):
2547 """
2548 computes an histograms on one vector
2549 @param functionValue function which produces the value to histogram
2550 @param nbDiv number of divisions for this histograms(boundaries are min and max)
2551 @param secondColumnIsWeight if True, the second column is the weight
2552 @param normalize if True, normalize by the sum of weights
2553 @param removeExtreme remove extreme values at both sides(0.05 means 0.025 on each side)
2554 @return table with two columns
2555 """
2556 values = list([functionValue(self._interpret_row(row)) # pylint: disable=R1728
2557 for row in self.values])
2558 if removeExtreme is not None and removeExtreme > 0:
2559 values.sort()
2560 al = int(len(values) * removeExtreme / 2)
2561 if al == 0:
2562 raise RuntimeError( # pragma: no cover
2563 "removeExtreme has no impact(%d,%f)" % (
2564 len(values), len(values) * removeExtreme / 2))
2565 if al * 2 < len(values):
2566 values = values[al:len(values) - al]
2568 mi = min(values)
2569 ma = max(values)
2571 if isinstance(values[0], (tuple, list)):
2572 W = 0.
2573 div = (ma[0] - mi[0]) / nbDiv
2574 hist = [[mi[0] + n * div, 0.] for n in range(0, nbDiv + 1)]
2575 for v in values:
2576 x = int((v[0] - mi[0]) // div)
2577 hist[x][1] += v[1]
2578 W += v[1]
2579 mi = mi[0]
2580 else:
2581 W = len(values)
2582 div = (ma - mi) / nbDiv
2583 hist = [[mi + n * div, 0.] for n in range(0, nbDiv + 1)]
2584 for v in values:
2585 x = int((v - mi) // div)
2586 if 0 <= x < len(hist):
2587 hist[x][1] += 1.
2589 if normalize and W > 0:
2590 for i in range(len(hist)):
2591 hist[i][1] /= W
2593 values = [[mi + n * div, hist[n]] for n in range(len(hist))]
2594 tbl = self._private_getclass()(["x", "hist(x)"], hist)
2595 return tbl
2597 def histograms(self, columnsSet, nbDiv=100, secondColumnIsWeight=False,
2598 normalize=True, removeExtreme=0.05, histxName="histKey"):
2599 """
2600 computes a common histograms on all columns
2601 @param columnsSet set of columns
2602 @param nbDiv number of divisions for this histograms(boundaries are min and max)
2603 @param secondColumnIsWeight if True, the second column is the weight
2604 @param normalize if True, normalize by the sum of weights
2605 @param removeExtreme remove extreme values at both sides(0.05 means 0.025 on each side)
2606 @param histxName column name given to the x axis shared by every histogram
2607 @return table with two columns
2609 @warning The function skips any NaN of Inf value.
2610 """
2611 values = []
2612 for row in self.values:
2613 temp = self._interpret_row(row)
2614 for t in columnsSet:
2615 values.append(temp[t])
2617 if removeExtreme is not None and removeExtreme > 0:
2618 values.sort()
2619 al = int(len(values) * removeExtreme / 2)
2620 if al == 0:
2621 raise RuntimeError( # pragma: no cover
2622 "removeExtreme has no impact(%d,%f)" % (
2623 len(values), len(values) * removeExtreme / 2))
2624 if al * 2 < len(values):
2625 values = values[al:len(values) - al]
2627 mi = min(values)
2628 ma = max(values)
2629 W = len(values)
2630 div = (ma - mi) / nbDiv
2631 if div == 0:
2632 raise RuntimeError( # pragma: no cover
2633 f"unable to continue since div is null: min,max = {mi:f},{ma:f}")
2634 hist = [[mi + n * div, 0.] for n in range(0, nbDiv + 1)]
2635 value = {i: {histxName: hist[i][0]} for i in range(len(hist))}
2636 su = {}
2637 for row in self.values:
2638 for _ in columnsSet:
2639 temp = self._interpret_row(row)
2640 if math.isnan(temp[_]) or math.isinf(temp[_]):
2641 continue
2642 x = int((temp[_] - mi) // div)
2643 if x not in value:
2644 # it means extremes were removed
2645 continue
2646 #raise RuntimeError("value %d,%f is not allowed min,max = [%f,%f]" %(x, temp[_], mi, ma))
2647 value[x][_] = value[x].get(_, 0.) + 1.
2648 su[_] = su.get(_, 0.) + 1.
2650 if normalize and W > 0:
2651 for v in value.values():
2652 for _ in v:
2653 if _ != histxName:
2654 v[_] /= su[_]
2656 tbl = self._private_getclass()("__byrow__", value)
2657 return tbl
2659 def union_columns(self, columnsSet):
2660 """
2661 computes the union of all values from all columns present in columnSet
2662 @param columnsSet set of columns
2663 @return table
2664 """
2665 values = []
2666 for row in self.values:
2667 temp = self._interpret_row(row)
2668 for t in columnsSet:
2669 values.append(temp[t])
2670 tbl = self._private_getclass()(["x"], [[x] for x in values])
2671 return tbl
2673 def mu_sigma(self, functionValues, removeExtreme=None):
2674 """
2675 computes the average and the standard deviation a vector of values
2676 @param functionValues function produces the vector of values
2677 @param removeExtreme remove extreme values at both sides(0.05 means 0.025 on each side)
2678 @return (average, standard deviation)
2679 """
2680 if removeExtreme is not None and removeExtreme > 0:
2681 values = []
2682 for row in self.values:
2683 row = self._interpret_row(row)
2684 val = functionValues(row)
2685 values.append(val)
2686 values.sort()
2687 al = int(len(values) * removeExtreme / 2)
2688 if al == 0:
2689 raise RuntimeError("removeExtreme has no impact(%d,%f)" % (
2690 len(values), len(values) * removeExtreme / 2))
2691 if al * 2 < len(values):
2692 values = values[al:len(values) - al]
2693 tbl = TableFormula(["x"], [[_] for _ in values])
2694 return tbl.mu_sigma(lambda v: v["x"], 0)
2695 else:
2696 mu = 0.
2697 si = 0.
2698 nb = 0.
2699 for row in self.values:
2700 row = self._interpret_row(row)
2701 val = functionValues(row)
2702 mu += val
2703 si += val ** 2
2704 nb += 1.
2705 mu /= nb
2706 si /= nb
2707 si -= mu ** 2
2708 return mu, si ** 0.5
2710 def mu_sigma_each_column(self, columnsSet=None, removeExtreme=None):
2711 """
2712 returns a table with the average and the standard deviation for each columns
2713 @param removeExtreme remove extreme values at both sides(0.05 means 0.025 on each side)
2714 @param columnsSet set of column to deal with
2715 @return table with two rows: average and standard deviation
2716 """
2717 values = [[], []]
2718 if columnsSet is None:
2719 columnsSet = self.header
2720 for col in columnsSet:
2721 mu, sigma = self.mu_sigma(
2722 (lambda v, col=col: v[col]), removeExtreme)
2723 values[0].append(mu)
2724 values[1].append(sigma)
2725 tbl = self._private_getclass()(columnsSet, values)
2726 return tbl
2728 @property
2729 def np_matrix(self):
2730 """
2731 returns the values as a numpy matrix
2732 @return numpy matrix
2733 """
2734 return numpy.matrix(self.values)
2736 @property
2737 def np_array(self):
2738 """
2739 returns the values as a numpy array
2740 @return numpy array
2741 """
2742 return numpy.array(self.values)
2744 @property
2745 def dataframe(self):
2746 """
2747 creates a pandas dataframe
2748 @return pandas.dataframe
2749 """
2750 return pandas.DataFrame(self.values, columns=self.header)
2752 @property
2753 def json(self):
2754 """
2755 returns a json format
2756 @return string
2757 """
2758 rows = [row for row in self]
2759 return json.dumps(rows)
2761 def center_reduce(self, columnsSet=None, op=None, removeExtreme=None, mu_sigma=None):
2762 """
2763 center and reduce a set of columns(or all if columnsSet is None)
2764 @param removeExtreme remove extreme values at both sides(0.05 means 0.025 on each side)
2765 @param columnsSet set of column to deal with
2766 @param op if can be:
2767 - None: substract mean and normalize,
2768 - "mean": substract mean only,
2769 - "norm": normalize only
2770 @param mu_sigma matrix with two rows(one for mean, second for sigma), if None,
2771 if computes that from the matrix self, columns must have the same order
2772 that columnSet
2773 @return the same table(with only the considered columns)
2774 """
2775 if op not in [None, "mean", "norm"]:
2776 raise ValueError( # pragma: no cover
2777 'expecting a value in [None, "mean", "norm"] for op')
2778 if columnsSet is None:
2779 columnsSet = self.header
2780 mus = self.mu_sigma_each_column(
2781 columnsSet, removeExtreme) if mu_sigma is None else mu_sigma
2782 tbl = self.extract_columns(columnsSet)
2783 n = len(self.header)
2784 for row in tbl.values:
2785 if op is None or op == "mean":
2786 for i in range(n):
2787 row[i] -= mus.values[0][i]
2788 if op is None or op == "norm":
2789 for i in range(n):
2790 row[i] /= mus.values[1][i]
2791 return tbl
2793 @staticmethod
2794 def save_multiple_as_excel(filename, list_table, font="Calibri", close=True, encoding=None):
2795 """
2796 saves multiple table in one Excel file
2798 @param filename filename(can be None)
2799 @param list_table list of 2uple("name", tbl)
2800 @param font font name
2801 @param close if True, close the file, otherwise, the user will have to
2802 @param encoding encoding
2803 @return object Workbook
2804 """
2805 ext = os.path.splitext(
2806 filename)[-1].lower() if filename is not None else None
2807 if ext is not None and ext == ".xls":
2808 font0 = EXf.Font()
2809 font0.name = font
2810 font0.bold = True
2811 style0 = EXs.XFStyle()
2812 style0.font = font0
2814 wb = EXw.Workbook(
2815 encoding=encoding) if encoding is not None else EXw.Workbook()
2816 for sheet_name, self in list_table:
2817 ws0 = wb.add_sheet(sheet_name)
2819 for i, l in enumerate(self.header):
2820 ws0.write(0, i, l, style0)
2822 fnt = EXf.Font()
2823 fnt.name = font
2824 style = EXs.XFStyle()
2825 style.font = fnt
2827 for irow, row in enumerate(self.values):
2828 for icol, val in enumerate(row):
2829 if isinstance(val, (int, float)):
2830 st = val
2831 elif isinstance(val, str):
2832 if encoding is not None:
2833 st = val.encode(encoding).decode(encoding)
2834 else:
2835 st = val
2836 elif val is not None:
2837 st = str(val)
2838 else:
2839 continue
2840 ws0.write(irow + 1, icol, st, style)
2842 wb.save(filename)
2843 return wb
2845 elif ext is None or ext == ".xlsx":
2846 wb = EXxw.Workbook(
2847 filename) if filename is not None else EXxw.Workbook()
2848 for sheet_name, self in list_table:
2849 ws0 = wb.add_worksheet(sheet_name)
2851 style0 = wb.add_format({'bold': True})
2852 style0.set_font_name(font)
2854 for i, l in enumerate(self.header):
2855 ws0.write(0, i, l, style0)
2857 style = wb.add_format()
2858 style.set_font_name(font)
2860 for irow, row in enumerate(self.values):
2861 for icol, val in enumerate(row):
2862 if isinstance(val, (int, float)):
2863 st = val
2864 elif isinstance(val, str):
2865 if encoding is not None:
2866 st = val.encode(encoding).decode(encoding)
2867 else:
2868 st = val
2869 elif val is not None:
2870 st = str(val)
2871 else:
2872 continue
2873 ws0.write(irow + 1, icol, st, style)
2875 if filename is not None and close:
2876 wb.close()
2877 return wb
2878 else:
2879 raise NameError(
2880 "extension should be .xls or .xlsx for file " + filename)
2882 def save_as_excel(self, filename, font="Calibri", sheet_name="sheet0",
2883 close=True, encoding=None):
2884 """
2885 saves the table as a new Excel file, you can use ``.xls`` or ``.xlsx``
2886 if filename is None, the function returns an object(xslx) and does not save it.
2888 @param filename Excel filename
2889 @param sheet_name name of the sheet to add
2890 @param font font name
2891 @param close if True, close the file, otherwise, the user will have to
2892 @param encoding encoding
2893 @return object Workbook
2894 """
2895 return TableFormula.save_multiple_as_excel(filename, [(sheet_name, self)],
2896 font=font, close=close, encoding=encoding)
2898 def schema_database(self, add_id=True):
2899 """
2900 returns the schema for a database which would contains this database
2902 @param add_id if True, adds an index "PRIMARYKEY"
2903 @return dictionary { index_column:(name, type) }
2904 """
2905 schema = {i: (l, str) for i, l in enumerate(self.header)}
2906 if add_id is not None:
2907 schema[-1] = (add_id, int, "PRIMARYKEY", "AUTOINCREMENT")
2909 if len(self) > 0:
2910 # we use the first row to determine type
2911 for i, v in enumerate(self.values[0]):
2912 if not isinstance(v, str):
2913 schema[i] = (schema[i][0], type(v))
2914 return schema
2916 def fill_sql_table(self, filename_or_database, tablename, add_id="idr"):
2917 """
2918 returns a Database object, creates the database if it does not exists,
2919 same for the table
2921 @param filename_or_database filename or Database object,
2922 in that second case, we assume method connect
2923 was called before
2924 @param tablename table name
2925 @param add_id if is not None, then the function adds an id, it first takes the
2926 max(id) and goes on incrementing it;
2927 @return Database object(new or the one from the parameters),
2928 in both case, the database is not disconnected
2929 """
2930 schema = self.schema_database(add_id)
2932 if isinstance(filename_or_database, str):
2933 fLOG("fill_sql_table: creating database ", filename_or_database)
2934 db = Database(filename_or_database, LOG=fLOG)
2935 db.connect()
2937 fLOG("fill_sql_table ", schema)
2938 if tablename not in db.get_table_list():
2939 fLOG("creationg of table ", schema)
2940 cursor = db.create_table(tablename, schema)
2941 db.append_values(self.values, tablename, schema, cursor=cursor)
2942 else:
2943 db.append_values(self.values, tablename, schema)
2944 else:
2945 db = filename_or_database
2946 db.append_values(self.values, tablename, schema)
2948 return db