Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf8 -*-
2"""
3@file
4@brief Implements TableFormula.
5"""
6import copy
7import os
8import sys
9import datetime
10import random
11import math
12import json
13import numpy
14import pandas
15from xlwt import Formatting as EXf
16from xlwt import Style as EXs
17import xlwt as EXw
18from xlsxwriter import workbook as EXxw
19from xlrd import open_workbook
20from pyquickhelper.loghelper import fLOG, noLOG
21from pyquickhelper.loghelper.convert_helper import str2datetime
22from pyensae.sql import Database
23from .table_formula_stat import _TableFormulaStat
26class TableFormula(_TableFormulaStat):
27 """
28 This class aims at representating a table, it provides
29 some "SQL like" functionalities such groupby or innerjoin, select, where...
30 This was a custom implementation of a DataFrame before I discover
31 `pandas <http://pandas.pydata.org/>`_.
33 The class provides an easy to go through the row table by converting each row
34 in a dictionary ``{ column_name: value }`` on the run. Example:
36 ::
38 tbl = TableFormula(...)
39 newtbl = tbl.filter(lambda v: v["criteria"] == 5)
41 See @see op __init__ for others ways to create a table.
43 @var header list of column names
44 @var values list of rows(each row contains as many value as the number of columns)
45 @var index dictionary { column name: position }, changing ``header`` means also changing ``header``.
47 Example:
49 ::
51 table = TableFormula("name d_a d_b d_c#A 1 2 3#A 1.1 2.1 3.1#B 3 4 5".replace(" ", "\\t").replace("#","\\n"))
52 print(table)
53 print("one value 0,1:", table[0,1])
55 print("---------")
56 dist = table.get_distinct_values("name")
57 for k in sorted(dist): print("*%d: %s"%(int(dist[k]),k))
59 print("---------")
60 table.add_column("has_A", lambda v: 1. if "A" in v["name"] else 0.)
61 print(table)
63 x = 1./3
64 print("------------- smoothing", x)
65 table.add_column_smooth("has_A_smooth", lambda v: v["has_A"], [-1,0,1], [x,x,x])
66 print(table)
68 print("--------- filter")
69 fil = table.filter(lambda v: v["d_b"] == 2)
70 print(fil)
72 print("--------- random")
73 rnd = table.random(5)
74 print(rnd)
76 print("--------- random unique")
77 rnd = table.random(1, True)
78 print(rnd)
80 print("--------- filter quantile")
81 fil = table.filter_quantile(lambda v: v["d_b"], 0, 0.4)
82 print(fil)
84 print("--------- aggregate_column")
85 total = table.aggregate(lambda v: v["d_c"])
86 print(total)
88 print("--------- sort")
89 table.sort(lambda v: v["d_b"] + v["d_c"])
90 print(table)
92 print("--------- union")
93 union = table.union(table)
94 print(union)
96 print("--------- group sum")
97 group = table.groupby(lambda v: v["name"],
98 [lambda v: v["d_a"],
99 lambda v: v["d_b"]],
100 ["name", "sum_d_a", "sum_d_b"])
101 print(group)
103 print("--------- group max")
104 groupmax = table.groupby(lambda v: v["name"],
105 [lambda v: v["d_a"],
106 lambda v: v["d_b"]],
107 ["name", "max_d_a", "max_d_b"],
108 [max, max])
109 print(groupmax)
111 print("--------- group sum with weights")
112 group = table.groupby(lambda v: v["name"],
113 [lambda v: v["d_a"]],
114 ["name", "weight", "sum_d_a"],
115 [lambda vec,w: sum(vec) / w],
116 lambda v: v ["d_b"])
118 print("--------- innerjoin")
119 innerjoin = table.innerjoin(group, lambda v: v["name"],
120 lambda v: v["name"], "group")
121 print(innerjoin)
123 print("------------- extraction")
124 ext = table.extract_columns(["name", "d_a"])
125 print(ext)
127 print("------------- remove")
128 ext = table.remove_columns(["d_a"])
129 print(ext)
131 print("------------- todict")
132 d = table.todict(lambda v: v["name"], lambda v: v["d_b"], True)
133 print(d)
135 print("------------- select")
136 d = table.select(lambda v:(v["name"], v["d_b"]))
137 print(list(d))
139 print("------------- use of an index")
140 table.create_index(lambda v:(v["name"], v["d_a"]))
141 row = table.get(('A', 1.1))
142 print(row)
143 value = table.get(('A', 1.1), 2)
144 print(value)
146 print("------------- multiply_column_by_row_instance ")
147 table = TableFormula("name d_a d_b d_c#A 1 2 3#A 1.1 2.1 3.1#B 3 4 5".replace(" ", "\\t").replace("#","\\n"))
148 table.add_column("key_add", lambda v:"unique")
149 print(table)
150 mul = table.multiply_column_by_row_instance(
151 lambda v: v["key_add"],
152 lambda v: v["name"])
153 print(mul)
155 if os.path.exists("BNP.PA.txt"):
156 print("--------------- financial stock")
157 table = TableFormula("BNP.PA.txt", sep=",")
158 table.sort(lambda v: v["Date"])
159 print(table[:10])
161 print("--------------- groupby_implicit")
162 table = TableFormula("key_name sum_a len_b avg_c#A 1 2 3#A 1.1 2.1 3.1#B 3 4 5".replace(" ", "\\t").replace("#","\\n"))
163 print(table)
164 gr = table.groupby_implicit(lambda v: v ["key_name"])
165 print(gr)
167 print("--------------- covariance")
168 values = [random.random() for i in range(0,100)]
169 values = [[x, x + random.random()/2] for x in values]
170 tbl = TableFormula(["x", "y"], values).values_to_float()
171 cov = tbl.covariance()
172 print(cov)
174 print("--------------- histogram")
175 hist = tbl.histogram(lambda v:(v["x"],1), 10)
176 print(hist)
178 print("--------------- histogram")
179 hist = tbl.values_to_float().histograms(["x", "y"], 10)
180 print(hist)
182 print("--------------- unions of columns")
183 hist = tbl.values_to_float().union_columns(["x", "y"])
184 print(hist)
185 """
187 @staticmethod
188 def add_header_if_not_present(filename, header, encoding=None, logFunction=noLOG):
189 """
190 the function checks if the first line contains the column in header
191 otherwise, it modifies the file and add them on the first line
193 @param filename filename
194 @param header list of column name(all strings)
195 @param encoding encoding
196 @param logFunction use this function to log information about what is happening
197 """
198 if encoding is None:
199 with open(filename, "r") as f:
200 firstline = f.readline().strip("\n\r ")
201 su = sum(map(lambda _: 1 if _ in header else 0, firstline.split("\t")))
202 if su < len(header) / 2.0:
203 logFunction("add_header_if_not_present: adding header({0}<{1}){2} to '{3}'\nfirstline:\n{4}".format(
204 su, len(header) / 2, header, filename, firstline))
205 with open(filename, "r") as f:
206 text = f.read()
207 text = "\t".join(header) + "\n" + text
208 logFunction("add_header_if_not_present: writing")
209 with open(filename, "w") as f:
210 f.write(text)
211 logFunction("add_header_if_not_present: complete")
212 else:
213 with open(filename, "r", encoding=encoding) as f:
214 firstline = f.readline().strip("\n\r ")
215 su = sum(map(lambda _: 1 if _ in header else 0, firstline.split("\t")))
216 if su < len(header) / 2.0:
217 logFunction("add_header_if_not_present: adding header({0}<{1}){2} to '{3}'\nfirstline:\n{4}".format(
218 su, len(header) / 2, header, filename, firstline))
219 with open(filename, "r", encoding=encoding) as f:
220 text = f.read()
221 text = "\t".join(header) + "\n" + text
222 logFunction("add_header_if_not_present: writing")
223 with open(filename, "w", encoding=encoding) as f:
224 f.write(text)
225 logFunction("add_header_if_not_present: complete")
227 @staticmethod
228 def random_split_file(filename, outfileprefix, nb, has_header=True, encoding=None, logFunction=noLOG):
229 """
230 split a file in nb buckets by random(lines are sent to a random file as they come)
232 @param filename filename to split
233 @param nb number of buckets
234 @param outfileprefix output files will start with outfileprefix + '%04d.txt' % i
235 @param encoding encoding
236 @param has_header the header will be replicated in each created file
237 @param logFunction to display information
238 @return list of created files
239 """
240 firstline = None
241 if has_header:
242 if encoding is None:
243 with open(filename, "r") as f:
244 firstline = f.readline().strip("\n\r ")
245 else: # pragma: no cover
246 with open(filename, "r", encoding=encoding) as f:
247 firstline = f.readline().strip("\n\r ")
249 logFunction("random_split_file: file %s has header %s" %
250 (filename, firstline))
252 logFunction("random_split_file: split %s in %d parts" % (filename, nb))
253 fileName = [outfileprefix + (".%04d.txt" % n) for n in range(0, nb)]
254 nbline = 0
256 if encoding is None:
257 filesP = [open(_, "w") for _ in fileName]
258 if firstline is not None:
259 for _ in filesP:
260 _.write(firstline + "\n")
261 with open(filename, "r") as f:
262 line = f.readline()
263 if firstline is not None:
264 line = f.readline()
265 while line is not None and len(line) > 0:
266 h = random.randint(0, nb - 1)
267 filesP[h].write(line)
268 line = f.readline()
269 nbline += 1
270 if nbline % 1000000 == 0:
271 logFunction(
272 "random_split_file: processed %d lines" % nbline)
273 else:
274 filesP = [open(_, "w", encoding=encoding) for _ in fileName]
275 if firstline is not None:
276 for _ in filesP:
277 _.write(firstline + "\n")
278 with open(filename, "r", encoding=encoding) as f:
279 line = f.readline()
280 if firstline is not None:
281 line = f.readline()
282 while line is not None and len(line) > 0:
283 h = random.randint(0, nb - 1)
284 filesP[h].write(line)
285 line = f.readline()
286 nbline += 1
287 if nbline % 1000000 == 0:
288 logFunction(
289 "random_split_file: processed %d lines" % nbline)
291 for _ in filesP:
292 _.close()
293 logFunction("random_split_file: end")
294 return fileName
296 @staticmethod
297 def ratio(x, y):
298 """
299 return a ratio between two real values or an empty string if the denominateur is null
300 @return a real of an empty string
301 """
302 return x * 1.0 / y if y != 0 else(0 if x == 0 else "")
304 @staticmethod
305 def bootstrap(values, function, nbdraws=-1, alpha=0.05):
306 """
307 return a confidence interval for a statistics
308 @param values values
309 @param function produces the statistics over a random set of observations chosen in values
310 @param nbdraws number of draws, if it is equal to -1, is equal to len(values)
311 @param alpha confidence level
312 @return average, min, lower bound, higher bound, max
313 """
314 stat = []
315 N = len(values) - 1
316 if nbdraws == - 1:
317 nbdraws = len(values)
318 for i in range(nbdraws):
319 randset = [values[random.randint(0, N)] for i in range(N + 1)]
320 s = function(randset)
321 stat.append(s)
322 stat.sort()
323 lv = len(stat)
324 alpha = alpha / 2
325 i1 = int(lv * alpha + 0.5)
326 i2 = int(lv * (1 - alpha) + 0.5)
327 i2 = min(i2, len(stat) - 1)
328 av = sum(stat) / len(stat)
329 return av, min(stat), stat[i1], stat[i2], max(stat)
331 @staticmethod
332 def correlation_bicolumn(values, deviations=False, noCenter=False):
333 """
334 assume values is a matrix with two columns
335 @param values 2 column matrix
336 @param deviations if True, returns cor, sigma1, sigma2
337 @param noCenter if True, do not remove the average before computing the covariance,
338 it means we assume variables are already centered
339 @return correlation factor or correlation, sigma1, sigma2 if deviations is True
340 """
341 if len(values) <= 1:
342 raise ValueError(
343 "expecting more than one observation, not %d" % len(values))
345 mx = 0.
346 my = 0.
347 vx = 0.
348 vy = 0.
349 co = 0.
350 nb = 0.
351 for a, b in values:
352 nb += 1
353 mx += a
354 my += b
355 vx += a ** 2
356 vy += b ** 2
357 co += a * b
358 mx /= nb
359 my /= nb
360 vx /= nb
361 vy /= nb
362 co /= nb
363 if not noCenter:
364 vx -= mx ** 2
365 vy -= my ** 2
366 co -= mx * my
367 vx = vx ** 0.5
368 vy = vy ** 0.5
369 v = vx * vy
370 if v != 0:
371 co /= v
372 if deviations:
373 return co, vx, vy # pragma: no cover
374 return co
376 def _private_getclass(self):
377 """
378 the class often creates another class of the same type,
379 this function returns the class object
380 """
381 return self.__class__
    def __init__(self, file, numeric_column=None, sep="\t", encoding=None,
                 read_n_lines=-1, sheet=0, **options):
        """
        It can either take a filename, an object TableFormula,
        a list of columns and values.

        :param file: filename or a list of column names or a dictionary,
            file can also be a `pandas DataFrame
            <http://pandas.pydata.org/pandas-docs/dev/dsintro.html#dataframe>`_.
        :param numeric_column: depends on file types(see below examples)
        :param sep: column separator if file is a filename
        :param encoding: file encoding used when *file* is a filename
        :param read_n_lines: read the first n lines(or all if it is -1)
        :param sheet: in case the file is an Excel file, this parameter precises the sheet number or name
        :param options: ``suffix_nb``: if True, adds an integer to the column name if it is a duplicate

        Example:

        ::

            table = TableFormula("name d_a d_b d_c#A 1 2 3#A 1.1 2.1 3.1#B 3 4 5".replace(" ", "\\t").replace("#","\\n"))

        or

        ::

            table = TableFormula("file.txt", ["nb"])

        or

        ::

            table = TableFormula(["date", "Y", "Y2", "xl"], values)

        or

        ::

            data = [{ "one":1, "two":2 }, {"two":2.1, "three":3 }]
            table = TableFormula(data)

        or

        ::

            data = { 1:{ "one":2.3, "two":2.2 }, 2:{"one":2.1, "two":3 } }
            table = TableFormula("__byrow__", data)

        or

        ::

            table = TableFormula(numpy.matrix(...))

        or

        ::

            table = TableFormula(numpy.array(...))

        @warning In the case a list of column names and values is given,
            rows and header are not copied.
        """
        if numeric_column is None:
            numeric_column = []
        if isinstance(file, str):
            if os.path.exists(file):
                # a filename: parse the file content
                self._read_file(file, numeric_column, sep,
                                encoding, read_n_lines, sheet=sheet)
            elif file == "__byrow__" and isinstance(numeric_column, dict):
                # numeric_column holds { row index: { column: value } }
                self._fill_by_row(numeric_column)
            else:
                # maybe the raw content of a file was given directly
                lines = file.split("\n")
                if len(lines) == 1:
                    raise FileNotFoundError(  # pragma: no cover
                        "A file was probably expected but was not found: '{}'."
                        "".format(file))
                self._readlines(lines, numeric_column, sep)

        elif isinstance(file, list):
            if len(file) == 0:
                raise ValueError(  # pragma: no cover
                    "Empty data and columns are not allowed.")

            if isinstance(file[0], dict):
                # a list of rows given as dictionaries: collect every key
                self.index = {}
                self.values = []
                for row in file:
                    for k, v in row.items():
                        if k not in self.index:
                            self.index[k] = len(self.index)

                # we sort the labels to avoid instabilities
                labels = [k for k, v in self.index.items()]
                labels.sort()
                self.index = {}
                for la in labels:
                    self.index[la] = len(self.index)

                # rebuild every row with one slot per known column
                for row in file:
                    line = [None for k in self.index]
                    for k, v in row.items():
                        line[self.index[k]] = v
                    self.values.append(line)

                self.header = [None for k in self.index]
                for k, v in self.index.items():
                    self.header[v] = k

                # pad shorter rows with None
                n = len(self.index)
                for row in self.values:
                    while len(row) < n:
                        row.append(None)

            elif isinstance(numeric_column, numpy.matrix):
                # file is the header, numeric_column holds the data
                self.header = file
                self.index = {}
                for i, h in enumerate(self.header):
                    self.index[h] = i
                self.values = [[float(numeric_column[i, j]) for j in range(
                    numeric_column.shape[1])] for i in range(numeric_column.shape[0])]
            elif isinstance(numeric_column, numpy.ndarray):
                # same as above with an array instead of a matrix
                self.header = file
                self.index = {}
                for i, h in enumerate(self.header):
                    self.index[h] = i
                self.values = [[float(numeric_column[i, j]) for j in range(
                    numeric_column.shape[1])] for i in range(numeric_column.shape[0])]
            elif isinstance(file[0], list):
                # file is [header] or [header, row, row, ...]
                if len(file) == 1:
                    self.header = file[0]
                    self.values = file[1:] + numeric_column
                    self.index = {}
                    for i, h in enumerate(self.header):
                        self.index[h] = i
                else:
                    self.header = file[0]
                    self.values = file[1:]
                    self.index = {}
                    for i, h in enumerate(self.header):
                        self.index[h] = i
            elif isinstance(file[0], str):
                # file is the header, numeric_column the rows
                # (rows and header are NOT copied, see the warning above)
                self.header = file
                self.values = numeric_column
                self.index = {}
                for i, h in enumerate(self.header):
                    self.index[h] = i
            else:
                raise RuntimeError(  # pragma: no cover
                    "This case should not happen: " + str(type(file[0])))

        elif isinstance(file, numpy.matrix):  # pragma: no cover
            # generate column names c0, c1, ...
            self.header = ["c%d" % d for d in range(file.shape[1])]
            self.index = {}
            for i, h in enumerate(self.header):
                self.index[h] = i
            self.values = [[float(file[i, j]) for j in range(
                file.shape[1])] for i in range(file.shape[0])]

        elif isinstance(file, numpy.ndarray):
            # generate column names c0, c1, ...
            self.header = ["c%d" % d for d in range(file.shape[1])]
            self.index = {}
            for i, h in enumerate(self.header):
                self.index[h] = i
            self.values = [[float(file[i, j]) for j in range(
                file.shape[1])] for i in range(file.shape[0])]

        else:
            if isinstance(file, pandas.DataFrame):
                def convert(x):
                    # NaN floats become None
                    return None if isinstance(x, float) and numpy.isnan(x) else x
                df = file
                self.header = [_ for _ in df.columns]
                # pick a column name for the DataFrame index which does not clash
                hi = 'index'
                while hi in self.header:
                    hi += "_"
                self.header.insert(0, hi)
                self.values = []
                for i, row in enumerate(df.values):
                    row = [df.index[i]] + [convert(x) for x in row]
                    self.values.append(row)

                self.index = {}
                for i, h in enumerate(self.header):
                    self.index[h] = i
            else:
                raise TypeError(  # pragma: no cover
                    "File has an unexpected type: " + str(type(file)))

        # detect duplicated column names: rename them when suffix_nb is set,
        # fail otherwise
        unique = {}
        for i, c in enumerate(self.header):
            if c in unique:
                if options.get("suffix_nb", False):
                    c = "%s_%d" % (c, i)
                    self.header[i] = c
                else:
                    raise KeyError(  # pragma: no cover
                        "column '{0}' already exists in '{1}'".format(c, self.header))
            unique[c] = True
582 def __add__(self, other):
583 """
584 do an addition, add values if types are matching
585 :param other: matrix or float or string
586 :return: new matrix, keep the header of the first matrix
587 """
588 if len(self) != len(other):
589 raise ValueError( # pragma: no cover
590 "both matrices should have the same number of rows")
591 if len(self.header) != len(other.header):
592 raise ValueError( # pragma: no cover
593 "both matrices should have the same number of columns")
594 values = []
595 for row, rowo in zip(self.values, other.values):
596 r = []
597 for a, b in zip(row, rowo):
598 if type(a) == type(b):
599 x = a + b
600 else:
601 x = None
602 r.append(x)
603 values.append(r)
604 return self._private_getclass()(self.header, values)
606 def __mul__(self, other):
607 """
608 do a multiplication(by a number)
609 :param other: matrix or float or string
610 :return: new matrix, keep the header of the first matrix
611 """
612 if not isinstance(other, float) and not isinstance(other, int):
613 raise TypeError( # pragma: no cover
614 "other should be a number")
615 values = []
616 for row in self.values:
617 r = []
618 for a in row:
619 if a is not None:
620 x = a * other
621 else:
622 x = None
623 r.append(x)
624 values.append(r)
625 return self._private_getclass()(self.header, values)
627 def multiplication_term_term(self, other):
628 """
629 do a multiplication term by term(similar to an addition),
630 add values if types are matching
632 :param other: matrix or float or string
633 :return: new matrix, keep the header of the first matrix
634 """
635 if len(self) != len(other):
636 raise ValueError( # pragma: no cover
637 "both matrices should have the same number of rows")
638 if len(self.header) != len(other.header):
639 raise ValueError( # pragma: no cover
640 "both matrices should have the same number of columns")
641 values = []
642 for row, rowo in zip(self.values, other.values):
643 r = []
644 for a, b in zip(row, rowo):
645 if type(a) == type(b) and not isinstance(a, str):
646 x = a * b
647 else:
648 x = None
649 r.append(x)
650 values.append(r)
651 return self._private_getclass()(self.header, values)
653 def replicate(self, times):
654 """replicates all rows a given number of times
655 :param times: number of multiplication
656 :return: new matrix, keep the header of the first matrix
657 """
658 values = []
659 for i in range(0, times):
660 values.extend(copy.copy(self.values))
661 return self._private_getclass()(self.header, values)
663 @property
664 def size(self):
665 """
666 returns the size(nb rows, nb columns)
667 """
668 return len(self), len(self.header)
670 @property
671 def shape(self):
672 """
673 returns the size(nb rows, nb columns)
674 """
675 return self.size
677 def _fill_by_row(self, values):
678 """
679 fill the table
680 :param values: dictionary { <int_row_index>: { <column name>: value} }
681 """
682 mx = max(values.keys()) + 1
683 self.index = {}
684 self.header = []
685 for k, v in values.items():
686 for col in v:
687 if col not in self.index:
688 self.index[col] = len(self.index)
689 self.header.append(col)
690 self.values = [[None for h in self.header] for k in range(mx)]
691 for k, v in values.items():
692 for col, to in v.items():
693 self.values[k][self.index[col]] = to
695 def __getitem__(self, irow):
696 """
697 operator [], accepts slices
698 :param irow: integer, tuple, slice or list
699 :return: depends on irow
700 - int --> a table with one row
701 - slice --> a table with several rows
702 - list --> a table with the selected rows
703 - tuple --> a value
704 """
705 if isinstance(irow, int):
706 return self._private_getclass()(
707 self.header, [self.values[irow]])
708 if isinstance(irow, slice):
709 return self._private_getclass()(
710 self.header, [self.values[ii] for ii in range(*irow.indices(len(self)))])
711 if isinstance(irow, list):
712 return self._private_getclass()(
713 self.header, [self.values[ii] for ii in irow])
714 if isinstance(irow, tuple):
715 if isinstance(irow[1], str):
716 row = self.values[irow[0]]
717 v = self._interpret_row(row)
718 return v[irow[1]]
719 return self.values[irow[0]][irow[1]]
720 raise TypeError("Invalid argument type: " + str(type(irow)))
722 def __setitem__(self, irow, value):
723 """
724 operator [], just accepts tuple(to change a value)
725 :param irow: 2-uple
726 :param value: new value
727 """
728 if isinstance(irow, tuple):
729 if isinstance(irow[1], str):
730 row = self.values[irow[0]]
731 v = self._interpret_row(row)
732 v[irow[1]] = value
733 else:
734 self.values[irow[0]][irow[1]] = value
735 else:
736 raise TypeError( # pragma: no cover
737 "Invalid argument type(only tuple accepted): " + str(type(irow)))
    def __len__(self):
        """
        Returns the number of rows.

        :return: int
        """
        return len(self.values)
745 def __copy__(self):
746 """
747 operator copy
748 """
749 return self._private_getclass()(self.header, self.values)
751 def __deepcopy__(self, memo):
752 """
753 operator ``deepcopy``
754 """
755 return self._private_getclass()(copy.deepcopy(self.header, memo), copy.deepcopy(self.values, memo))
757 def copy(self):
758 """
759 call ``copy.deepcopy(self)``
760 """
761 return copy.deepcopy(self)
763 def delta(self, other):
764 """
765 returns a list of differences between self and others
767 :param other: TableFormula
768 :return: list of differences(first one)
769 """
770 if other is None:
771 return False
772 if not isinstance(other, TableFormula):
773 raise TypeError("other is not a table: " + str(type(other)))
774 if len(self.header) != len(other.header):
775 return ["different number of columns"]
776 for a, b in zip(self.header, other.header):
777 if a != b:
778 return ["different columns"]
779 if len(self.values) != len(other.values):
780 return ["different number of rows"]
781 line = 0
782 for r, s in zip(self.values, other.values):
783 if len(r) != len(s):
784 return ["different number of values on row %d" % line]
785 col = 0
786 for a, b in zip(r, s):
787 if a != b:
788 return ["different value on cell %d,%d: %s!=%s(type %s, %s)" % (line, col, a, b, str(type(a)), str(type(b)))]
789 col += 1
790 line += 1
791 return []
793 def __eq__(self, other):
794 """
795 check if two tables are equal by value
796 :param other: other table
797 :return: boolean
798 """
799 if other is None:
800 return False
801 if not isinstance(other, TableFormula):
802 return False
803 if len(self.header) != len(other.header):
804 return False
805 for a, b in zip(self.header, other.header):
806 if a != b:
807 return False
808 if len(self.values) != len(other.values):
809 return False
810 for r, s in zip(self.values, other.values):
811 if len(r) != len(s):
812 return False
813 for a, b in zip(r, s):
814 if a != b:
815 return False
816 return True
818 def __str__(self):
819 """
820 convert the table into a string
821 :return: string
822 """
823 rows = ["\t".join(self.header)]
824 for row in self.values:
825 s = "\t".join([str(_) for _ in row])
826 rows.append(s)
827 return "\n".join(rows)
829 def __html__(self, class_table=None, class_td=None, class_tr=None, class_th=None):
830 """
831 Converts the table into a :epkg:`html` string.
833 :param class_table: adds a class to the tag ``table`` (None for none)
834 :param class_td: adds a class to the tag ``td`` (None for none)
835 :param class_tr: adds a class to the tag ``tr`` (None for none)
836 :param class_th: adds a class to the tag ``th`` (None for none)
837 """
838 clta = ' class="%s"' % class_table if class_table is not None else ""
839 cltr = ' class="%s"' % class_tr if class_tr is not None else ""
840 cltd = ' class="%s"' % class_td if class_td is not None else ""
841 clth = ' class="%s"' % class_th if class_th is not None else ""
843 rows = ["<table%s>" % clta]
844 rows.append("{0}{1}{2}".format(("<tr%s><th%s>" % (cltr, clth)),
845 ("</th><th%s>" % clth).join(self.header), "</th></tr>"))
846 septd = "</td><td%s>" % cltd
847 strtd = "<tr%s><td%s>" % (cltr, cltd)
848 for row in self.values:
849 s = septd.join([str(_) for _ in row])
850 rows.append(strtd + s + "</td></tr>")
851 rows.append("</table>")
852 rows.append("")
853 return "\n".join(rows)
    def __rst__(self, add_line=True):
        """
        Converts the table into reStructuredText format.

        ::

            +------------------------+------------+----------+----------+
            | Header row, column 1   | Header 2   | Header 3 | Header 4 |
            | (header rows optional) |            |          |          |
            +========================+============+==========+==========+
            | body row 1, column 1   | column 2   | column 3 | column 4 |
            +------------------------+------------+----------+----------+
            | body row 2             | ...        | ...      |          |
            +------------------------+------------+----------+----------+

        :param add_line: add a line separator between each row
        """
        # every value is converted into a string first
        tbl = self.values_to_str()
        # compute the widest cell of each column, header included
        length = [len(_) for _ in tbl.header]
        for row in tbl.values:
            for i, v in enumerate(row):
                length[i] = max(length[i], len(v))
        length = [_ + 2 for _ in length]  # one space of padding on each side
        line = ["-" * le for le in length]
        lineb = ["=" * le for le in length]
        sline = "+%s+" % ("+".join(line))
        slineb = "+%s+" % ("+".join(lineb))
        res = [sline]

        def complete(cool):
            # pads the value up to the column width (minus the 2 padding spaces)
            s, i = cool
            i -= 2
            if len(s) < i:
                s += " " * (i - len(s))
            return s

        res.append("| %s |" % " | ".join(
            map(complete, zip(tbl.header, length))))
        res.append(slineb)
        res.extend(["| %s |" % " | ".join(map(complete, zip(row, length)))
                    for row in tbl.values])
        if add_line:
            # insert a separator between every pair of body rows,
            # walking backward so earlier indices stay valid
            t = len(res)
            for i in range(t - 1, 3, -1):
                res.insert(i, sline)
        res.append(sline)
        return "\n".join(res) + "\n"
903 def strtype(self):
904 """
905 displays the type of values(not the values)
906 """
907 rows = ["\t".join(self.header)]
908 for row in self.values:
909 s = "\t".join([str(type(_)) for _ in row])
910 rows.append(s)
911 return "\n".join(rows)
913 def _read_file(self, file, numeric_column, sep, encoding, read_n_lines, sheet=0):
914 """
915 private
916 """
917 ext = os.path.splitext(file)[-1].lower()
918 if ext in [".xls", ".xlsx"]:
919 lines = list(open_workbook(file, sheet=sheet))
920 # removing empty column(assuming first row is the header)
921 ind = [i for i, n in enumerate(lines[0]) if len(n) > 0]
922 if len(ind) < len(lines[0]):
923 lines = [[line[i] for i in ind] for line in lines]
924 else:
925 if sys.version_info.major >= 3 or encoding is None:
926 if encoding is None:
927 f = open(file, "r")
928 else:
929 f = open(file, "r", encoding=encoding)
930 else:
931 f = open(file, "r", encoding=encoding)
933 if read_n_lines > 0:
934 lines = []
935 for line in f:
936 if len(lines) >= read_n_lines:
937 break
938 lines.append(line)
939 else:
940 lines = f.readlines()
941 f.close()
942 self._readlines(lines, numeric_column, sep)
944 def change_header(self, new_header):
945 """
946 change the column names
948 :param new_header: a list or a function which modifies the header
950 Example:
952 ::
954 tbl.change_header(lambda h: h if h != "column" else "new_name")
956 .. warning:: Do not do that yourself, the class holds a dictionary up to date with the column index.
957 """
958 if isinstance(new_header, list):
959 self.header = new_header
960 self.index = {}
961 for i, h in enumerate(self.header):
962 self.index[h] = i
963 else:
964 he = [new_header(h) for h in self.header]
965 self.change_header(he)
967 def rename_column(self, old_name, new_name):
968 """
969 rename a column
971 :param old_name: old name
972 :param new_name: new name
973 """
974 header = [{old_name: new_name}.get(_, _) for _ in self.header]
975 self.change_header(header)
977 def save(self, filename, sep="\t", encoding=None, newline="\n"):
978 """
979 saves the tables in a text file, first row is the column names
981 :param filename: filename
982 :param sep: column separator
983 :param encoding: encoding
984 :param newline: line separator
985 """
986 if sys.version_info.major >= 3 or encoding is None:
987 if encoding is None:
988 f = open(filename, "w", newline=newline)
989 else:
990 f = open(filename, "w", encoding=encoding, newline=newline)
991 else:
992 f = open(filename, "w", encoding=encoding)
994 f.write(sep.join(self.header))
995 f.write("\n")
996 for row in self.values:
997 f.write(sep.join([str(_) for _ in row]))
998 f.write("\n")
999 f.close()
1001 def _readlines(self, lines, numeric_column, sep):
1002 """private"""
1003 if isinstance(lines[0], str):
1004 lines = [_.replace("\ufeff", "").replace("\xef\xbb\xbf", "")
1005 .strip("\n\r ").split(sep) for _ in lines if len(_) > 0]
1006 self.header = lines[0]
1007 self.values = lines[1:]
1008 self.index = {}
1009 for i, h in enumerate(self.header):
1010 self.index[h] = i
1011 elif isinstance(lines[0], list):
1012 self.header = lines[0]
1013 self.values = lines[1:]
1014 self.index = {}
1015 for i, h in enumerate(self.header):
1016 self.index[h] = i
1017 else:
1018 raise Exception("unexpected format: " + str(type(lines[0])))
1020 self._auto_conversion(numeric_column)
1022 def _auto_conversion(self, others_columns):
1023 """
1024 private
1025 set up the column type based on the column name
1026 """
1027 def condition(k):
1028 if k.startswith("sum_") or k.startswith("pos_"):
1029 return True
1030 if k.startswith("avg_") or k.startswith("len_"):
1031 return True
1032 if k.startswith("nb_") or k.startswith("max_") or k.startswith("min_"):
1033 return True
1034 if k.startswith("d_") or k in others_columns:
1035 return True
1036 if k in ["Open", "High", "Low", "Close", "Volume", "Adj Close"]:
1037 return True
1038 if k in ["distance", "nb"]:
1039 return True
1040 return False
1042 for i, k in enumerate(self.header):
1043 if k == "Date":
1044 for row in self.values:
1045 if isinstance(row[i], str):
1046 row[i] = datetime.datetime.strptime(row[i], '%Y-%m-%d')
1047 elif isinstance(row[i], float):
1048 row[i] = datetime.datetime.utcfromtimestamp(row[i])
1049 else:
1050 raise Exception(
1051 "unable to extract a date from type {0}".format(type(row[i])))
1052 elif condition(k):
1053 for row in self.values:
1054 row[i] = float(row[i])
1055 else:
1056 for row in self.values:
1057 if isinstance(row[i], str) and row[i] == "None":
1058 row[i] = None
1060 def get_column_values(self, col):
1061 """
1062 private
1063 returns all values for one column
1064 """
1065 i = self.index[col]
1066 return [row[i] for row in self.values]
1068 def get_distinct_values(self, col):
1069 """private"""
1070 row = self.get_column_values(col)
1071 dis = {}
1072 for r in row:
1073 dis[r] = dis.get(r, 0) + 1
1074 return dis
1076 def _interpret_row(self, row):
1077 """
1078 private
1079 returns each row as a dictionary { column_name:value }
1080 """
1081 values = {}
1082 for a, b in zip(self.header, row):
1083 values[a] = b
1084 return values
1086 def __iter__(self):
1087 """
1088 iterator on all rows, it returns a dictionary { column:value }
1089 @return dictionary
1090 """
1091 for row in self.values:
1092 yield self._interpret_row(row)
    def add_column(self, colname, function, position=-1):
        """
        Adds one or several columns, computed row by row.

        :param colname: column name, or a list/tuple of column names
        :param function: function computing the value from a row dictionary
            (or a list of functions, one per column, or a single function
            returning a tuple of values)
        :param position: where to insert the column, -1 for the end
        :return: self

        Example:

        ::

            table.add_column("has_A", lambda v: 1 if "A" in v["name"] else 0, 0)

            table.add_column(("has_A", "has_B"),(lambda v: 1 if "A" in v["name"] else 0,
                            lambda v: 1 if "B" in v["name"] else 0))

            table.add_column(("has_A", "has_B"),(lambda v:(1 if "A" in v["name"] else 0, 1 if "B" in v["name"] else 0))
        """
        if isinstance(colname, str):
            # case 1: a single column, *function* returns one value per row
            if position == -1:
                # append at the end: the index can be extended in place
                self.index[colname] = len(self.index)
                for row in self.values:
                    v = self._interpret_row(row)
                    x = function(v)
                    row.append(x)
                self.header.append(colname)
            else:
                # insertion shifts every following column: rebuild the index
                for row in self.values:
                    v = self._interpret_row(row)
                    x = function(v)
                    row.insert(position, x)
                self.header.insert(position, colname)
                self.index = {v: i for i, v in enumerate(self.header)}

        elif isinstance(function, list):
            # case 2: several columns, one function per column —
            # delegate to case 1 column by column
            if len(colname) != len(function):
                raise ValueError(  # pragma: no cover
                    "unable to continue, colname and function do not have the same number of elements")
            if position == -1:
                position = [-1] * len(colname)
            elif isinstance(position, int):
                position = [position] * len(colname)
            else:
                if len(position) != len(colname):
                    raise RuntimeError(  # pragma: no cover
                        "Unable to continue, colname and position do not "
                        "have the same number of elements.")
            # dec compensates for the shift introduced by previous insertions
            # NOTE(review): with the default position -1, the second call
            # receives -1 + 1 == 0 and inserts at the front rather than
            # appending — confirm whether this is intended
            dec = 0
            for a, b, c in zip(colname, function, position):
                self.add_column(a, b, c + dec)
                dec += 1

        else:
            # case 3: several columns, one function returning a tuple of values
            if not isinstance(position, int):
                raise TypeError(  # pragma: no cover
                    "Int expected for position for this case.")

            if position == -1:
                for row in self.values:
                    v = self._interpret_row(row)
                    x = function(v)
                    row.extend(x)
                self.header.extend(colname)

            else:
                for row in self.values:
                    v = self._interpret_row(row)
                    x = function(v)
                    for i, _ in enumerate(x):
                        row.insert(position + i, _)
                for i, c in enumerate(colname):
                    self.header.insert(position + i, c)
            # header changed in both sub-branches: rebuild the index
            self.index = {v: i for i, v in enumerate(self.header)}
        return self
1170 def add_column_index(self, colname="index", start=0):
1171 """
1172 Example:
1174 ::
1176 table.add_column("index_row")
1177 """
1178 self.index[colname] = len(self.index)
1179 for i, row in enumerate(self.values):
1180 row.append(i + start)
1181 self.header.append(colname)
1182 return self
1184 def addc(self, colname, function, position=-1):
1185 """
1186 @see me add_column
1187 """
1188 return self.add_column(colname, function, position)
1190 def add_column_recursive(self, colname, functionValue, functionAgg):
1191 """
1192 Example:
1194 ::
1196 table.add_column_recursive(lambda v: v ["norm_%s" % loi],
1197 lambda li, v: li[-1] + v)
1198 """
1199 self.index[colname] = len(self.index)
1200 values = []
1201 for row in self.values:
1202 v = self._interpret_row(row)
1203 x = functionValue(v)
1204 y = functionAgg(values, x)
1205 row.append(y)
1206 values.append(y)
1207 self.header.append(colname)
1208 return self
1210 def add_column_recursive_row(self, colname, functionAgg):
1211 """
1212 Example:
1214 ::
1216 table.add_column_recursive_row("w_%s" % loi,
1217 lambda li, v: li[-1] + v ["norm_%s" % loi] \
1218 if len(li)> 0 else v ["norm_%s" % loi])
1219 """
1220 self.index[colname] = len(self.index)
1221 values = []
1222 for row in self.values:
1223 v = self._interpret_row(row)
1224 y = functionAgg(values, v)
1225 row.append(y)
1226 values.append(y)
1227 self.header.append(colname)
1228 return self
1230 def add_column_vector(self, colname, vector):
1231 """
1232 add a column defined by vector(list of values for each row)
1234 :param colname: column to add
1235 :param vector: (list) list of values to add to each row
1236 :return self
1237 """
1238 if len(vector) != len(self):
1239 raise ValueError("vector and table have different length {0} != {1}".format(
1240 len(vector), len(self)))
1241 for vec, row in zip(vector, self.values):
1242 row.append(vec)
1243 self.index[colname] = len(self.index)
1244 self.header.append(colname)
1245 return self
    def add_column_smooth(self, colname, function, position, weights):
        """
        Adds a column computed as a weighted moving average of *function*
        over neighbouring rows: for row ``p`` the neighbours are
        ``p + position[j]`` weighted by ``weights[j]``.

        Example:

        ::

            x = 1./3
            table.add_column_smooth("has_A_smooth", lambda v: v["has_A"], [-1,0,1], [x,x,x])
        """
        if len(position) != len(weights):
            raise ValueError("position and weights must have the same length")
        self.index[colname] = len(self.index)
        # pre-compute the smoothed signal once for every row
        column = [function(self._interpret_row(row)) for row in self.values]
        tw = sum(weights)
        couple = list(zip(position, weights))
        for p, row in enumerate(self.values):
            sx = 0.   # weighted sum of the neighbours inside the table
            sw = 0.   # weight actually used
            ms = 0    # number of neighbours falling outside the table
            for i, w in couple:
                pi = p + i
                if 0 <= pi < len(self):
                    sx += column[pi] * w
                    sw += w
                else:
                    ms += 1

            if ms == 0:
                # every neighbour exists: use the plain weighted sum
                row.append(sx)
            elif sw != 0:
                # near a boundary: rescale by tw/sw to compensate for
                # the missing neighbours
                row.append(sx * tw / sw)
            else:
                row.append(sx)
        self.header.append(colname)
        return self
1283 def aggregate_column(self, colname, aggregated_function=sum):
1284 """
1285 Example:
1287 ::
1289 total = table.aggregate_column("d_c", sum)
1290 """
1291 def function(v):
1292 return v[colname]
1293 return self.aggregate(function, aggregated_function)
1295 def aggregate(self, function, aggregated_function=sum):
1296 """
1297 Example:
1299 ::
1301 total = table.aggregate_column(lambda v: v["d_c"], len)
1302 """
1303 return aggregated_function([function(self._interpret_row(row)) for row in self.values])
1305 def where(self, condition_function):
1306 """
1307 @see me filter
1308 """
1309 return self.filter(condition_function)
1311 def filter(self, condition_function):
1312 """
1313 Example:
1315 ::
1317 fil = table.filter(lambda v: v["d_b"] == 2)
1319 @warning Rows are not copied.
1320 """
1321 newv = []
1322 for row in self.values:
1323 v = self._interpret_row(row)
1324 x = condition_function(v)
1325 if x:
1326 newv.append(row)
1327 final = self._private_getclass()(self.header, newv)
1328 return final
    def groupby_implicit(self, functionKey, functionWeight=None, logging=None):
        """
        use prefix of a column name to know which function to use
        as an aggregated(sum, avg, len, key, none, max, min)
        Example:

        ::

            group = table.groupby_implicit(lambda v: v["name"])
        """
        def identical(col, v):
            # value extractor for a single column
            return v[col]

        def first(vec):
            # aggregation keeping the first value of the group
            return vec[0]

        def avg(vec):
            # NOTE(review): TableFormula.ratio is defined elsewhere in the
            # file — presumably a safe division, confirm its semantics
            return TableFormula.ratio(sum(vec), len(vec))

        functions = []        # per-column value extractors
        labels = ["key"]      # result column names, the key comes first
        functionsAgg = []     # per-column aggregation functions
        for col in self.header:
            if col.startswith("key"):
                # a "key" column is kept only when each group holds a single
                # distinct value for it; dd maps group key -> distinct values
                values = self.select(
                    lambda v, col=col: (v[col], functionKey(v)))
                dd = {}
                for v in values:
                    if v[1] not in dd:
                        dd[v[1]] = {}
                    dd[v[1]][v[0]] = 1
                for k in dd:
                    dd[k] = len(dd[k])
                keep = []
                for k, v in dd.items():
                    if v > 1:
                        keep.append((k, v))

                if len(keep) == 0:
                    # value is constant inside each group: keep the first one
                    functions.append(lambda v, col=col: identical(col, v))
                    labels.append(col)
                    functionsAgg.append(first)
                elif logging is not None:
                    # ambiguous column: dropped, log up to 10 offending keys
                    end = min(len(keep), 10)
                    mes = ",".join([str(_) for _ in keep[:end]])
                    logging("removing column '{0}' no unique value: {1}: {2}".format(
                        col, len(dd), mes))
            elif col.startswith("sum"):
                functions.append(lambda v, col=col: identical(col, v))
                labels.append(col)
                functionsAgg.append(sum)
            elif col.startswith("len"):
                # counts rows: the extractor returns the constant 1
                functions.append(lambda v, col=col: 1)
                labels.append(col)
                functionsAgg.append(len)
            elif col.startswith("min"):
                # NOTE(review): the extractor returns 1 instead of the column
                # value, so min/max aggregate constants — looks like a
                # copy-paste from the "len" branch, confirm intent
                functions.append(lambda v, col=col: 1)
                labels.append(col)
                functionsAgg.append(min)
            elif col.startswith("max"):
                functions.append(lambda v, col=col: 1)
                labels.append(col)
                functionsAgg.append(max)
            elif col.startswith("avg"):
                functions.append(lambda v, col=col: identical(col, v))
                labels.append(col)
                functionsAgg.append(avg)
            elif col.startswith("none"):
                # explicitly ignored column
                pass
            else:
                raise RuntimeError("unable to aggregate column " + col)

        return self.groupby(functionKey, functions, labels, functionsAgg, functionWeight)
1404 def groupby(self, functionKey, functionsValue, columns=None, functionsAgg=None, functionWeight=None):
1405 """
1406 Example:
1408 ::
1410 group = table.groupby(lambda v: v["name"],
1411 [lambda v: v["d_a"],
1412 lambda v: v["d_b"]],
1413 ["name", "sum_d_a", "sum_d_b"])
1415 or
1417 ::
1419 groupmax = table.groupby(lambda v: v["name"],
1420 [lambda v: v["d_a"],
1421 lambda v: v["d_b"]],
1422 ["name", "max_d_a", "max_d_b"],
1423 [max, max])
1424 """
1425 if not isinstance(functionsValue, list):
1426 functionsValue = [functionsValue]
1427 if functionsAgg is None:
1428 functionsAgg = [sum for f in functionsValue]
1429 if functionWeight is None:
1430 if columns is not None and len(columns) != len(functionsValue) + 1:
1431 raise Exception("columns should have %d names not(%d)" % (
1432 len(functionsValue) + 1, len(columns)))
1433 else:
1434 if columns is not None and len(columns) != len(functionsValue) + 2:
1435 raise Exception("columns should have %d names not(%d)" % (
1436 len(functionsValue) + 2, len(columns)))
1437 if columns is not None and not isinstance(columns[0], str):
1438 raise TypeError("expecting type str not %s in columns" %
1439 (str(type(columns[0]))))
1441 hist = {}
1442 if functionWeight is not None:
1443 histWeight = {}
1445 for row in self.values:
1446 v = self._interpret_row(row)
1447 key = functionKey(v)
1448 w = 1. if functionWeight is None else functionWeight(v)
1450 if key not in hist:
1451 histWeight[key] = [w]
1452 hist[key] = [[f(v) * w] for f in functionsValue]
1453 else:
1454 histWeight[key].append(w)
1455 h = hist[key]
1456 for i, f in enumerate(functionsValue):
1457 h[i].append(f(v) * w)
1459 for key in hist: # pylint: disable=C0206
1460 h = hist[key]
1461 w = sum(histWeight[key])
1462 for i in range(0, len(h)):
1463 h[i] = functionsAgg[i](h[i], w)
1465 f = hist.items if sys.version_info.major >= 3 else hist.items
1466 histValues = [[k, sum(histWeight[k])] + v for k, v in f()]
1468 if columns is None:
1469 columns = ["key", "weight"] + ["val%d" %
1470 i for i, f in enumerate(functionsValue)]
1471 ret = self._private_getclass()(columns, histValues)
1472 return ret
1473 else:
1474 for row in self.values:
1475 v = self._interpret_row(row)
1476 key = functionKey(v)
1477 if key not in hist:
1478 hist[key] = [[f(v)] for f in functionsValue]
1479 else:
1480 h = hist[key]
1481 for i, f in enumerate(functionsValue):
1482 h[i].append(f(v))
1484 for key in hist: # pylint: disable=C0206
1485 h = hist[key]
1486 for i in range(0, len(h)):
1487 h[i] = functionsAgg[i](h[i])
1489 f = hist.items if sys.version_info.major >= 3 else hist.items
1490 histValues = [[k] + v for k, v in f()]
1492 if columns is None:
1493 columns = ["key"] + ["val%d" %
1494 i for i, f in enumerate(functionsValue)]
1495 ret = self._private_getclass()(columns, histValues)
1496 return ret
1498 def sort(self, functionValue, reverse=False):
1499 """
1500 Example:
1502 ::
1504 table.sort(lambda v: v["d_b"] + v["d_c"])
1505 """
1506 values = [(functionValue(self._interpret_row(row)), i)
1507 for i, row in enumerate(self.values)]
1508 values.sort(reverse=reverse)
1509 self.values = [self.values[_[1]] for _ in values]
1510 return self
1512 def extract_columns(self, listColumns):
1513 """
1514 extract some columns
1516 @param listColumns list of columns to remove or a function
1517 which returns True if the column has to be extracted
1518 based on its name
1519 @return table
1521 Example:
1523 ::
1525 ext = table.extract_columns(["name", "d_a"])
1526 """
1527 if isinstance(listColumns, list):
1528 indexes = [(self.index[col] if isinstance(col, str) else col)
1529 for col in listColumns]
1530 header = listColumns
1531 values = [[row[i] for i in indexes] for row in self.values]
1532 return self._private_getclass()(header, values)
1533 else:
1534 header = [_ for _ in self.header if listColumns(_)]
1535 return self.extract_columns(header)
1537 def remove_columns(self, listColumns):
1538 """
1539 remove some columns
1541 @param listColumns list of columns to remove or a function
1542 which returns True if the column has to be removed
1543 based on its name
1544 @return table
1546 Example:
1548 ::
1550 rem = table.remove("d_a")
1551 """
1552 if isinstance(listColumns, list):
1553 cols = [_ for i, _ in enumerate(
1554 self.header) if _ not in listColumns and i not in listColumns]
1555 return self.extract_columns(cols)
1556 if isinstance(listColumns, str):
1557 cols = [_ for _ in self.header if _ != listColumns]
1558 return self.extract_columns(cols)
1559 cols = [_ for _ in self.header if not listColumns(_)]
1560 return self.extract_columns(cols)
    def innerjoin(self, table, functionKey1, functionKey2, nameKey="key",
                  addSuffixAnyWay=False, prefixToAdd=None, full=False,
                  keepKey=True, putKeyInColumn=None, missingValue=None,
                  uniqueKey=False):
        """
        @param  table           other table to join with
        @param  functionKey1    key for the first table(a function)
        @param  functionKey2    key for the second table(a function) innerjoin .... ON ...
        @param  addSuffixAnyWay add a suffix to every column from the second table even
                                if names are different(suffix is "+")
        @param  prefixToAdd     prefix to add the the columns of the second table
        @param  full            add all items even if there is no common keys(``FULL OUTER JOIN``),
                                otherwise keep only common keys
        @param  keepKey         keep the key as a column in the result(column is key), otherwise not
        @param  putKeyInColumn  private parameter: keepKey has to be true and in this case,
                                put the key in this column(integer)
        @param  missingValue    when there is not key on one side, this default value will be put in place
        @param  uniqueKey       if True, the function assumes there is a bijection between rows
                                and keys(one row <--> one key) on both tables,
                                otherwise, it will not.
        @return                 a table

        Example:

        ::

            innerjoin = table.innerjoin(group, lambda v: v["name"],
                                        lambda v: v["name"], "group")
        """
        # rows of missing values used to pad the absent side in a full join
        defaultVal1 = [missingValue for k in self.header]
        defaultVal2 = [missingValue for k in table.header]

        if uniqueKey:
            # one row per key on each side: keys maps key -> (row1, row2)
            keys = {}
            for row in self.values:
                v = self._interpret_row(row)
                key = functionKey1(v)
                keys[key] = (row, None)

            for row in table.values:
                v = table._interpret_row(row)
                key = functionKey2(v)
                if key in keys:
                    keys[key] = (keys[key][0], row)
                elif full:
                    keys[key] = (None, row)

            if not full:
                # inner join: drop keys present on one side only
                d = []
                for k, v in keys.items():
                    if None in v:
                        d.append(k)
                for _ in d:
                    del keys[_]
            else:
                # full join: replace the missing side with the default row
                for k in keys:  # pylint: disable=C0206
                    v = keys[k]
                    if v[0] is None:
                        keys[k] = (defaultVal1, v[1])
                    elif v[1] is None:
                        keys[k] = (v[0], defaultVal2)

            if keepKey:
                # result header: key first, then both headers with duplicated
                # names disambiguated by "~" (left) or "+" (right)
                columns = [nameKey]
                for x in self.header:
                    while x in columns:
                        x += "~"
                    columns.append(x)

                for x in table.header:
                    if prefixToAdd is not None:
                        x = prefixToAdd + x
                    elif addSuffixAnyWay:
                        x += "+"
                    while x in columns:
                        x += "+"
                    columns.append(x)

                # NOTE(review): both branches of this conditional are
                # identical — a leftover from Python 2 support
                f = keys.items if sys.version_info.major >= 3 else keys.items
                values = [[k] + v[0] + v[1] for k, v in f() if len(v) == 2]
                return self._private_getclass()(columns, values)
            else:
                # same header construction without the key column
                columns = []
                for x in self.header:
                    while x in columns:
                        x += "~"
                    columns.append(x)

                for x in table.header:
                    if prefixToAdd is not None:
                        x = prefixToAdd + x
                    elif addSuffixAnyWay:
                        x += "+"
                    while x in columns:
                        x += "+"
                    columns.append(x)

                f = keys.items if sys.version_info.major >= 3 else keys.items

                if putKeyInColumn is None:
                    values = [v[0] + v[1] for k, v in f() if len(v) == 2]
                else:
                    # overwrite the requested column with the key
                    values = []
                    for k, v in f():
                        if len(v) == 2:
                            nr = v[0] + v[1]
                            nr[putKeyInColumn] = k
                            values.append(nr)

                return self._private_getclass()(columns, values)
        else:
            # general case: several rows may share a key on each side,
            # keys maps key -> (list of rows1, list of rows2)
            keys = {}
            for row in self.values:
                v = self._interpret_row(row)
                key = functionKey1(v)
                if key in keys:
                    keys[key][0].append(row)
                else:
                    keys[key] = ([row], None)

            for row in table.values:
                v = table._interpret_row(row)
                key = functionKey2(v)
                if key in keys:
                    if keys[key][1] is None:
                        keys[key] = (keys[key][0], [row])
                    else:
                        keys[key][1].append(row)
                elif full:
                    keys[key] = (None, [row])

            if not full:
                # inner join: drop keys present on one side only
                d = []
                for k, v in keys.items():
                    if None in v:
                        d.append(k)
                for _ in d:
                    del keys[_]
            else:
                # full join: pad the missing side with one default row
                for k in keys:  # pylint: disable=C0206
                    v = keys[k]
                    if v[0] is None:
                        keys[k] = ([defaultVal1], v[1])
                    elif v[1] is None:
                        keys[k] = (v[0], [defaultVal2])

            if keepKey:
                columns = [nameKey]
                for x in self.header:
                    while x in columns:
                        x += "~"
                    columns.append(x)

                for x in table.header:
                    if prefixToAdd is not None:
                        x = prefixToAdd + x
                    elif addSuffixAnyWay:
                        x += "+"
                    while x in columns:
                        x += "+"
                    columns.append(x)

                f = keys.items if sys.version_info.major >= 3 else keys.items

                # cartesian product of the matched rows for each key
                values = []
                for k, v in f():
                    if len(v) == 2:
                        for ka in v[0]:
                            for kb in v[1]:
                                values.append([k] + ka + kb)
                return self._private_getclass()(columns, values)
            else:
                columns = []
                for x in self.header:
                    while x in columns:
                        x += "~"
                    columns.append(x)

                for x in table.header:
                    if prefixToAdd is not None:
                        x = prefixToAdd + x
                    elif addSuffixAnyWay:
                        x += "+"
                    while x in columns:
                        x += "+"
                    columns.append(x)

                f = keys.items if sys.version_info.major >= 3 else keys.items

                if putKeyInColumn is None:
                    values = [v[0] + v[1] for k, v in f() if len(v) == 2]
                else:
                    values = []
                    for k, v in f():
                        if len(v) == 2:
                            for ka in v[0]:
                                for kb in v[1]:
                                    nr = ka + kb
                                    nr[putKeyInColumn] = k
                                    values.append(nr)

                return self._private_getclass()(columns, values)
1765 def filter_quantile(self, function, alpha_min=0.025, alpha_max=0.025):
1766 """
1767 sort all rows using criteria defined by function and remove
1768 rows at the extremes
1770 @param function values used to estimate the quantiles
1771 @param alpha_min lower quantile
1772 @param alpha_max higher quantile
1773 @return a table containing all the rows where the criterium
1774 is within the two quantiles
1776 Example:
1778 ::
1780 fil = table.filter_quantile(lambda v: v["d_b"], 0, 0.4)
1782 @warning Rows are not copied.
1783 """
1784 values = []
1785 for row in self.values:
1786 v = self._interpret_row(row)
1787 val = function(v)
1788 values.append((val, row))
1789 values.sort()
1790 lv = len(values)
1791 i1 = int(lv * alpha_min + 0.5)
1792 i2 = int(lv * (1 - alpha_max) + 0.5)
1793 i1 = max(i1, 0)
1794 i1 = min(i1, lv)
1795 i2 = max(i1, i2)
1796 i2 = min(i2, lv)
1797 if i2 == i1:
1798 raise RuntimeError("unable to extract quantile, the table is either "
1799 "empty or chosen quantile are not correct")
1800 values = [_[1] for _ in values[i1:i2]]
1801 return self._private_getclass()(self.header, values)
1803 def union(self, table):
1804 """
1805 @param table table
1806 @return table(with the same number of columns)
1808 concatenates two tables by rows, they must have the same header, rows of both tables are merged into a single matrix
1809 Example:
1811 ::
1813 union = table.union(table2)
1814 """
1815 if len(self.header) != len(table.header):
1816 raise ValueError("tables do not have the same number of columns\ntbl1: %s\ntbl2: %s" % (
1817 ",".join(self.header), ",".join(table.header)))
1818 for a, b in zip(self.header, table.header):
1819 if a != b:
1820 raise ValueError("tables do not have the same column names")
1821 return self._private_getclass()(self.header, self.values + table.values)
1823 def concatenate(self, table, addPrefix=""):
1824 """
1825 concatenates two tables by columns
1826 @param table table
1827 @param addPrefix add a prefix to each column from table
1828 @return table (with the same number of rows as the longest one)
1829 """
1830 maxr = max(len(self), len(table))
1831 header = self.header + [addPrefix + h for h in table.header]
1832 values = []
1833 for i in range(0, maxr):
1834 r1 = self.values[i] if i < len(self) else [None] * len(self.header)
1835 r2 = table.values[i] if i < len(
1836 table) else [None] * len(self.table)
1837 values.append(r1 + r2)
1838 return self._private_getclass()(header, values)
1840 def random(self, n, unique=False):
1841 """
1842 select n random row from the table, returns a table
1844 @param n number of desired random rows
1845 @param unique draws unique rows or non unique rows
1846 (tirage sans remise ou avec remise)
1847 @return a table
1849 Example:
1851 ::
1853 rnd = table.random(10)
1854 """
1855 if unique:
1856 if n > len(self):
1857 raise ValueError("number of desired random rows is higher "
1858 "than the number of rows in the table")
1859 index = {}
1860 while len(index) < n:
1861 h = random.randint(0, len(self) - 1)
1862 index[h] = 0
1863 values = [self.values[h] for h in index]
1864 return self._private_getclass()(self.header, values)
1865 else:
1866 values = []
1867 for i in range(0, n):
1868 h = random.randint(0, len(self) - 1)
1869 values.append(self.values[h])
1870 return self._private_getclass()(self.header, values)
1872 def todict(self, functionKey, functionValue, useList=False):
1873 """
1874 convert the table as a dictionary { key:value }
1875 each of them is defined by functions.
1877 @param functionKey defines the key
1878 @param functionValue defines the value
1879 @param useList if there are multiple rows sharing the same key, it should be true,
1880 all values are stored in a list
1881 @return a dictionary { key:row } or { key: [row1, row2, ...] }
1883 Example:
1885 ::
1887 d = table.todict(lambda v: v["name"], lambda v: v["d_b"], True)
1888 """
1889 res = {}
1890 if useList:
1891 for row in self.values:
1892 v = self._interpret_row(row)
1893 key = functionKey(v)
1894 val = functionValue(v)
1895 if key in res:
1896 res[key].append(val)
1897 else:
1898 res[key] = [val]
1899 else:
1900 for row in self.values:
1901 v = self._interpret_row(row)
1902 key = functionKey(v)
1903 val = functionValue(v)
1904 res[key] = val
1905 return res
1907 def reduce_dict(self, functionKey, functionValue, uselist=False):
1908 """
1909 @see me todict
1910 """
1911 return self.todict(functionKey, functionValue, uselist)
1913 def select(self, functionRow):
1914 """
1915 @param functionRow fonction
1916 @return table
1918 Example:
1920 ::
1922 d = table.select(lambda v:(v["name"], v["d_b"]))
1923 print(list(d))
1924 """
1925 for row in self.values:
1926 v = self._interpret_row(row)
1927 nr = functionRow(v)
1928 yield nr
1930 def modify_all(self, modification_function):
1931 """
1932 apply the same modification to every number
1933 @param modification_function modification to apply to every number
1934 @return new table
1936 The signature of the function is the following one:
1938 ::
1940 def function(value, column_name):
1941 # ....
1942 return new_value
1944 Example:
1946 ::
1948 tbl = tbl.modify_all(lambda v,c: {"string":"", "numerical":0}.get(c,None) if v is None else v)
1949 """
1950 values = []
1951 for row in self.values:
1952 r = []
1953 for v, h in zip(row, self.header):
1954 r.append(modification_function(v, h))
1955 values.append(r)
1956 return self._private_getclass()(self.header, values)
1958 def dcast(self, functionKey, functionInstance, full=True):
1959 """
1960 @see me multiply_column_by_row_instance
1961 """
1962 return self.multiply_column_by_row_instance(functionKey, functionInstance, full)
    def multiply_column_by_row_instance(self, functionKey, functionInstance, full=True):
        """
        @param  functionKey      defines a key(function)
        @param  functionInstance defines a second key(will be moved to the columns dimension)
        @param  full             introduces missing values for not found combinations
        @return                  a table

        If a column contains a finite set of value, for example,
        we have the temperature for several cities organized like if
        it were a table from a database: city, date, temperatue.
        We would like to get another table where we have:
        date temparature_city1 temperature_city2...

        Then we would type:
        Example:

        ::

            mul = table.multiply_column_by_row_instance(
                            lambda v: v["date"],
                            lambda v: v["city"])

        The input table would be like:

        ::

            city   date
            A      jan
            A      feb
            B      feb

        It returns:

        ::

            KEY  A|city  A|date  B|city  B|date
            feb  A       feb     B       feb
            jan  A       jan     None    None
        """
        # distinct, sorted values of the second key: one block of columns each
        values = [functionInstance(self._interpret_row(row))
                  for row in self.values]
        distinct = {}
        for v in values:
            distinct[v] = 0
        distinct = [_ for _ in distinct]
        distinct.sort()
        table1 = copy.deepcopy(self)
        table = None
        header = copy.copy(table1.header)
        orig = len(header)
        # temporary key column name guaranteed not to clash with a real column
        nameKey = "~KEY~"
        while nameKey in header:
            nameKey += "*"
        nbJoin = 0

        for val in distinct:
            # rows belonging to the current instance value
            table2 = table1.filter(
                lambda v, val=val: functionInstance(v) == val)
            if table is None:
                table = table2.copy()
            else:
                # join the accumulated result with the next block; after the
                # first join, the key is the first column of the result
                colkey = table.header[0]
                table = table.innerjoin(table2, functionKey if nbJoin == 0 else (lambda v, c=colkey: v[c]),
                                        functionKey, nameKey=nameKey,
                                        prefixToAdd=str(val) + "|",
                                        full=full, keepKey=nbJoin == 0,
                                        putKeyInColumn=None if nbJoin == 0 else 0,
                                        uniqueKey=True)

            if nbJoin == 0:
                # first block: prefix the original columns with the first
                # instance value and name the key column "KEY"
                head = []
                nb = 0
                for h in table.header:
                    if not h.endswith("~") and nb < orig:
                        head.append("%s|%s" % (distinct[0], h))
                        nb += 1
                    else:
                        head.append(h)
                header = ["KEY"] + head[1:]
                table = self._private_getclass()(header, table.values)

            nbJoin += 1

        if nbJoin == 0:
            # no join happened (no distinct instance value)
            # NOTE(review): in that case *table* is still None and
            # ``table.header`` below would raise — this path looks
            # reachable only for an empty table, confirm
            head = []
            nb = 0
            for h in table.header:
                if not h.endswith("~") and nb < orig:
                    head.append("%s|%s" % (distinct[0], h))
                    nb += 1
                else:
                    head.append(h)
            values = []
            for row in self.values:
                v = self._interpret_row(row)
                r = [functionKey(v)] + row
                values.append(r)
            header = ["KEY"] + head
            table = self._private_getclass()(header, values)

        return table
2066 def create_index(self, functionIndex):
2067 """
2068 this method creates an index,
2069 to get an indexes row, use method get
2070 Example:
2072 ::
2074 table.create_index(lambda v:(v["name"], v["d_a"]))
2075 row = table.get(('A', 1.1))
2076 value = table.get(('A', 1.1), 2)
2077 """
2078 self.indexspecial = {}
2079 for row in self.values:
2080 v = self._interpret_row(row)
2081 nr = functionIndex(v)
2082 if nr in self.indexspecial:
2083 raise KeyError(
2084 "unable to add %s because it is already present" % str(nr))
2085 self.indexspecial[nr] = row
2086 return self
2088 def get(self, rowIndex, column=None):
2089 """
2090 use the index created by method create_index
2091 Example:
2093 ::
2095 table.create_index(lambda v:(v["name"], v["d_a"]))
2096 row = table.get(('A', 1.1))
2097 value = table.get(('A', 1.1), 2)
2098 """
2099 if "indexspecial" not in self.__dict__:
2100 raise Exception("no index was created")
2101 row = self.indexspecial[rowIndex]
2102 if column is None:
2103 return row
2104 elif isinstance(column, int):
2105 return row[column]
2106 else:
2107 return row[self.index[column]]
2109 def avg_std(self, functionValue, functionWeight=lambda v: 1):
2110 """
2111 returns the average and standard deviation
2112 """
2113 avg = 0.
2114 std = 0.
2115 n = 0.
2116 for i, row in enumerate(self.values):
2117 v = self._interpret_row(row)
2118 x = float(functionValue(v))
2119 w = functionWeight(v)
2120 avg += x * w
2121 std += x * x * w
2122 n += w
2124 if n != 0:
2125 avg /= n
2126 std /= n
2127 std -= avg * avg
2128 std = math.sqrt(std)
2129 else:
2130 avg = 0.
2131 std = 0.
2132 return avg, std
    def add_column_cumulative(self, column_index, column_name, functionIndex, functionValue,
                              normalize=False, reverse=False, cumulative=True, functionSort=None):
        """
        also called the Gini function
        Adds two columns: a (possibly cumulated and normalized) index and
        value, rows being processed in the order given by *functionSort*.

        Example:

        ::

            table.add_column_cumulative("index_%s" % col, "dist_%s" % col,
                        lambda v: v["sum_nbclient"], lambda v: v[col],
                        functionSort = lambda v: v [col] / v["sum_nbclient"],
                        normalize=True)
        """
        if functionSort is None:
            functionSort = functionValue
        # decorate every row with (sort key, index, value) and sort
        val = []
        for row in self.values:
            v = self._interpret_row(row)
            i = functionIndex(v)
            s = functionSort(v)
            v = functionValue(v)
            val.append((s, i, v))
        val.sort(reverse=reverse)

        if cumulative:
            # running sums of index and value; the initial (0., 0.) seed
            # is removed afterwards
            res = [(0., 0.)]
            for s, i, v in val:
                res.append((i + res[-1][0], v + res[-1][1]))
            del res[0]

            if normalize:
                # divide by the totals (the last cumulated pair)
                sumi = res[-1][0]
                sumv = res[-1][1]
                if sumi != 0 and sumv != 0:
                    res = [(_[0] / sumi, _[1] / sumv) for _ in res]
                else:
                    raise ZeroDivisionError(
                        "cannot divide by zero, all indexes or all values are null")
        else:
            res = [(i, v) for s, i, v in val]

            if normalize:
                sumi = sum([_[0] for _ in res])
                sumv = sum([_[1] for _ in res])
                if sumi != 0 and sumv != 0:
                    res = [(_[0] / sumi, _[1] / sumv) for _ in res]
                else:
                    raise ZeroDivisionError(
                        "cannot divide by zero, all indexes or all values are null")

        # NOTE(review): res follows the sorted order while self.values keeps
        # the original row order — the pairs appended below only line up when
        # the table is already sorted by functionSort, confirm
        for row, add in zip(self.values, res):
            row.extend(add)
        self.index[column_index] = len(self.index)
        self.index[column_name] = len(self.index)
        self.header.append(column_index)
        self.header.append(column_name)
        return self
2192 def transpose(self, labelC=None, labelAsRow=True):
2193 """
2194 Computes the transpose.
2195 @param labelC proposes labels for the column,
2196 if None, take "r%d" % i,
2197 if it is a string, the function assumes it is a column name
2198 @param labelAsRow add the label as a row
2199 @return new table
2200 """
2201 if labelC is None:
2202 label = ["r%d" % i for i in range(0, len(self.values))]
2203 if labelAsRow:
2204 label = ["rowheader"] + label
2205 rem = None
2206 elif isinstance(labelC, str):
2207 label = list(self.select(lambda v: v[labelC]))
2208 rem = label
2209 else:
2210 rem = None
2211 label = labelC
2213 values = []
2214 for i in range(0, len(self.header)):
2215 if rem is not None and self.header[i] == labelC:
2216 continue
2217 row = [_[i] for _ in self.values]
2218 if labelAsRow:
2219 row = [self.header[i]] + row
2220 values.append(row)
2221 return self._private_getclass()(label, values)
    def covariance(self):
        """
        Computes the covariance matrix, the first column
        will contains the column names.
        @return     new table
        """
        # every column must be numerical; only the first row is checked
        for i, x in enumerate(self.values[0]):
            if not isinstance(x, float):
                raise TypeError("expecting a float on column %d" % i)
        # np_matrix is a property defined elsewhere in the class;
        # the indexing sums[0, i] and the '*' product below suggest it is a
        # numpy matrix (matrix multiplication) — TODO confirm
        values = self.np_matrix
        N = values.shape[0]
        sums = numpy.sum(values, axis=0) / N
        # center every column in place
        for i in range(0, values.shape[1]):
            values[:, i] -= sums[0, i]
        cov = values.transpose() * values
        cov /= N
        head = ["var"] + self.header
        size = cov.shape
        # first cell of each row is the variable name, then the covariances
        values = [[self.header[
            i]] + [float(cov[i, j]) for j in range(0, size[1])] for i in range(0, size[0])]
        tbl = self._private_getclass()(head, values)
        return tbl
2246 def correlation_col(self, col1, col2, noCenter=False):
2247 """
2248 Computes the correlation between two columns.
2249 @param col1 column 1
2250 @param col2 column 2
2251 @param noCenter does the computation without removing the average
2252 @return float(covariance)
2253 """
2254 values = [[self._interpret_row(row)[col1], self._interpret_row(row)[
2255 col2]] for row in self.values]
2256 if len(values) <= 1:
2257 raise ValueError( # pragma: no cover
2258 "expecting more than one observation, not %d" % len(values))
2259 mx = 0.
2260 my = 0.
2261 vx = 0.
2262 vy = 0.
2263 co = 0.
2264 nb = 0.
2265 for a, b in values:
2266 nb += 1
2267 mx += a
2268 my += b
2269 vx += a ** 2
2270 vy += b ** 2
2271 co += a * b
2272 mx /= nb
2273 my /= nb
2274 vx /= nb
2275 vy /= nb
2276 co /= nb
2277 if not noCenter:
2278 vx -= mx ** 2
2279 vy -= my ** 2
2280 co -= mx * my
2281 vx = vx ** 0.5
2282 vy = vy ** 0.5
2283 v = vx * vy
2284 if v != 0:
2285 co /= v
2286 return co
2288 def covariance_col(self, col1, col2, noCenter=False):
2289 """
2290 Computes the correlation between two columns.
2291 @param col1 column 1
2292 @param col2 column 2
2293 @param noCenter does the computation without removing the average
2294 @return float(covariance)
2295 """
2296 values = [[self._interpret_row(row)[col1],
2297 self._interpret_row(row)[col2]] for row in self.values]
2299 if len(values) <= 1:
2300 raise ValueError( # pragma: no cover
2301 "expecting more than one observation, not %d" % len(values))
2303 mx = 0.
2304 my = 0.
2305 co = 0.
2306 nb = 0.
2307 for a, b in values:
2308 nb += 1
2309 mx += a
2310 my += b
2311 co += a * b
2312 mx /= nb
2313 my /= nb
2314 co /= nb
2315 if not noCenter:
2316 co -= mx * my
2317 return co
2319 def correlation_row(self, row1, row2, noCenter=False):
2320 """
2321 computes the correlation between two columns
2322 @param row1 row 1(integer)
2323 @param row2 row 2(integer)
2324 @param noCenter does the computation without removing the average
2325 @return float(covariance)
2326 """
2327 values = [[a, b] for a, b in zip(self.values[row1], self.values[row2])]
2328 if len(values) <= 1:
2329 raise ValueError( # pragma: no cover
2330 "expecting more than one observation, not %d" % len(values))
2331 mx = 0.
2332 my = 0.
2333 vx = 0.
2334 vy = 0.
2335 co = 0.
2336 nb = 0.
2337 for a, b in values:
2338 nb += 1
2339 mx += a
2340 my += b
2341 vx += a ** 2
2342 vy += b ** 2
2343 co += a * b
2344 mx /= nb
2345 my /= nb
2346 vx /= nb
2347 vy /= nb
2348 co /= nb
2349 if not noCenter:
2350 vx -= mx ** 2
2351 vy -= my ** 2
2352 co -= mx * my
2353 vx = vx ** 0.5
2354 vy = vy ** 0.5
2355 v = vx * vy
2356 if v != 0:
2357 co /= v
2358 return co
2360 def covariance_row(self, row1, row2, noCenter=False):
2361 """
2362 computes the correlation between two columns
2363 @param row1 row 1(integer)
2364 @param row2 row 2(integer)
2365 @param noCenter does the computation without removing the average
2366 @return float(covariance)
2367 """
2368 values = [[a, b] for a, b in zip(self.values[row1], self.values[row2])]
2369 if len(values) <= 1:
2370 raise ValueError( # pragma: no cover
2371 "expecting more than one observation, not %d" % len(values))
2372 mx = 0.
2373 my = 0.
2374 co = 0.
2375 nb = 0.
2376 for a, b in values:
2377 nb += 1
2378 mx += a
2379 my += b
2380 co += a * b
2381 mx /= nb
2382 my /= nb
2383 co /= nb
2384 if not noCenter:
2385 co -= mx * my
2386 return co
    def correlation(self, useBootstrap=False, collapseFormat=True, nbdraws=-1, alpha=0.05,
                    functionKeepValue=lambda val, low, high: "%f|%f,%f" % (val, low, high)):
        """
        Computes the correlation matrix, the first column
        will contains the column names.

        @param      useBootstrap        if True, use a bootstrap method to estimate the correlation
        @param      collapseFormat      if True and useBootstrap is True, produces a format
                                        ``average|lower bound|higher bound(at a definite confidence level)``
        @param      nbdraws             number of draws(if -1, then it will be equal to the number of observations)
        @param      alpha               confidence level
        @param      functionKeepValue   if collapseFormat is True, this function is used to collapse val,low,high in a single string
        @return                         new table
        """
        if useBootstrap:
            head = ["var"] + self.header
            # square matrix filled with zeros; the first cell of each row
            # will receive the row (column) name
            values = [[i] + [0. for r in self.header] for i in self.header]
            for i in range(len(self.header)):
                values[i][0] = self.header[i]
                for j in range(len(self.header)):
                    # bootstrap the correlation of the pair of columns (i, j)
                    vs = [[row[i], row[j]] for row in self.values]
                    bo = TableFormula.bootstrap(vs, function=TableFormula.correlation_bicolumn,
                                                nbdraws=nbdraws, alpha=alpha)
                    if collapseFormat:
                        # bo is assumed to be (estimate, ..., low, high)
                        # -- TODO confirm against TableFormula.bootstrap
                        st = functionKeepValue(bo[0], bo[2], bo[3])
                        values[i][j + 1] = st
                    else:
                        raise NotImplementedError(  # pragma: no cover
                            "collapseFormat False is not implemented yet")
            tbl = self._private_getclass()(head, values)
            return tbl
        else:
            # the closed-form path requires numeric data
            for i, x in enumerate(self.values[0]):
                if not isinstance(x, float):
                    raise TypeError(  # pragma: no cover
                        "expecting a float on column %d" % i)

            values = self.np_matrix
            N = values.shape[0]
            sums = [sum(values[:, i]) / N for i in range(0, values.shape[1])]

            # center each column in place
            for i in range(0, values.shape[1]):
                values[:, i] -= sums[i]

            # covariance matrix (numpy.matrix: * is a matrix product)
            cov = values.transpose() * values
            cov /= N
            # divide rows and columns by the standard deviations to obtain
            # correlations; columns with null variance are left untouched
            diag = [cov[i, i] ** 0.5 for i in range(cov.shape[0])]
            for i in range(cov.shape[0]):
                if diag[i] > 0:
                    cov[i, :] /= diag[i]
                    cov[:, i] /= diag[i]

            head = ["var"] + self.header
            size = cov.shape
            values = [[self.header[
                i]] + [float(cov[i, j]) for j in range(0, size[1])] for i in range(0, size[0])]
            tbl = self._private_getclass()(head, values)
            return tbl
2447 def values_to_float(self, only_if_possible=False, subset_columns=None):
2448 """
2449 converts all values into float
2450 @param only_if_possible if True, converts all possible values and catches exception,
2451 if False, converts everything, raises an exception when not possible
2452 @param subset_columns if None, takes all of them, otherwise, try to convert
2453 only for the listed columns
2454 @return table
2455 """
2456 tbl = self.copy()
2457 if subset_columns is not None:
2458 subset = {i: True for i, v in enumerate(
2459 self.header) if v in subset_columns}
2460 if only_if_possible:
2461 for row in tbl.values:
2462 for i in range(0, len(row)):
2463 if subset_columns is None or i in subset:
2464 try:
2465 v = float(row[i])
2466 row[i] = v
2467 except(ValueError, TypeError):
2468 continue
2469 else:
2470 for row in tbl.values:
2471 for i in range(0, len(row)):
2472 if subset_columns is None or i in subset:
2473 row[i] = float(row[i])
2474 return tbl
2476 def values_to_str(self, subset_columns=None, format=None):
2477 """
2478 converts all values into str
2479 @param subset_columns if None, takes all of them, otherwise, try to convert
2480 only for the listed columns
2481 @param format format for the conversion, by None by default but it could be for exemple %1.2f.
2482 @return table
2483 """
2484 tbl = self.copy()
2485 if subset_columns is not None:
2486 subset = {i: True for i, v in enumerate(
2487 self.header) if v in subset_columns}
2489 if format is None:
2490 for row in tbl.values:
2491 for i in range(0, len(row)):
2492 if subset_columns is None or i in subset:
2493 row[i] = str(row[i])
2494 else:
2495 for row in tbl.values:
2496 for i in range(0, len(row)):
2497 if(subset_columns is None or i in subset) and isinstance(row[i], float):
2498 row[i] = format % row[i]
2499 return tbl
2501 def values_to_date(self, format=None, only_if_possible=False, subset_columns=None):
2502 """
2503 converts all values into dates
2504 @param only_if_possible if True, converts all possible values and catches exception,
2505 if False, converts everything, raises an exception when not possible
2506 @param format date format see fn str_to_datetime
2507 @param subset_columns if None, takes all of them, otherwise, try to convert
2508 only for the listed columns
2509 @return table
2510 """
2511 tbl = self.copy()
2512 if subset_columns is not None:
2513 subset = {i: True for i, v in enumerate(
2514 self.header) if v in subset_columns}
2515 if only_if_possible:
2516 if subset_columns is not None:
2517 subset = {i: True for i, v in enumerate(
2518 self.header) if v in subset_columns}
2520 for row in tbl.values:
2521 for i in range(0, len(row)):
2522 if subset_columns is None or i in subset:
2523 try:
2524 v = str2datetime(row[i], format)
2525 row[i] = v
2526 except(ValueError, TypeError):
2527 continue
2528 else:
2529 for row in tbl.values:
2530 for i in range(0, len(row)):
2531 if subset_columns is None or i in subset:
2532 row[i] = float(row[i])
2533 return tbl
2535 def histogram(self, functionValue, nbDiv=100, secondColumnIsWeight=False,
2536 normalize=True, removeExtreme=0.05):
2537 """
2538 computes an histograms on one vector
2539 @param functionValue function which produces the value to histogram
2540 @param nbDiv number of divisions for this histograms(boundaries are min and max)
2541 @param secondColumnIsWeight if True, the second column is the weight
2542 @param normalize if True, normalize by the sum of weights
2543 @param removeExtreme remove extreme values at both sides(0.05 means 0.025 on each side)
2544 @return table with two columns
2545 """
2546 values = list([functionValue(self._interpret_row(row)) # pylint: disable=R1728
2547 for row in self.values])
2548 if removeExtreme is not None and removeExtreme > 0:
2549 values.sort()
2550 al = int(len(values) * removeExtreme / 2)
2551 if al == 0:
2552 raise Exception( # pragma: no cover
2553 "removeExtreme has no impact(%d,%f)" % (
2554 len(values), len(values) * removeExtreme / 2))
2555 if al * 2 < len(values):
2556 values = values[al:len(values) - al]
2558 mi = min(values)
2559 ma = max(values)
2561 if isinstance(values[0], (tuple, list)):
2562 W = 0.
2563 div = (ma[0] - mi[0]) / nbDiv
2564 hist = [[mi[0] + n * div, 0.] for n in range(0, nbDiv + 1)]
2565 for v in values:
2566 x = int((v[0] - mi[0]) // div)
2567 hist[x][1] += v[1]
2568 W += v[1]
2569 mi = mi[0]
2570 else:
2571 W = len(values)
2572 div = (ma - mi) / nbDiv
2573 hist = [[mi + n * div, 0.] for n in range(0, nbDiv + 1)]
2574 for v in values:
2575 x = int((v - mi) // div)
2576 if 0 <= x < len(hist):
2577 hist[x][1] += 1.
2579 if normalize and W > 0:
2580 for i in range(len(hist)):
2581 hist[i][1] /= W
2583 values = [[mi + n * div, hist[n]] for n in range(len(hist))]
2584 tbl = self._private_getclass()(["x", "hist(x)"], hist)
2585 return tbl
    def histograms(self, columnsSet, nbDiv=100, secondColumnIsWeight=False,
                   normalize=True, removeExtreme=0.05, histxName="histKey"):
        """
        Computes a common histogram on all columns.

        @param      columnsSet              set of columns
        @param      nbDiv                   number of divisions for this histogram (boundaries are min and max)
        @param      secondColumnIsWeight    if True, the second column is the weight
        @param      normalize               if True, normalizes by the sum of weights
        @param      removeExtreme           removes extreme values at both sides (0.05 means 0.025 on each side)
        @param      histxName               column name given to the x axis shared by every histogram
        @return                             table with two columns

        @warning The function skips any NaN of Inf value.
        """
        # pool every value from every requested column into a single vector
        # so that all histograms share the same boundaries
        values = []
        for row in self.values:
            temp = self._interpret_row(row)
            for t in columnsSet:
                values.append(temp[t])

        if removeExtreme is not None and removeExtreme > 0:
            values.sort()
            al = int(len(values) * removeExtreme / 2)
            if al == 0:
                raise Exception(  # pragma: no cover
                    "removeExtreme has no impact(%d,%f)" % (
                        len(values), len(values) * removeExtreme / 2))
            if al * 2 < len(values):
                values = values[al:len(values) - al]

        mi = min(values)
        ma = max(values)
        W = len(values)
        div = (ma - mi) / nbDiv
        if div == 0:
            raise RuntimeError(  # pragma: no cover
                "unable to continue since div is null: min,max = %f,%f" % (mi, ma))
        hist = [[mi + n * div, 0.] for n in range(0, nbDiv + 1)]
        # value: bin index -> { histxName: left boundary, column name: count }
        value = {i: {histxName: hist[i][0]} for i in range(len(hist))}
        # su: column name -> number of finite observations kept (per-column
        # normalization factor)
        su = {}
        for row in self.values:
            for _ in columnsSet:
                temp = self._interpret_row(row)
                # the histogram skips any NaN or Inf value
                if math.isnan(temp[_]) or math.isinf(temp[_]):
                    continue
                x = int((temp[_] - mi) // div)
                if x not in value:
                    # it means extremes were removed
                    continue
                #raise Exception("value %d,%f is not allowed min,max = [%f,%f]" %(x, temp[_], mi, ma))
                value[x][_] = value[x].get(_, 0.) + 1.
                su[_] = su.get(_, 0.) + 1.

        if normalize and W > 0:
            for v in value.values():
                for _ in v:
                    if _ != histxName:
                        v[_] /= su[_]

        tbl = self._private_getclass()("__byrow__", value)
        return tbl
2649 def union_columns(self, columnsSet):
2650 """
2651 computes the union of all values from all columns present in columnSet
2652 @param columnsSet set of columns
2653 @return table
2654 """
2655 values = []
2656 for row in self.values:
2657 temp = self._interpret_row(row)
2658 for t in columnsSet:
2659 values.append(temp[t])
2660 tbl = self._private_getclass()(["x"], [[x] for x in values])
2661 return tbl
2663 def mu_sigma(self, functionValues, removeExtreme=None):
2664 """
2665 computes the average and the standard deviation a vector of values
2666 @param functionValues function produces the vector of values
2667 @param removeExtreme remove extreme values at both sides(0.05 means 0.025 on each side)
2668 @return (average, standard deviation)
2669 """
2670 if removeExtreme is not None and removeExtreme > 0:
2671 values = []
2672 for row in self.values:
2673 row = self._interpret_row(row)
2674 val = functionValues(row)
2675 values.append(val)
2676 values.sort()
2677 al = int(len(values) * removeExtreme / 2)
2678 if al == 0:
2679 raise Exception("removeExtreme has no impact(%d,%f)" % (
2680 len(values), len(values) * removeExtreme / 2))
2681 if al * 2 < len(values):
2682 values = values[al:len(values) - al]
2683 tbl = TableFormula(["x"], [[_] for _ in values])
2684 return tbl.mu_sigma(lambda v: v["x"], 0)
2685 else:
2686 mu = 0.
2687 si = 0.
2688 nb = 0.
2689 for row in self.values:
2690 row = self._interpret_row(row)
2691 val = functionValues(row)
2692 mu += val
2693 si += val ** 2
2694 nb += 1.
2695 mu /= nb
2696 si /= nb
2697 si -= mu ** 2
2698 return mu, si ** 0.5
2700 def mu_sigma_each_column(self, columnsSet=None, removeExtreme=None):
2701 """
2702 returns a table with the average and the standard deviation for each columns
2703 @param removeExtreme remove extreme values at both sides(0.05 means 0.025 on each side)
2704 @param columnsSet set of column to deal with
2705 @return table with two rows: average and standard deviation
2706 """
2707 values = [[], []]
2708 if columnsSet is None:
2709 columnsSet = self.header
2710 for col in columnsSet:
2711 mu, sigma = self.mu_sigma(
2712 (lambda v, col=col: v[col]), removeExtreme)
2713 values[0].append(mu)
2714 values[1].append(sigma)
2715 tbl = self._private_getclass()(columnsSet, values)
2716 return tbl
2718 @property
2719 def np_matrix(self):
2720 """
2721 returns the values as a numpy matrix
2722 @return numpy matrix
2723 """
2724 return numpy.matrix(self.values)
2726 @property
2727 def np_array(self):
2728 """
2729 returns the values as a numpy array
2730 @return numpy array
2731 """
2732 return numpy.array(self.values)
2734 @property
2735 def dataframe(self):
2736 """
2737 creates a pandas dataframe
2738 @return pandas.dataframe
2739 """
2740 return pandas.DataFrame(self.values, columns=self.header)
2742 @property
2743 def json(self):
2744 """
2745 returns a json format
2746 @return string
2747 """
2748 rows = [row for row in self]
2749 return json.dumps(rows)
2751 def center_reduce(self, columnsSet=None, op=None, removeExtreme=None, mu_sigma=None):
2752 """
2753 center and reduce a set of columns(or all if columnsSet is None)
2754 @param removeExtreme remove extreme values at both sides(0.05 means 0.025 on each side)
2755 @param columnsSet set of column to deal with
2756 @param op if can be:
2757 - None: substract mean and normalize,
2758 - "mean": substract mean only,
2759 - "norm": normalize only
2760 @param mu_sigma matrix with two rows(one for mean, second for sigma), if None,
2761 if computes that from the matrix self, columns must have the same order
2762 that columnSet
2763 @return the same table(with only the considered columns)
2764 """
2765 if op not in [None, "mean", "norm"]:
2766 raise ValueError(
2767 'expecting a value in [None, "mean", "norm"] for op')
2768 if columnsSet is None:
2769 columnsSet = self.header
2770 mus = self.mu_sigma_each_column(
2771 columnsSet, removeExtreme) if mu_sigma is None else mu_sigma
2772 tbl = self.extract_columns(columnsSet)
2773 n = len(self.header)
2774 for row in tbl.values:
2775 if op is None or op == "mean":
2776 for i in range(n):
2777 row[i] -= mus.values[0][i]
2778 if op is None or op == "norm":
2779 for i in range(n):
2780 row[i] /= mus.values[1][i]
2781 return tbl
    @staticmethod
    def save_multiple_as_excel(filename, list_table, font="Calibri", close=True, encoding=None):
        """
        Saves multiple tables in one Excel file.

        @param      filename        filename (can be None; the extension decides
                                    the writer: ``.xls`` uses xlwt, ``.xlsx`` or
                                    None uses xlsxwriter)
        @param      list_table      list of 2-uples ``("name", tbl)``
        @param      font            font name
        @param      close           if True, close the file, otherwise, the user will have to
        @param      encoding        encoding
        @return                     object Workbook
        """
        ext = os.path.splitext(
            filename)[-1].lower() if filename is not None else None
        if ext is not None and ext == ".xls":
            # legacy .xls format written with xlwt
            font0 = EXf.Font()
            font0.name = font
            font0.bold = True
            style0 = EXs.XFStyle()
            style0.font = font0

            wb = EXw.Workbook(
                encoding=encoding) if encoding is not None else EXw.Workbook()
            for sheet_name, self in list_table:
                ws0 = wb.add_sheet(sheet_name)

                # header row in bold
                for i, l in enumerate(self.header):
                    ws0.write(0, i, l, style0)

                fnt = EXf.Font()
                fnt.name = font
                style = EXs.XFStyle()
                style.font = fnt

                for irow, row in enumerate(self.values):
                    for icol, val in enumerate(row):
                        if isinstance(val, (int, float)):
                            st = val
                        elif isinstance(val, str):
                            if encoding is not None:
                                st = val.encode(encoding).decode(encoding)
                            else:
                                st = val
                        elif val is not None:
                            st = str(val)
                        else:
                            # None cells are left empty
                            continue
                        ws0.write(irow + 1, icol, st, style)

            wb.save(filename)
            return wb

        elif ext is None or ext == ".xlsx":
            # modern .xlsx format written with xlsxwriter
            wb = EXxw.Workbook(
                filename) if filename is not None else EXxw.Workbook()
            for sheet_name, self in list_table:
                ws0 = wb.add_worksheet(sheet_name)

                style0 = wb.add_format({'bold': True})
                style0.set_font_name(font)

                # header row in bold
                for i, l in enumerate(self.header):
                    ws0.write(0, i, l, style0)

                style = wb.add_format()
                style.set_font_name(font)

                for irow, row in enumerate(self.values):
                    for icol, val in enumerate(row):
                        if isinstance(val, (int, float)):
                            st = val
                        elif isinstance(val, str):
                            if encoding is not None:
                                st = val.encode(encoding).decode(encoding)
                            else:
                                st = val
                        elif val is not None:
                            st = str(val)
                        else:
                            # None cells are left empty
                            continue
                        ws0.write(irow + 1, icol, st, style)

            if filename is not None and close:
                wb.close()
            return wb
        else:
            raise NameError(
                "extension should be .xls or .xlsx for file " + filename)
2872 def save_as_excel(self, filename, font="Calibri", sheet_name="sheet0",
2873 close=True, encoding=None):
2874 """
2875 saves the table as a new Excel file, you can use ``.xls`` or ``.xlsx``
2876 if filename is None, the function returns an object(xslx) and does not save it.
2878 @param filename Excel filename
2879 @param sheet_name name of the sheet to add
2880 @param font font name
2881 @param close if True, close the file, otherwise, the user will have to
2882 @param encoding encoding
2883 @return object Workbook
2884 """
2885 return TableFormula.save_multiple_as_excel(filename, [(sheet_name, self)],
2886 font=font, close=close, encoding=encoding)
2888 def schema_database(self, add_id=True):
2889 """
2890 returns the schema for a database which would contains this database
2892 @param add_id if True, adds an index "PRIMARYKEY"
2893 @return dictionary { index_column:(name, type) }
2894 """
2895 schema = {i: (l, str) for i, l in enumerate(self.header)}
2896 if add_id is not None:
2897 schema[-1] = (add_id, int, "PRIMARYKEY", "AUTOINCREMENT")
2899 if len(self) > 0:
2900 # we use the first row to determine type
2901 for i, v in enumerate(self.values[0]):
2902 if not isinstance(v, str):
2903 schema[i] = (schema[i][0], type(v))
2904 return schema
2906 def fill_sql_table(self, filename_or_database, tablename, add_id="idr"):
2907 """
2908 returns a Database object, creates the database if it does not exists,
2909 same for the table
2911 @param filename_or_database filename or Database object,
2912 in that second case, we assume method connect
2913 was called before
2914 @param tablename table name
2915 @param add_id if is not None, then the function adds an id, it first takes the
2916 max(id) and goes on incrementing it;
2917 @return Database object(new or the one from the parameters),
2918 in both case, the database is not disconnected
2919 """
2920 schema = self.schema_database(add_id)
2922 if isinstance(filename_or_database, str):
2923 fLOG("fill_sql_table: creating database ", filename_or_database)
2924 db = Database(filename_or_database, LOG=fLOG)
2925 db.connect()
2927 fLOG("fill_sql_table ", schema)
2928 if tablename not in db.get_table_list():
2929 fLOG("creationg of table ", schema)
2930 cursor = db.create_table(tablename, schema)
2931 db.append_values(self.values, tablename, schema, cursor=cursor)
2932 else:
2933 db.append_values(self.values, tablename, schema)
2934 else:
2935 db = filename_or_database
2936 db.append_values(self.values, tablename, schema)
2938 return db