Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf-8 -*-
2"""
3@file
4@brief Contains a class to process elections results (France)
5"""
6import random
7import numpy
8import pandas
class ElectionResults:
    """
    Processes data coming from
    `data.gouv.fr <http://www.data.gouv.fr/content/search?SortBy=Pertinence&SortOrder=0&SearchText=%C3%A9lections+2012>`_.

    The class uses `pandas <http://pandas.pydata.org/>`_ to process the data.
    See `Elections françaises <http://www.xavierdupre.fr/blog/2013-12-06_nojs.html>`_.
    See `Optimisation sous contraintes appliquée au calcul du report des voix <http://www.xavierdupre.fr/blog/2013-12-07_nojs.html>`_.
    """

    def __init__(self, file, year=None, level="Départements"):
        """
        Loads the data downloaded from
        `data.gouv.fr <http://www.data.gouv.fr/content/search?SortBy=Pertinence&SortOrder=0&SearchText=%C3%A9lections+2012>`_.

        @param      file        xls file, or a list of two already processed
                                dataframes (first round, second round)
        @param      year        year (optional)
        @param      level       ``Départements`` or ``Cantons``

        Both dataframes are sorted by the column ``Libellé du <level>``.
        An exception is raised if a sheet is empty (before or after
        processing) or if the sort fails.
        """
        self.year = year
        # The column headers use the singular, lower-case form of the level
        # ("Code du département"): only the trailing plural "s" is dropped.
        singular = level.lower()
        if singular.endswith("s"):
            singular = singular[:-1]
        self.level = singular

        if isinstance(file, list):
            # Dataframes given as a list are already in the processed layout
            # (one column per candidate), they must not be processed again.
            self.tours = file
        else:
            self.tours = [
                pandas.read_excel(file, sheet_name="%s T1" % level,
                                  engine='openpyxl'),
                pandas.read_excel(file, sheet_name="%s T2" % level,
                                  engine='openpyxl')]
            for i, t in enumerate(self.tours):
                if len(t) == 0:
                    raise Exception("no data for tour %d" % (i + 1))
            self.tours = [self.process_tour(t) for t in self.tours]
            for i, t in enumerate(self.tours):
                if len(t) == 0:
                    raise Exception("no data for tour %d" % (i + 1))

        try:
            self.tours = [
                t.sort_values("Libellé du %s" % self.level, inplace=False)
                for t in self.tours]
        except Exception as e:
            message = "unable to sort, shape={1} columns={0}".format(
                ",".join(self.tours[0].columns), self.tours[0].shape)
            raise Exception(message) from e

    def _vote_columns(self, round):
        """
        Returns the columns of a round which are not level columns
        (see @see me LevelCol).

        @param      round       0 or 1
        @return                 list of column names
        """
        level_cols = self.LevelCol
        return [c for c in self.tours[round].columns if c not in level_cols]

    def get_candidates_votes(self, round):
        """
        Returns the numbers of voters for each candidate.

        @param      round       0 or 1
        @return                 dictionary ``{ column name: sum of the column }``
        """
        frame = self.tours[round]
        return {c: frame[c].sum() for c in self._vote_columns(round)}

    def correct(self, method=None):
        """
        Corrects the second round in a way there is the same number of voters.

        @param      method      some preprocess before going on (see below)

        About ``method``:

        - *'N'* --> correct the number of voters for each regions
        - *'cand'* --> gives the same weights to every candidates

        The dataframes are modified inplace. Any other value (including
        the default ``None``) raises ``NotImplementedError``.
        """
        if method == "N":
            if len(self.T0) != len(self.T1):
                raise Exception(
                    "unable to proceed because of different numbers of regions")
            cols0 = self._vote_columns(0)
            cols1 = self._vote_columns(1)
            second = self.T1
            # The scaled values are floats: the columns are converted first,
            # writing a float into an integer column is not reliable.
            for c in cols1:
                second[c] = second[c].astype(float)
            for i in range(len(self.T0)):
                s1 = self.T0.loc[i, cols0].sum()
                s2 = second.loc[i, cols1].sum()
                coef = 1.0 * s1 / s2
                for c in cols1:
                    second.loc[i, c] *= coef
        elif method == "cand":
            cols0 = self._vote_columns(0)
            sums = [self.T0[c].sum() for c in cols0]
            total = sum(sums)
            for c, s in zip(cols0, sums):
                self.T0[c] = self.T0[c] * total / s
            self.correct("N")
        else:
            # format() and not "+": method may be None (the default value).
            raise NotImplementedError("unknown method: {0}".format(method))

    def __str__(self):
        """usual"""
        return "Year: {0} T1: {1} T2: {2}".format(
            self.Year, len(self.tours[0]), len(self.tours[1]))

    def GetNbCandidates(self, round):
        """
        Returns the number of candidates.

        @param      round       round (0 or 1)
        @return                 number of candidates (number of columns minus 4)
        """
        return len(self.tours[round].columns) - 4

    @property
    def Year(self):
        """
        Returns the year.
        """
        return self.year

    @property
    def Level(self):
        """
        Returns the level (``département`` or ``canton``).
        """
        return self.level

    @property
    def LevelCol(self):
        """
        Returns the column associated to the level (their name depends on the level).
        """
        return ["Code du %s" % self.level, "Libellé du %s" % self.level]

    @property
    def T0(self):
        """
        Returns the dataframe for the first round.
        """
        return self.tours[0]

    @property
    def T1(self):
        """
        Returns the dataframe for the second round.
        """
        return self.tours[1]

    def process_tour(self, tour):
        """
        Keeps the interesting columns, move candidates name as column name.

        @param      tour        dataframe
        @return                 dataframe

        Rows whose ``Abstentions`` value is not a number are removed.
        For every column starting with ``Nom``, the value found two columns
        further becomes the value of a column named after the candidate.
        """
        keep = [isinstance(v, (float, int, numpy.int64, numpy.float64)) and
                not numpy.isnan(v)
                for v in tour["Abstentions"]]
        tour = tour.loc[keep, :]

        # Candidate names must be sortable, otherwise the sheet mixes types
        # (a missing name for example) and cannot be processed.
        names = [c for c in tour.columns if c.startswith("Nom")]
        unique = set()
        for n in names:
            unique.update(tour[n])
        unique = list(unique)
        try:
            unique.sort()
        except TypeError as e:
            raise Exception("unable to sort " + str(unique) +
                            "\ncolumns:{0}".format(",".join(tour.columns))) from e

        kept_as_is = {"Code du %s" % self.level, "Libellé du %s" % self.level,
                      "Abstentions", "Blancs et nuls"}
        columns = list(tour.columns)

        def proc(row):
            res = {}
            for i, v in enumerate(row):
                k = columns[i]
                if k in kept_as_is:
                    res[k] = v
                elif k.startswith("Nom"):
                    # v is the candidate name, the value to keep is stored
                    # two columns after the name.
                    res[v] = row[i + 2]
            # a row with an empty candidate name is dropped
            if any(len(key) == 0 for key in res):
                return None
            return res

        rows = [proc(r) for r in tour.values]
        rows = [r for r in rows if r is not None]
        return pandas.DataFrame(rows)

    def vote_transfer(self):
        """
        Computes the votes between the two rounds using
        constraints optimization, the optimization
        requires :epkg:`cvxopt`.

        See `Optimisation sous contraintes appliquée au calcul du report des voix <http://www.xavierdupre.fr/blog/2013-12-07_nojs.html>`_.

        @return                 results (as a DataFrame), one row per column
                                of the first round, one column per column of
                                the second round

        The coefficients minimize :math:`\\sum_i ||X \\beta_i - Y_i||^2`
        where *X* are the columns of the first round, *Y_i* a column of the
        second round. They are positive and the coefficients
        of a row sum to one.
        """
        rown = self._vote_columns(0)
        coln = self._vote_columns(1)
        X = numpy.asarray(self.tours[0][rown].values, dtype=float)
        Y = numpy.asarray(self.tours[1][coln].values, dtype=float)

        nbC = Y.shape[1]
        col = X.shape[1]
        nvar = col * nbC

        # The unknown is the vector (beta_0, ..., beta_{nbC-1}), each block
        # has ``col`` coefficients. cvxopt minimizes (1/2) x'Qx + p'x,
        # the least squares objective gives Q = 2 X'X for *every* block
        # and p = -2 X'Y_i for block i.
        Q = 2.0 * numpy.kron(numpy.eye(nbC), X.T @ X)
        p = (-2.0 * (X.T @ Y)).T.reshape(nvar, 1)

        # G, h: every coefficient is positive (-x <= 0)
        G = -numpy.eye(nvar)
        h = numpy.zeros((nvar, 1))

        # C, b: for a column of the first round, the coefficients sum to 1
        C = numpy.tile(numpy.eye(col), (1, nbC))
        b = numpy.ones((col, 1))

        # solving
        from cvxopt import solvers
        from cvxopt import matrix

        old = solvers.options.get("show_progress", True)
        solvers.options["show_progress"] = False
        try:
            sol = solvers.qp(matrix(Q), matrix(p), matrix(G), matrix(h),
                             matrix(C), matrix(b))
        finally:
            # the global option is restored even if the solver fails
            solvers.options["show_progress"] = old

        coef = numpy.array(sol['x'], dtype=float)
        # block i of the solution becomes column i of the result
        res = coef.reshape(nbC, col).T
        return pandas.DataFrame(data=res, index=rown, columns=coln)

    def resample(self, method="uniform"):
        """
        Builds a new sample: it produces a results with the same number of
        rows, but each rows is randomly drawn from the current data.
        This is needed for the bootstrap procedures.

        @param      method      ``weight`` or ``uniform``
        @return                 a new @see cl ElectionResults
                                (both rounds use the same drawn rows)

        Any value different from ``uniform`` draws a row with a probability
        given by @see me WeightsNorm.
        """
        if len(self.T0) != len(self.T1):
            raise Exception(
                "unable to proceeed, we need to draw the same regions, assuming both matrices are sorted in the same order")

        n = len(self.T0)
        if method == "uniform":
            h = [random.randint(0, n - 1) for _ in range(n)]
        else:
            weights = self.WeightsNorm

            def find_index(x):
                # first position whose cumulated weight exceeds x
                s = 0
                for i, w in enumerate(weights):
                    s += w
                    if x < s:
                        return i
                return len(weights) - 1

            h = [find_index(random.random()) for _ in range(n)]

        # The drawn values are positions (the weights follow the order of
        # the rows): rows are selected by position, not by index label,
        # the labels are shuffled by the sort done in the constructor.
        return ElectionResults([self.T0.iloc[h, :], self.T1.iloc[h, :]],
                               year=self.year, level=self.level)

    def get_people(self, round=0):
        """
        Returns the number of people per regions.

        @param      round       first (0) or second (1) round
        @return                 series (sum of every column which is
                                not a level column)
        """
        cols = self._vote_columns(round)
        # skipna=False: a missing value makes the sum missing as well
        return self.tours[round][cols].sum(axis=1, skipna=False)

    @property
    def WeightsNorm(self):
        """
        Returns the proportion of voters for each regions.
        The list is computed once from the first round and cached.
        """
        if "weightsnorm" not in self.__dict__:
            people = list(self.get_people())
            s = sum(people)
            self.weightsnorm = [_ * 1.0 / s for _ in people]
        return self.weightsnorm

    @staticmethod
    def min_max_mean_std(series, alpha=0.05):
        """
        Returns the mean, the standard deviation,
        the bounds of the confidence interval.

        @param      series      list of numbers
        @param      alpha       confidence level
        @return                 mean, std, low, high

        The bounds are the values at rank ``int(n * alpha / 2)`` from each
        end of the sorted series. An empty series raises ``ValueError``.
        """
        ordered = sorted(series)
        n = len(ordered)
        if n == 0:
            raise ValueError("series is empty")
        a = int(n * alpha / 2)
        low, high = ordered[a], ordered[-a - 1]
        mean = sum(ordered) / n
        var = sum((x - mean) ** 2 for x in ordered) / n
        return mean, var ** 0.5, low, high

    def bootstrap(self, iter=1000, method="vote_transfer", alpha=0.05, fLOG=None, **params):
        """
        Uses the bootstrap method to compute confidence intervals
        see `bootstrap <http://fr.wikipedia.org/wiki/Bootstrap_%28statistiques%29>`_.

        @param      iter        number of iteration
        @param      method      method to bootstrap
        @param      alpha       confidence level
        @param      fLOG        logging function or none
        @param      params      parameters to give to ``method``
        @return                 four matrices, averaged results, sigma, lower bound, higher bound

        Only ``vote_transfer`` is implemented, any other method raises
        ``NotImplementedError``. *iter* must be at least 1.
        """
        # Checked first: no need to resample if the method is unknown.
        if method != "vote_transfer":
            raise NotImplementedError()
        if iter < 1:
            raise ValueError("iter must be >= 1")
        if fLOG is None:
            def fLOG(*args):
                return ""

        fLOG("sampling", iter)
        samples = [self.resample() for _ in range(iter)]
        matrices = [s.vote_transfer(**params) for s in samples]

        # the first matrix gives the shape, the index and the columns
        mean = matrices[0].copy()
        std = matrices[0].copy()
        low = matrices[0].copy()
        high = matrices[0].copy()

        shape = mean.shape
        fLOG("level for each coefficient", shape)
        for i in range(shape[0]):
            for j in range(shape[1]):
                series = [m.iloc[i, j] for m in matrices]
                xmean, xstd, xlow, xhigh = ElectionResults.min_max_mean_std(
                    series, alpha=alpha)
                mean.iloc[i, j] = xmean
                std.iloc[i, j] = xstd
                low.iloc[i, j] = xlow
                high.iloc[i, j] = xhigh
        return mean, std, low, high

    @staticmethod
    def combine_into_string(matrices, float_format=str, agg_format=str):
        """
        Combines several matrices into one before displaying it.

        @param      matrices        list of matrices (same dimension)
        @param      float_format    to format each float of all matrices
        @param      agg_format      to build the aggregated string, it receives
                                    the list of formatted values of a cell
        @return                     matrixes (dataframe), same index and
                                    columns as the first matrix

        Example:

        ::

            def pour(x) :
                if x < 0.01 : return ""
                else : return "%2.0f" % (x*100) + "%"

            boot = el.bootstrap(iter=10)
            comb = el.combine_into_string( [boot[2],boot[3]], pour, lambda v : "%s-%s" % tuple(v))
        """
        first = matrices[0]
        nrow, ncol = first.shape
        res = [[agg_format([float_format(m.iloc[i, j]) for m in matrices])
                for j in range(ncol)]
               for i in range(nrow)]
        return pandas.DataFrame(data=res, columns=list(first.columns),
                                index=list(first.index))