Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf-8 -*-
2"""
3@file
4@brief Various function to download data about population
5"""
6import gzip
7import os
8import numpy
9import pandas
10from pyquickhelper.loghelper import noLOG
11from pyensae.datasource import download_data
12from .data_exceptions import DataFormatException
15def population_france_year(url="https://www.insee.fr/fr/statistiques/fichier/1892086/pop-totale-france.xls",
16 sheet_name=0, year=2020):
17 """
18 Downloads the data for the French population from INSEE website
20 @param url url
21 @param sheet_name sheet index
22 @param year last year to find
23 @return DataFrame
25 The sheet index is 0 for the all France, 1 for metropolitean France.
26 The last row aggregates multiple ages ``1914 ou avant``, they will remain
27 aggregated but the label will be changed to 1914. ``100 ou plus`` is replaced by 100.
29 By default, the data is coming from `INSEE, Bilan Démographique <https://www.insee.fr/fr/statistiques/1892086?sommaire=1912926>`_.
31 **2017/01**: pandas does not seem to be able to read the format (old format).
32 You should convert the file in txt with Excel.
33 """
34 try:
35 df = pandas.read_excel(url, sheet_name=sheet_name)
36 skiprows = 5
37 except Exception as e: # pragma: no cover
38 # we try to find a local version
39 this = os.path.dirname(__file__)
40 name = os.path.join(this, "data_population", url.split(
41 "/")[-1].replace(".xls", ".xlsx"))
42 if not os.path.exists(name):
43 raise FileNotFoundError(
44 "Unable to find a replacement for '{0}' as '{1}'".format(url, name)) from e
45 df = pandas.read_excel(name, sheet_name=sheet_name)
46 url = name
47 skiprows = 0
48 col = df.columns[0]
49 if len(col) == 0:
50 raise DataFormatException( # pragma: no cover
51 "Unable to find {0} (year) in table at url '{1}'".format(year, url))
52 if skiprows > 0 and str(year) not in col:
53 raise DataFormatException( # pragma: no cover
54 "Unable to find {0} (year) in first column name '{1}' at url "
55 "'{2}'".format(year, col, url))
57 table = pandas.read_excel(url, sheet_name=sheet_name, skiprows=skiprows)
58 table.columns = ["naissance", "age", "hommes", "femmes", "ensemble"]
59 table = table[(table.naissance != 'Champ : France y c. Mayotte.') &
60 table.naissance.apply(lambda s: "Source" not in str(s))].copy()
61 table["naissance"] = table.apply(lambda r: r["naissance"] if isinstance(r["naissance"], (int, float)) else
62 r["naissance"].replace(" ou avant", ""), axis=1)
63 table["age"] = table.apply(lambda r: r["age"] if isinstance(r["age"], (int, float)) else
64 r["age"].replace(" ou plus", "") if isinstance(
65 r["age"], str) else r["age"],
66 axis=1)
67 table = table.dropna(axis=0)
68 for c in table.columns:
69 table[c] = table[c].astype(int)
70 return table
73def table_mortalite_france_00_02(homme=None, femme=None):
74 """
75 Download mortality table for France assuming they
76 are available in Excel format.
78 @param homme table for men
79 @param femme table for women
80 @return DataFrame
82 The final DataFrame merges both sheets.
83 The data is coming from
84 `Institut des Actuaires: Reférences de mortalité <http://www.institutdesactuaires.com/gene/main.php?base=2127>`_ or
85 `Références techniques <http://www.ressources-actuarielles.net/EXT/ISFA/fp-isfa.nsf/
86 34a14c286dfb0903c1256ffd00502d73/d62719e329025b94c12577c100545bb7?OpenDocument>`_.
87 """
88 this = os.path.join(os.path.abspath(
89 os.path.dirname(__file__)), "data_population")
90 if homme is None:
91 homme = os.path.join(this, "TH00-02_D.xls")
92 sheeth = "Table"
93 else:
94 sheeth = 0
95 if femme is None:
96 femme = os.path.join(this, "TF00-02_D.xls")
97 sheetf = "Table"
98 else:
99 sheetf = 0
100 isexch = os.path.splitext(homme)[-1] in (".xls", ".xlsx")
101 dfh = pandas.read_excel(
102 homme, sheet_name=sheeth) if isexch else pandas.read_csv(homme, sep=";")
103 if dfh.shape[1] > 2:
104 dfh = dfh[dfh.columns[:2]]
105 isexcf = os.path.splitext(femme)[-1] in (".xls", ".xlsx")
106 dff = pandas.read_excel(
107 femme, sheet_name=sheetf) if isexcf else pandas.read_csv(femme, sep=";")
108 if dff.shape[1] > 2:
109 dff = dff[dff.columns[:2]]
110 df = dfh.merge(dff, on="Age")
111 df.columns = ["Age", "Homme", "Femme"]
112 return df.dropna().reset_index(drop=True)
115def fecondite_france(url=None):
116 """
117 download fecondity table for France (Excel format)
119 @param url source (url or file)
120 @return DataFrame
122 By default, the data is coming from a local file
123 which is a copy of
124 `INSEE: Fécondité selon l'âge détaillé de la mère <https://www.insee.fr/fr/statistiques/2045366?sommaire=2045470&q=fecondite>`_.
125 The original file cannot be read by pandas so we convert it first.
126 See also `INSEE Bilan Démographique 2016 <https://www.insee.fr/fr/statistiques/1892259?sommaire=1912926>`_.
127 """
128 if url is None:
129 this = os.path.abspath(os.path.dirname(__file__))
130 url = os.path.join(this, "data_population", "irsocsd2014_G10.xlsx")
131 df = pandas.read_excel(url)
132 col = df.columns[0]
133 df[col] = df.apply(lambda r: r[col] if isinstance(r[col], int) else
134 r[col].replace(" ou plus", "").replace(" ans", "").replace(" (p)", "") if isinstance(
135 r[col], str) else r[col],
136 axis=1)
137 df = df[df[col].apply(lambda x: "0" <= x[0] <= "9" if isinstance(
138 x, str) else (isinstance(x, (int, float))))].copy()
139 df[col] = df[col].astype(float)
140 cp = df[df[col] == 15]
141 if len(cp) == 0:
142 ages = [str(_) for _ in set(df[col])] # pragma: no cover
143 raise DataFormatException( # pragma: no cover
144 "Unable to find 15 (age) in table at url: {0}\n{1}".format(
145 url, "\n".join(ages)))
146 if len(cp) != 1:
147 raise DataFormatException( # pragma: no cover
148 "too many values 15 in table at url: " + url)
149 cpe = df[df[col] == 50]
150 if len(cpe) == 0:
151 raise DataFormatException( # pragma: no cover
152 "Unable to find 50 (age) in table at url: " + url)
153 if len(cpe) != 1:
154 raise DataFormatException( # pragma: no cover
155 "Too many values 50 in table at url: " + url)
156 ind = cp.index[0]
157 ind2 = cpe.index[0]
158 table = df.iloc[ind:ind2, :3].copy()
159 table.columns = ["age", "2005", "2015"]
160 for c in table.columns:
161 table[c] = table[c].astype(float)
162 return table
165def table_mortalite_euro_stat(url="http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file=data/",
166 name="demo_mlifetable.tsv.gz", final_name="mortalite.txt",
167 whereTo=".", stop_at=None, fLOG=noLOG):
168 """
169 This function retrieves mortality table from `EuroStat <http://ec.europa.eu/eurostat/fr>`_ through
170 `table de mortalité <http://www.data-publica.com/opendata/7098--population-et-conditions-sociales-table-de-mortalite-de-1960-a-2010>`_
171 (*this link is currently broken, data-publica does not provide such a database anymore, a copy is provided*).
173 @param url data source
174 @param name data table name
175 @param final_name the data is compressed, it needs to be uncompressed
176 into a file, this parameter defines its name
177 @param whereTo data needs to be downloaded, location of this place
178 @param stop_at the overall process is quite long, if not None,
179 it only keeps the first rows
180 @param fLOG logging function
181 @return data_frame
183 The function checks the file final_name exists.
184 If it is the case, the data is not downloaded twice.
186 The header contains a weird format as coordinates are separated by a comma::
188 indic_de,sex,age,geo\time 2013 2012 2011 2010 2009
190 We need to preprocess the data to split this information into columns.
191 The overall process takes 4-5 minutes, 10 seconds to download (< 10 Mb),
192 4-5 minutes to preprocess the data (it could be improved). The processed data
193 contains the following columns::
195 ['annee', 'valeur', 'age', 'age_num', 'indicateur', 'genre', 'pays']
197 Columns *age* and *age_num* look alike. *age_num* is numeric and is equal
198 to *age* except when *age_num* is 85. Everybody above that age fall
199 into the same category. The table contains many indicators:
201 * PROBSURV: Probabilité de survie entre deux âges exacts (px)
202 * LIFEXP: Esperance de vie à l'âge exact (ex)
203 * SURVIVORS: Nombre des survivants à l'âge exact (lx)
204 * PYLIVED: Nombre d'années personnes vécues entre deux âges exacts (Lx)
205 * DEATHRATE: Taux de mortalité à l'âge x (Mx)
206 * PROBDEATH: Probabilité de décès entre deux âges exacts (qx)
207 * TOTPYLIVED: Nombre total d'années personne vécues après l'âge exact (Tx)
208 """
209 if os.path.exists(final_name) and os.stat(final_name).st_size > 1e7:
210 return final_name
212 temp = final_name + ".remove.txt"
213 if not os.path.exists(temp) or os.stat(temp).st_size < 1e7:
214 local = download_data(name, url=url, whereTo=whereTo)
215 local = local[0] + ".gz"
216 with gzip.open(local, 'rb') as f:
217 file_content = f.read()
218 content = str(file_content, encoding="utf8")
219 with open(temp, "w", encoding="utf8") as f:
220 f.write(content)
222 def format_age(s):
223 if s.startswith("Y_"):
224 if s.startswith("Y_LT"): # pragma: no cover
225 return "YLT" + s[4:]
226 if s.startswith("Y_GE"): # pragma: no cover
227 return "YGE" + s[4:]
228 raise SyntaxError(s) # pragma: no cover
229 i = int(s.strip("Y"))
230 return "Y%02d" % i
232 def format_age_num(s):
233 if s.startswith("Y_"):
234 if s.startswith("Y_LT"): # pragma: no cover
235 return float(s.replace("Y_LT", ""))
236 if s.startswith("Y_GE"): # pragma: no cover
237 return float(s.replace("Y_GE", ""))
238 raise SyntaxError(s) # pragma: no cover
239 i = int(s.strip("Y"))
240 return float(i)
242 def format_value(s):
243 if s.strip() == ":":
244 return numpy.nan
245 return float(s.strip(" ebp"))
247 fLOG("step 0, reading")
248 dff = pandas.read_csv(temp, sep="\t", encoding="utf8")
250 if stop_at is not None:
251 fLOG("step 0, shortening")
252 dfsmall = dff.head(n=stop_at)
253 df = dfsmall
254 else:
255 df = dff
257 fLOG("step 1, size=", df.shape)
258 dfi = df.reset_index().set_index("indic_de,sex,age,geo\\time")
259 dfi = dfi.drop('index', axis=1)
260 dfs = dfi.stack()
261 dfs = pandas.DataFrame({"valeur": dfs})
263 fLOG("step 2, size=", dfs.shape)
264 dfs["valeur"] = dfs["valeur"].astype(str)
265 dfs["valeur"] = dfs["valeur"].apply(format_value)
266 dfs = dfs[dfs.valeur >= 0].copy()
267 dfs = dfs.reset_index()
268 dfs.columns = ["index", "annee", "valeur"]
270 fLOG("step 3, size=", dfs.shape)
271 dfs["age"] = dfs["index"].apply(lambda i: format_age(i.split(",")[2]))
272 dfs["age_num"] = dfs["index"].apply(
273 lambda i: format_age_num(i.split(",")[2]))
274 dfs["indicateur"] = dfs["index"].apply(lambda i: i.split(",")[0])
275 dfs["genre"] = dfs["index"].apply(lambda i: i.split(",")[1])
276 dfs["pays"] = dfs["index"].apply(lambda i: i.split(",")[3])
278 fLOG("step 4")
279 dfy = dfs.drop('index', axis=1)
280 dfy.to_csv(final_name, sep="\t", encoding="utf8", index=False)
281 return final_name