Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief Various function to download data about population 

5""" 

6import gzip 

7import os 

8import numpy 

9import pandas 

10from pyquickhelper.loghelper import noLOG 

11from pyensae.datasource import download_data 

12from .data_exceptions import DataFormatException 

13 

14 

15def population_france_year(url="https://www.insee.fr/fr/statistiques/fichier/1892086/pop-totale-france.xls", 

16 sheet_name=0, year=2020): 

17 """ 

18 Downloads the data for the French population from INSEE website 

19 

20 @param url url 

21 @param sheet_name sheet index 

22 @param year last year to find 

23 @return DataFrame 

24 

25 The sheet index is 0 for the all France, 1 for metropolitean France. 

26 The last row aggregates multiple ages ``1914 ou avant``, they will remain 

27 aggregated but the label will be changed to 1914. ``100 ou plus`` is replaced by 100. 

28 

29 By default, the data is coming from `INSEE, Bilan Démographique <https://www.insee.fr/fr/statistiques/1892086?sommaire=1912926>`_. 

30 

31 **2017/01**: pandas does not seem to be able to read the format (old format). 

32 You should convert the file in txt with Excel. 

33 """ 

34 try: 

35 df = pandas.read_excel(url, sheet_name=sheet_name) 

36 skiprows = 5 

37 except Exception as e: # pragma: no cover 

38 # we try to find a local version 

39 this = os.path.dirname(__file__) 

40 name = os.path.join(this, "data_population", url.split( 

41 "/")[-1].replace(".xls", ".xlsx")) 

42 if not os.path.exists(name): 

43 raise FileNotFoundError( 

44 "Unable to find a replacement for '{0}' as '{1}'".format(url, name)) from e 

45 df = pandas.read_excel(name, sheet_name=sheet_name) 

46 url = name 

47 skiprows = 0 

48 col = df.columns[0] 

49 if len(col) == 0: 

50 raise DataFormatException( # pragma: no cover 

51 "Unable to find {0} (year) in table at url '{1}'".format(year, url)) 

52 if skiprows > 0 and str(year) not in col: 

53 raise DataFormatException( # pragma: no cover 

54 "Unable to find {0} (year) in first column name '{1}' at url " 

55 "'{2}'".format(year, col, url)) 

56 

57 table = pandas.read_excel(url, sheet_name=sheet_name, skiprows=skiprows) 

58 table.columns = ["naissance", "age", "hommes", "femmes", "ensemble"] 

59 table = table[(table.naissance != 'Champ : France y c. Mayotte.') & 

60 table.naissance.apply(lambda s: "Source" not in str(s))].copy() 

61 table["naissance"] = table.apply(lambda r: r["naissance"] if isinstance(r["naissance"], (int, float)) else 

62 r["naissance"].replace(" ou avant", ""), axis=1) 

63 table["age"] = table.apply(lambda r: r["age"] if isinstance(r["age"], (int, float)) else 

64 r["age"].replace(" ou plus", "") if isinstance( 

65 r["age"], str) else r["age"], 

66 axis=1) 

67 table = table.dropna(axis=0) 

68 for c in table.columns: 

69 table[c] = table[c].astype(int) 

70 return table 

71 

72 

73def table_mortalite_france_00_02(homme=None, femme=None): 

74 """ 

75 Download mortality table for France assuming they 

76 are available in Excel format. 

77 

78 @param homme table for men 

79 @param femme table for women 

80 @return DataFrame 

81 

82 The final DataFrame merges both sheets. 

83 The data is coming from 

84 `Institut des Actuaires: Reférences de mortalité <http://www.institutdesactuaires.com/gene/main.php?base=2127>`_ or 

85 `Références techniques <http://www.ressources-actuarielles.net/EXT/ISFA/fp-isfa.nsf/ 

86 34a14c286dfb0903c1256ffd00502d73/d62719e329025b94c12577c100545bb7?OpenDocument>`_. 

87 """ 

88 this = os.path.join(os.path.abspath( 

89 os.path.dirname(__file__)), "data_population") 

90 if homme is None: 

91 homme = os.path.join(this, "TH00-02_D.xls") 

92 sheeth = "Table" 

93 else: 

94 sheeth = 0 

95 if femme is None: 

96 femme = os.path.join(this, "TF00-02_D.xls") 

97 sheetf = "Table" 

98 else: 

99 sheetf = 0 

100 isexch = os.path.splitext(homme)[-1] in (".xls", ".xlsx") 

101 dfh = pandas.read_excel( 

102 homme, sheet_name=sheeth) if isexch else pandas.read_csv(homme, sep=";") 

103 if dfh.shape[1] > 2: 

104 dfh = dfh[dfh.columns[:2]] 

105 isexcf = os.path.splitext(femme)[-1] in (".xls", ".xlsx") 

106 dff = pandas.read_excel( 

107 femme, sheet_name=sheetf) if isexcf else pandas.read_csv(femme, sep=";") 

108 if dff.shape[1] > 2: 

109 dff = dff[dff.columns[:2]] 

110 df = dfh.merge(dff, on="Age") 

111 df.columns = ["Age", "Homme", "Femme"] 

112 return df.dropna().reset_index(drop=True) 

113 

114 

115def fecondite_france(url=None): 

116 """ 

117 download fecondity table for France (Excel format) 

118 

119 @param url source (url or file) 

120 @return DataFrame 

121 

122 By default, the data is coming from a local file 

123 which is a copy of 

124 `INSEE: Fécondité selon l'âge détaillé de la mère <https://www.insee.fr/fr/statistiques/2045366?sommaire=2045470&q=fecondite>`_. 

125 The original file cannot be read by pandas so we convert it first. 

126 See also `INSEE Bilan Démographique 2016 <https://www.insee.fr/fr/statistiques/1892259?sommaire=1912926>`_. 

127 """ 

128 if url is None: 

129 this = os.path.abspath(os.path.dirname(__file__)) 

130 url = os.path.join(this, "data_population", "irsocsd2014_G10.xlsx") 

131 df = pandas.read_excel(url) 

132 col = df.columns[0] 

133 df[col] = df.apply(lambda r: r[col] if isinstance(r[col], int) else 

134 r[col].replace(" ou plus", "").replace(" ans", "").replace(" (p)", "") if isinstance( 

135 r[col], str) else r[col], 

136 axis=1) 

137 df = df[df[col].apply(lambda x: "0" <= x[0] <= "9" if isinstance( 

138 x, str) else (isinstance(x, (int, float))))].copy() 

139 df[col] = df[col].astype(float) 

140 cp = df[df[col] == 15] 

141 if len(cp) == 0: 

142 ages = [str(_) for _ in set(df[col])] # pragma: no cover 

143 raise DataFormatException( # pragma: no cover 

144 "Unable to find 15 (age) in table at url: {0}\n{1}".format( 

145 url, "\n".join(ages))) 

146 if len(cp) != 1: 

147 raise DataFormatException( # pragma: no cover 

148 "too many values 15 in table at url: " + url) 

149 cpe = df[df[col] == 50] 

150 if len(cpe) == 0: 

151 raise DataFormatException( # pragma: no cover 

152 "Unable to find 50 (age) in table at url: " + url) 

153 if len(cpe) != 1: 

154 raise DataFormatException( # pragma: no cover 

155 "Too many values 50 in table at url: " + url) 

156 ind = cp.index[0] 

157 ind2 = cpe.index[0] 

158 table = df.iloc[ind:ind2, :3].copy() 

159 table.columns = ["age", "2005", "2015"] 

160 for c in table.columns: 

161 table[c] = table[c].astype(float) 

162 return table 

163 

164 

165def table_mortalite_euro_stat(url="http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file=data/", 

166 name="demo_mlifetable.tsv.gz", final_name="mortalite.txt", 

167 whereTo=".", stop_at=None, fLOG=noLOG): 

168 """ 

169 This function retrieves mortality table from `EuroStat <http://ec.europa.eu/eurostat/fr>`_ through 

170 `table de mortalité <http://www.data-publica.com/opendata/7098--population-et-conditions-sociales-table-de-mortalite-de-1960-a-2010>`_ 

171 (*this link is currently broken, data-publica does not provide such a database anymore, a copy is provided*). 

172 

173 @param url data source 

174 @param name data table name 

175 @param final_name the data is compressed, it needs to be uncompressed 

176 into a file, this parameter defines its name 

177 @param whereTo data needs to be downloaded, location of this place 

178 @param stop_at the overall process is quite long, if not None, 

179 it only keeps the first rows 

180 @param fLOG logging function 

181 @return data_frame 

182 

183 The function checks the file final_name exists. 

184 If it is the case, the data is not downloaded twice. 

185 

186 The header contains a weird format as coordinates are separated by a comma:: 

187 

188 indic_de,sex,age,geo\time 2013 2012 2011 2010 2009 

189 

190 We need to preprocess the data to split this information into columns. 

191 The overall process takes 4-5 minutes, 10 seconds to download (< 10 Mb), 

192 4-5 minutes to preprocess the data (it could be improved). The processed data 

193 contains the following columns:: 

194 

195 ['annee', 'valeur', 'age', 'age_num', 'indicateur', 'genre', 'pays'] 

196 

197 Columns *age* and *age_num* look alike. *age_num* is numeric and is equal 

198 to *age* except when *age_num* is 85. Everybody above that age fall 

199 into the same category. The table contains many indicators: 

200 

201 * PROBSURV: Probabilité de survie entre deux âges exacts (px) 

202 * LIFEXP: Esperance de vie à l'âge exact (ex) 

203 * SURVIVORS: Nombre des survivants à l'âge exact (lx) 

204 * PYLIVED: Nombre d'années personnes vécues entre deux âges exacts (Lx) 

205 * DEATHRATE: Taux de mortalité à l'âge x (Mx) 

206 * PROBDEATH: Probabilité de décès entre deux âges exacts (qx) 

207 * TOTPYLIVED: Nombre total d'années personne vécues après l'âge exact (Tx) 

208 """ 

209 if os.path.exists(final_name) and os.stat(final_name).st_size > 1e7: 

210 return final_name 

211 

212 temp = final_name + ".remove.txt" 

213 if not os.path.exists(temp) or os.stat(temp).st_size < 1e7: 

214 local = download_data(name, url=url, whereTo=whereTo) 

215 local = local[0] + ".gz" 

216 with gzip.open(local, 'rb') as f: 

217 file_content = f.read() 

218 content = str(file_content, encoding="utf8") 

219 with open(temp, "w", encoding="utf8") as f: 

220 f.write(content) 

221 

222 def format_age(s): 

223 if s.startswith("Y_"): 

224 if s.startswith("Y_LT"): # pragma: no cover 

225 return "YLT" + s[4:] 

226 if s.startswith("Y_GE"): # pragma: no cover 

227 return "YGE" + s[4:] 

228 raise SyntaxError(s) # pragma: no cover 

229 i = int(s.strip("Y")) 

230 return "Y%02d" % i 

231 

232 def format_age_num(s): 

233 if s.startswith("Y_"): 

234 if s.startswith("Y_LT"): # pragma: no cover 

235 return float(s.replace("Y_LT", "")) 

236 if s.startswith("Y_GE"): # pragma: no cover 

237 return float(s.replace("Y_GE", "")) 

238 raise SyntaxError(s) # pragma: no cover 

239 i = int(s.strip("Y")) 

240 return float(i) 

241 

242 def format_value(s): 

243 if s.strip() == ":": 

244 return numpy.nan 

245 return float(s.strip(" ebp")) 

246 

247 fLOG("step 0, reading") 

248 dff = pandas.read_csv(temp, sep="\t", encoding="utf8") 

249 

250 if stop_at is not None: 

251 fLOG("step 0, shortening") 

252 dfsmall = dff.head(n=stop_at) 

253 df = dfsmall 

254 else: 

255 df = dff 

256 

257 fLOG("step 1, size=", df.shape) 

258 dfi = df.reset_index().set_index("indic_de,sex,age,geo\\time") 

259 dfi = dfi.drop('index', axis=1) 

260 dfs = dfi.stack() 

261 dfs = pandas.DataFrame({"valeur": dfs}) 

262 

263 fLOG("step 2, size=", dfs.shape) 

264 dfs["valeur"] = dfs["valeur"].astype(str) 

265 dfs["valeur"] = dfs["valeur"].apply(format_value) 

266 dfs = dfs[dfs.valeur >= 0].copy() 

267 dfs = dfs.reset_index() 

268 dfs.columns = ["index", "annee", "valeur"] 

269 

270 fLOG("step 3, size=", dfs.shape) 

271 dfs["age"] = dfs["index"].apply(lambda i: format_age(i.split(",")[2])) 

272 dfs["age_num"] = dfs["index"].apply( 

273 lambda i: format_age_num(i.split(",")[2])) 

274 dfs["indicateur"] = dfs["index"].apply(lambda i: i.split(",")[0]) 

275 dfs["genre"] = dfs["index"].apply(lambda i: i.split(",")[1]) 

276 dfs["pays"] = dfs["index"].apply(lambda i: i.split(",")[3]) 

277 

278 fLOG("step 4") 

279 dfy = dfs.drop('index', axis=1) 

280 dfy.to_csv(final_name, sep="\t", encoding="utf8", index=False) 

281 return final_name