Coverage for src/actuariat_python/data/population.py: 97%

Hot-keys on this page

r m x p toggle line displays

j k next/prev highlighted chunk

0 (zero) top of page

1 (one) first highlighted chunk

1# -*- coding: utf-8 -*-

2"""

3@file

4@brief Various function to download data about population

5"""

6import gzip

7import os

8import numpy

9import pandas

10from pyquickhelper.loghelper import noLOG

11from pyensae.datasource import download_data

12from .data_exceptions import DataFormatException

15def population_france_year(url="https://www.insee.fr/fr/statistiques/fichier/1892086/pop-totale-france.xls",

16 sheet_name=0, year=2020):

17 """

18 Downloads the data for the French population from INSEE website

20 @param url url

21 @param sheet_name sheet index

22 @param year last year to find

23 @return DataFrame

25 The sheet index is 0 for the all France, 1 for metropolitean France.

26 The last row aggregates multiple ages ``1914 ou avant``, they will remain

27 aggregated but the label will be changed to 1914. ``100 ou plus`` is replaced by 100.

29 By default, the data is coming from `INSEE, Bilan Démographique <https://www.insee.fr/fr/statistiques/1892086?sommaire=1912926>`_.

31 **2017/01**: pandas does not seem to be able to read the format (old format).

32 You should convert the file in txt with Excel.

33 """

34 try:

35 df = pandas.read_excel(url, sheet_name=sheet_name)

36 skiprows = 5

37 except Exception as e: # pragma: no cover

38 # we try to find a local version

39 this = os.path.dirname(__file__)

40 name = os.path.join(this, "data_population", url.split(

41 "/")[-1].replace(".xls", ".xlsx"))

42 if not os.path.exists(name):

43 raise FileNotFoundError(

44 "Unable to find a replacement for '{0}' as '{1}'".format(url, name)) from e

45 df = pandas.read_excel(name, sheet_name=sheet_name)

46 url = name

47 skiprows = 0

48 col = df.columns[0]

49 if len(col) == 0:

50 raise DataFormatException( # pragma: no cover

51 "Unable to find {0} (year) in table at url '{1}'".format(year, url))

52 if skiprows > 0 and str(year) not in col:

53 raise DataFormatException( # pragma: no cover

54 "Unable to find {0} (year) in first column name '{1}' at url "

55 "'{2}'".format(year, col, url))

57 table = pandas.read_excel(url, sheet_name=sheet_name, skiprows=skiprows)

58 table.columns = ["naissance", "age", "hommes", "femmes", "ensemble"]

59 table = table[(table.naissance != 'Champ : France y c. Mayotte.') &

60 table.naissance.apply(lambda s: "Source" not in str(s))].copy()

61 table["naissance"] = table.apply(lambda r: r["naissance"] if isinstance(r["naissance"], (int, float)) else

62 r["naissance"].replace(" ou avant", ""), axis=1)

63 table["age"] = table.apply(lambda r: r["age"] if isinstance(r["age"], (int, float)) else

64 r["age"].replace(" ou plus", "") if isinstance(

65 r["age"], str) else r["age"],

66 axis=1)

67 table = table.dropna(axis=0)

68 for c in table.columns:

69 table[c] = table[c].astype(int)

70 return table

73def table_mortalite_france_00_02(homme=None, femme=None):

74 """

75 Download mortality table for France assuming they

76 are available in Excel format.

78 @param homme table for men

79 @param femme table for women

80 @return DataFrame

82 The final DataFrame merges both sheets.

83 The data is coming from

84 `Institut des Actuaires: Reférences de mortalité <http://www.institutdesactuaires.com/gene/main.php?base=2127>`_ or

85 `Références techniques <http://www.ressources-actuarielles.net/EXT/ISFA/fp-isfa.nsf/

86 34a14c286dfb0903c1256ffd00502d73/d62719e329025b94c12577c100545bb7?OpenDocument>`_.

87 """

88 this = os.path.join(os.path.abspath(

89 os.path.dirname(__file__)), "data_population")

90 if homme is None:

91 homme = os.path.join(this, "TH00-02_D.xls")

92 sheeth = "Table"

93 else:

94 sheeth = 0

95 if femme is None:

96 femme = os.path.join(this, "TF00-02_D.xls")

97 sheetf = "Table"

98 else:

99 sheetf = 0

100 isexch = os.path.splitext(homme)[-1] in (".xls", ".xlsx")

101 dfh = pandas.read_excel(

102 homme, sheet_name=sheeth) if isexch else pandas.read_csv(homme, sep=";")

103 if dfh.shape[1] > 2:

104 dfh = dfh[dfh.columns[:2]]

105 isexcf = os.path.splitext(femme)[-1] in (".xls", ".xlsx")

106 dff = pandas.read_excel(

107 femme, sheet_name=sheetf) if isexcf else pandas.read_csv(femme, sep=";")

108 if dff.shape[1] > 2:

109 dff = dff[dff.columns[:2]]

110 df = dfh.merge(dff, on="Age")

111 df.columns = ["Age", "Homme", "Femme"]

112 return df.dropna().reset_index(drop=True)

113

114

115def fecondite_france(url=None):

116 """

117 download fecondity table for France (Excel format)

118

119 @param url source (url or file)

120 @return DataFrame

121

122 By default, the data is coming from a local file

123 which is a copy of

124 `INSEE: Fécondité selon l'âge détaillé de la mère <https://www.insee.fr/fr/statistiques/2045366?sommaire=2045470&q=fecondite>`_.

125 The original file cannot be read by pandas so we convert it first.

126 See also `INSEE Bilan Démographique 2016 <https://www.insee.fr/fr/statistiques/1892259?sommaire=1912926>`_.

127 """

128 if url is None:

129 this = os.path.abspath(os.path.dirname(__file__))

130 url = os.path.join(this, "data_population", "irsocsd2014_G10.xlsx")

131 df = pandas.read_excel(url)

132 col = df.columns[0]

133 df[col] = df.apply(lambda r: r[col] if isinstance(r[col], int) else

134 r[col].replace(" ou plus", "").replace(" ans", "").replace(" (p)", "") if isinstance(

135 r[col], str) else r[col],

136 axis=1)

137 df = df[df[col].apply(lambda x: "0" <= x[0] <= "9" if isinstance(

138 x, str) else (isinstance(x, (int, float))))].copy()

139 df[col] = df[col].astype(float)

140 cp = df[df[col] == 15]

141 if len(cp) == 0:

142 ages = [str(_) for _ in set(df[col])] # pragma: no cover

143 raise DataFormatException( # pragma: no cover

144 "Unable to find 15 (age) in table at url: {0}\n{1}".format(

145 url, "\n".join(ages)))

146 if len(cp) != 1:

147 raise DataFormatException( # pragma: no cover

148 "too many values 15 in table at url: " + url)

149 cpe = df[df[col] == 50]

150 if len(cpe) == 0:

151 raise DataFormatException( # pragma: no cover

152 "Unable to find 50 (age) in table at url: " + url)

153 if len(cpe) != 1:

154 raise DataFormatException( # pragma: no cover

155 "Too many values 50 in table at url: " + url)

156 ind = cp.index[0]

157 ind2 = cpe.index[0]

158 table = df.iloc[ind:ind2, :3].copy()

159 table.columns = ["age", "2005", "2015"]

160 for c in table.columns:

161 table[c] = table[c].astype(float)

162 return table

163

164

165def table_mortalite_euro_stat(url="http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/BulkDownloadListing?file=data/",

166 name="demo_mlifetable.tsv.gz", final_name="mortalite.txt",

167 whereTo=".", stop_at=None, fLOG=noLOG):

168 """

169 This function retrieves mortality table from `EuroStat <http://ec.europa.eu/eurostat/fr>`_ through

170 `table de mortalité <http://www.data-publica.com/opendata/7098--population-et-conditions-sociales-table-de-mortalite-de-1960-a-2010>`_

171 (*this link is currently broken, data-publica does not provide such a database anymore, a copy is provided*).

172

173 @param url data source

174 @param name data table name

175 @param final_name the data is compressed, it needs to be uncompressed

176 into a file, this parameter defines its name

177 @param whereTo data needs to be downloaded, location of this place

178 @param stop_at the overall process is quite long, if not None,

179 it only keeps the first rows

180 @param fLOG logging function

181 @return data_frame

182

183 The function checks the file final_name exists.

184 If it is the case, the data is not downloaded twice.

185

186 The header contains a weird format as coordinates are separated by a comma::

187

188 indic_de,sex,age,geo\time 2013 2012 2011 2010 2009

189

190 We need to preprocess the data to split this information into columns.

191 The overall process takes 4-5 minutes, 10 seconds to download (< 10 Mb),

192 4-5 minutes to preprocess the data (it could be improved). The processed data

193 contains the following columns::

194

195 ['annee', 'valeur', 'age', 'age_num', 'indicateur', 'genre', 'pays']

196

197 Columns *age* and *age_num* look alike. *age_num* is numeric and is equal

198 to *age* except when *age_num* is 85. Everybody above that age fall

199 into the same category. The table contains many indicators:

200

201 * PROBSURV: Probabilité de survie entre deux âges exacts (px)

202 * LIFEXP: Esperance de vie à l'âge exact (ex)

203 * SURVIVORS: Nombre des survivants à l'âge exact (lx)

204 * PYLIVED: Nombre d'années personnes vécues entre deux âges exacts (Lx)

205 * DEATHRATE: Taux de mortalité à l'âge x (Mx)

206 * PROBDEATH: Probabilité de décès entre deux âges exacts (qx)

207 * TOTPYLIVED: Nombre total d'années personne vécues après l'âge exact (Tx)

208 """

209 if os.path.exists(final_name) and os.stat(final_name).st_size > 1e7:

210 return final_name

211

212 temp = final_name + ".remove.txt"

213 if not os.path.exists(temp) or os.stat(temp).st_size < 1e7:

214 local = download_data(name, url=url, whereTo=whereTo)

215 local = local[0] + ".gz"

216 with gzip.open(local, 'rb') as f:

217 file_content = f.read()

218 content = str(file_content, encoding="utf8")

219 with open(temp, "w", encoding="utf8") as f:

220 f.write(content)

221

222 def format_age(s):

223 if s.startswith("Y_"):

224 if s.startswith("Y_LT"): # pragma: no cover

225 return "YLT" + s[4:]

226 if s.startswith("Y_GE"): # pragma: no cover

227 return "YGE" + s[4:]

228 raise SyntaxError(s) # pragma: no cover

229 i = int(s.strip("Y"))

230 return "Y%02d" % i

231

232 def format_age_num(s):

233 if s.startswith("Y_"):

234 if s.startswith("Y_LT"): # pragma: no cover

235 return float(s.replace("Y_LT", ""))

236 if s.startswith("Y_GE"): # pragma: no cover

237 return float(s.replace("Y_GE", ""))

238 raise SyntaxError(s) # pragma: no cover

239 i = int(s.strip("Y"))

240 return float(i)

241

242 def format_value(s):

243 if s.strip() == ":":

244 return numpy.nan

245 return float(s.strip(" ebp"))

246

247 fLOG("step 0, reading")

248 dff = pandas.read_csv(temp, sep="\t", encoding="utf8")

249

250 if stop_at is not None:

251 fLOG("step 0, shortening")

252 dfsmall = dff.head(n=stop_at)

253 df = dfsmall

254 else:

255 df = dff

256

257 fLOG("step 1, size=", df.shape)

258 dfi = df.reset_index().set_index("indic_de,sex,age,geo\\time")

259 dfi = dfi.drop('index', axis=1)

260 dfs = dfi.stack()

261 dfs = pandas.DataFrame({"valeur": dfs})

262

263 fLOG("step 2, size=", dfs.shape)

264 dfs["valeur"] = dfs["valeur"].astype(str)

265 dfs["valeur"] = dfs["valeur"].apply(format_value)

266 dfs = dfs[dfs.valeur >= 0].copy()

267 dfs = dfs.reset_index()

268 dfs.columns = ["index", "annee", "valeur"]

269

270 fLOG("step 3, size=", dfs.shape)

271 dfs["age"] = dfs["index"].apply(lambda i: format_age(i.split(",")[2]))

272 dfs["age_num"] = dfs["index"].apply(

273 lambda i: format_age_num(i.split(",")[2]))

274 dfs["indicateur"] = dfs["index"].apply(lambda i: i.split(",")[0])

275 dfs["genre"] = dfs["index"].apply(lambda i: i.split(",")[1])

276 dfs["pays"] = dfs["index"].apply(lambda i: i.split(",")[3])

277

278 fLOG("step 4")

279 dfy = dfs.drop('index', axis=1)

280 dfy.to_csv(final_name, sep="\t", encoding="utf8", index=False)

281 return final_name

Coverage for src/actuariat_python/data/population.py : 97%

119 statements

Coverage for src/actuariat_python/data/population.py : 97%

119 statements 115 run 4 missing 25 excluded

119 statements