Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief Jeux de données reliés aux données carroyées. 

5""" 

6import os 

7import zipfile 

8import collections 

9import datetime 

10import tempfile 

11from io import BytesIO 

12from dbfread.field_parser import FieldParser 

13from dbfread import DBF 

14from dbfread.dbf import expand_year 

15import pandas 

16from .data_helper import get_data_folder 

17 

18 

class DBFInMemory(DBF):
    """
    Overwrites `DBF <https://github.com/olemb/dbfread/blob/master/dbfread/dbf.py#L77>`_
    to read data from memory and not from a file. The object
    `DBF <http://dbfread.readthedocs.io/en/latest/dbf_objects.html>`_
    needs a file by default. This class avoids creating an intermediate
    file when the data is compressed in a :epkg:`zip` file.
    """

    def __init__(self, filename, encoding=None, ignorecase=True,
                 lowernames=False, parserclass=FieldParser,
                 recfactory=collections.OrderedDict,
                 load=False, raw=False, ignore_missing_memofile=False,
                 char_decode_errors='strict'):
        """
        @param      filename        either a path to a *.dbf* file (``str``)
                                    or the raw content of a *.dbf* file
                                    (``bytes``)
        @param      encoding        character encoding, see :epkg:`dbfread`
        @param      ignorecase      case-insensitive file lookup
        @param      lowernames      lower-case the field names
        @param      parserclass     class used to parse each field
        @param      recfactory      factory called on the list of
                                    ``(name, value)`` pairs of every record
        @param      load            load all records immediately
        @param      raw             return raw bytes instead of parsed values
        @param      ignore_missing_memofile
                                    do not fail if the memo file is missing
        @param      char_decode_errors
                                    decoding error policy (``'strict'``, ...)

        All parameters except *filename* mirror those of
        :class:`dbfread.DBF` and are forwarded unchanged.
        """
        if isinstance(filename, str):
            # Regular case: a real file name, delegate to dbfread.
            DBF.__init__(self, filename, encoding=encoding, ignorecase=ignorecase,
                         lowernames=lowernames, parserclass=parserclass,
                         recfactory=recfactory, load=load,
                         raw=raw, ignore_missing_memofile=ignore_missing_memofile,
                         char_decode_errors=char_decode_errors)
        else:
            # In-memory case: replicate what DBF.__init__ does but parse
            # the headers from a BytesIO wrapping the raw content instead
            # of opening a file on disk.
            self.encoding = encoding
            self.ignorecase = ignorecase
            self.lowernames = lowernames
            self.parserclass = parserclass
            self.raw = raw
            self.ignore_missing_memofile = ignore_missing_memofile
            self.char_decode_errors = char_decode_errors

            if recfactory is None:
                # Identity factory: keep the raw list of pairs.
                self.recfactory = lambda items: items
            else:
                self.recfactory = recfactory

            # No file on disk: keep the raw bytes in self.content
            # instead of a name/filename pair.
            self.name = None
            self.filename = None
            self.content = filename

            self._records = None
            self._deleted = None

            # Filled in by self._read_headers()
            self.memofilename = None
            self.header = None
            self.fields = []  # namedtuples
            self.field_names = []  # strings

            # Parse the DBF headers straight from memory.
            obj = BytesIO(filename)
            self._read_header(obj)
            self._read_field_headers(obj)
            self._check_headers()

            try:
                self.date = datetime.date(expand_year(self.header.year),
                                          self.header.month, self.header.day)
            except ValueError:
                # Invalid date or '\x00\x00\x00'.
                self.date = None

            self.memofilename = self._get_memofilename()

            if load:
                self.load()

    def _iter_records(self, record_type=b' '):
        # Same algorithm as dbfread.DBF._iter_records except that records
        # are read from the in-memory bytes (self.content) instead of
        # reopening the file.
        infile = BytesIO(self.content)
        with self._open_memofile() as memofile:

            # Skip to first record.
            infile.seek(self.header.headerlen, 0)

            if not self.raw:
                field_parser = self.parserclass(self, memofile)
                parse = field_parser.parse

            # Shortcuts for speed.
            skip_record = self._skip_record
            read = infile.read

            while True:
                # One leading byte per record tells its type
                # (b' ' = live record, b'*' = deleted, b'\x1a' = EOF).
                sep = read(1)

                if sep == record_type:
                    if self.raw:
                        items = [(field.name, read(field.length))
                                 for field in self.fields]
                    else:
                        items = [(field.name,
                                  parse(field, read(field.length)))
                                 for field in self.fields]

                    yield self.recfactory(items)

                elif sep in (b'\x1a', b''):
                    # End of records.
                    break
                else:
                    # Record of another type (e.g. deleted): skip it.
                    skip_record(infile)

118 

119 

def load_dbf_from_zip(filename):
    """
    Loads a *.dbf* file compressed into a zip file.
    Only the first *.dbf* entry of the archive is read.

    @param filename zip file
    @return dataframe
    """
    with zipfile.ZipFile(filename) as archive:
        dbf_entries = [entry.filename for entry in archive.infolist()
                       if entry.filename.endswith(".dbf")]
        if not dbf_entries:
            raise FileNotFoundError("No dbf file in '{0}'".format(filename))
        with archive.open(dbf_entries[0], "r") as handle:
            raw_bytes = handle.read()
        # Parse the dbf content directly from memory, no temporary file.
        records = list(DBFInMemory(raw_bytes))
        return pandas.DataFrame(records)

137 

138 

def _read_geopandas_from_bytes(mif, mid, **kwargs):
    """
    Returns a :epkg:`GeoDataFrame` from two sequences of bytes,
    one for file *.mif*, one from file *.mid*.
    Unfortunately, :epkg:`geopandas` does not read from
    a buffer, and :epkg:`fiona` does it after writing
    in a virtual file (not clear if it is a temporary file or not).

    @param mif content of the *.mif* file (bytes)
    @param mid content of the *.mid* file (bytes)
    @param kwargs forwarded to ``GeoDataFrame.from_file``
    @return :epkg:`GeoDataFrame`
    """
    # Delayed import because the import fails sometimes
    # on Windows.
    from geopandas import GeoDataFrame

    # geopandas/fiona only read from real files: dump both buffers
    # into a pair of sibling temporary files sharing the same stem.
    with tempfile.NamedTemporaryFile(mode='w+b', delete=False, suffix='.mif') as temp:
        temp.write(mif)
        name_mif = temp.name
    # The handle is closed here so fiona can reopen the file (Windows
    # forbids reopening a file that is still held open for writing).
    name_mid = name_mif.replace(".mif", ".mid")
    try:
        with open(name_mid, "wb") as f:
            f.write(mid)
        gdf = GeoDataFrame.from_file(name_mid, **kwargs)
    finally:
        # Always remove the temporary files, even when from_file raises,
        # otherwise they would leak (NamedTemporaryFile uses delete=False).
        for name in (name_mid, name_mif):
            if os.path.exists(name):
                os.remove(name)
    return gdf

163 

164 

def load_shapes_from_zip(filename):
    """
    Loads a *.mif* and a *.mid* file compressed into a zip file.
    Only the first *.mid* and *.mif* entries of the archive are read.

    @param filename zip file
    @return dataframe
    """
    with zipfile.ZipFile(filename) as archive:
        entries = archive.infolist()

        mif_entries = [entry.filename for entry in entries
                       if entry.filename.endswith(".mif")]
        if not mif_entries:
            raise FileNotFoundError("No mif file in '{0}'".format(filename))
        with archive.open(mif_entries[0], "r") as handle:
            mif_bytes = handle.read()

        mid_entries = [entry.filename for entry in entries
                       if entry.filename.endswith(".mid")]
        if not mid_entries:
            raise FileNotFoundError("No mid file in '{0}'".format(filename))
        with archive.open(mid_entries[0], "r") as handle:
            mid_bytes = handle.read()

    # Both buffers are turned into a GeoDataFrame through temporary files.
    return _read_geopandas_from_bytes(mif_bytes, mid_bytes)

188 

189 

def load_carreau_from_zip(file_car=None, file_rect=None):
    """
    Returns a sample of gridded data (*données carroyées*).
    The data files are available in folder
    `data <https://github.com/sdpython/papierstat/tree/master/papierstat/datasets/data>`_.
    Notebooks using this dataset:

    .. runpython::
        :rst:

        from papierstat.datasets.documentation import list_notebooks_rst_links
        links = list_notebooks_rst_links('visualisation', 'carte_carreau')
        links = [' * %s' % s for s in links]
        print('\\n'.join(links))

    @param file_car the squares (carreaux)
    @param file_rect the data
    @return 4 dataframes

    Results:

    * population data per square
    * shapefiles of the squares
    * population data per rectangle
    * shapefiles of the rectangles

    .. note::

        To comply with the dissemination rule about households' fiscal
        income, no statistical information (except the total number of
        individuals) is published for squares of fewer than 11 households.
        Those low-count squares are therefore merged into bigger
        rectangles satisfying this minimum of 11 households.
        `source : INSEE <https://www.insee.fr/fr/statistiques/2520034>`_.
    """
    if file_rect is None and file_car is None:
        # Default to the sample files shipped with the package.
        folder = get_data_folder()
        file_rect = os.path.join(folder, 'reunion_rect.zip')
        file_car = os.path.join(folder, 'reunion.zip')
    return (load_dbf_from_zip(file_car),
            load_shapes_from_zip(file_car),
            load_dbf_from_zip(file_rect),
            load_shapes_from_zip(file_rect))