Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief Various functions to download data about **French** elections. 

5""" 

6import os 

7import warnings 

8import pandas 

9import numpy 

10 

11 

def geocode(df, col_city="city", col_place="place", col_zip="zip", col_address="address",
            col_latitude="latitude", col_longitude="longitude", col_full="full_address",
            col_geo="geo_address", save_every=None, every=100, exc=True, fLOG=None,
            coders=("Nominatim",), country=None, **options):
    """
    Geocodes the addresses stored in a dataframe.

    @param      df              dataframe
    @param      col_city        name of the column holding the city
    @param      col_place       name of the column holding the place,
                                used when the address is missing or empty
    @param      col_zip         name of the column holding the zip code
                                (a string, or a number formatted on five digits)
    @param      col_address     name of the column holding the address
    @param      col_latitude    name of the column receiving the latitude
    @param      col_longitude   name of the column receiving the longitude
    @param      col_full        name of the column receiving the full address
                                (the string sent to the geocoder)
    @param      col_geo         name of the column receiving the address
                                returned by the geocoder
    @param      save_every      filename used to make regular dumps,
                                None to disable them
    @param      every           logs (and dumps if *save_every* is specified)
                                every *every* rows, must be >= 1
    @param      exc             if True, the first geocoder error is raised
                                (after a last dump if *save_every* is specified),
                                if False, it is converted into a warning
                                and the row is skipped
    @param      options         options sent to
                                `to_csv <https://pandas.pydata.org/docs/reference/api/
                                pandas.DataFrame.to_csv.html>`_ for the dumps and to
                                `read_csv <https://pandas.pydata.org/docs/reference/api/
                                pandas.read_csv.html>`_ (without *index*)
                                to reload an existing dump
    @param      coders          list of coders to try
    @param      country         country appended to the address before geocoding
    @param      fLOG            logging function
    @return                     modified dataframe (a copy of *df* or the reloaded dump)

    If *save_every* is specified, the function saves the dataframe
    every *every* rows. If the file is already present,
    it is loaded and the function continues geocoding the rows
    whose latitude or longitude is still missing. The columns of the
    dump must match the columns of *df* followed by the added columns,
    otherwise a *ValueError* is raised.

    The function writes the dump file without any locking, it is not
    meant to be called from multiple threads or processes
    with the same file.
    Example for *coders*:

    ::

        ["Nominatim", ("bing", <bing_key>)]

    The function tries the first one and then the second one.
    The function also caches the results. If the same address appears twice,
    the geocoder is not called a second time, the cached result is reused,
    even if the geocoder returned no result the first time.
    An address for which every attempt ended with an error is not cached
    and is sent again to the geocoder if it appears again.

    Rows are processed by position, the index of *df* does not need
    to be ``0..n-1``. A missing zip code is replaced by an empty string.
    """
    from geopy.geocoders import Nominatim, Bing
    from geopy.exc import GeocoderServiceError

    def get_coder(d):
        # Builds a geopy geocoder from its description:
        # a name or a tuple (name, key).
        if isinstance(d, str):
            if d == "Nominatim":
                return Nominatim(user_agent="actuariat_python")
            raise ValueError(  # pragma: no cover
                "Unknown geocoder '{0}'".format(d))
        if isinstance(d, tuple):
            name, key = d
            if name == "bing":
                return Bing(key)
            raise ValueError(  # pragma: no cover
                "Unknown geocoder '{0}'".format(d))
        raise TypeError(  # pragma: no cover
            "Unexpected type '{0}'".format(type(d)))

    def concat(s1, s2):
        # Returns the first non empty string among s1, s2, "" otherwise.
        if isinstance(s1, str) and len(s1) > 0:
            return s1
        if isinstance(s2, str) and len(s2) > 0:
            return s2
        return ""

    if every < 1:
        raise ValueError("every should be >= 1, not {0}".format(every))
    geocoder = [get_coder(_) for _ in coders]
    if not geocoder:
        raise ValueError(  # pragma: no cover
            "No geocoder, the function cannot retrieve addresses.")

    # full address -> (longitude, latitude, address returned by the geocoder)
    cache = {}

    if save_every is not None and os.path.exists(save_every):
        # 'index' is an option for to_csv, read_csv does not accept it.
        options_read = {k: v for k, v in options.items() if k != "index"}
        if fLOG:
            fLOG("load ", save_every)
        read = pandas.read_csv(save_every, **options_read)
        cols = list(read.columns)
        add = [_ for _ in [col_full, col_latitude,
                           col_longitude, col_geo] if _ not in df.columns]
        oris = list(df.columns) + add
        if oris != cols:
            raise ValueError(  # pragma: no cover
                "Unexpected differences in schemas:\nORIGINAL\n{0}\nSAVE"
                "\n{1}".format(oris, cols))
        df = read
    else:
        df = df.copy()
        df[col_full] = numpy.nan
        df[col_latitude] = numpy.nan
        df[col_longitude] = numpy.nan
        df[col_geo] = numpy.nan

    # The text columns start as float columns filled with NaN (or are read
    # back as such when still empty). They are converted into object columns
    # because recent versions of pandas refuse to store a string
    # into a float column.
    for col in (col_full, col_geo):
        df[col] = df[col].astype(object)

    # Positions of the columns, rows are accessed by position as well
    # so that the function does not depend on the index labels.
    p_place = df.columns.get_loc(col_place)
    p_zip = df.columns.get_loc(col_zip)
    p_city = df.columns.get_loc(col_city)
    p_address = df.columns.get_loc(col_address)
    p_full = df.columns.get_loc(col_full)
    p_lat = df.columns.get_loc(col_latitude)
    p_lon = df.columns.get_loc(col_longitude)
    p_geo = df.columns.get_loc(col_geo)

    errors = 0
    no_result = 0
    lasti = 0
    nb_rows = len(df)
    for i in range(nb_rows):
        lasti = i
        if i % every == 0:
            if save_every is not None:
                if fLOG is not None:
                    fLOG(
                        "saving place {0}/{1} - errors={2} - no-result={3}".format(
                            i, nb_rows, errors, no_result))
                df.to_csv(save_every, **options)
            elif fLOG is not None:
                fLOG(
                    "geocode place {0}/{1} - errors={2} - no-result={3}".format(
                        i, nb_rows, errors, no_result))

        place = df.iat[i, p_place]
        zips = df.iat[i, p_zip]
        city = df.iat[i, p_city]
        address = df.iat[i, p_address]
        if not isinstance(zips, str):
            # A missing zip code cannot be formatted as an integer.
            zips = "" if pandas.isna(zips) else "%05d" % zips

        ad = "{0} {1} {2}".format(concat(address, place), zips, city).strip()
        if country is not None:
            ad += " " + country
        df.iat[i, p_full] = ad

        # Only rows without coordinates are geocoded, this is what lets
        # the function resume from a previous dump.
        if pandas.isna(df.iat[i, p_lat]) or pandas.isna(df.iat[i, p_lon]):
            if ad in cache:
                longitude, latitude, geo_address = cache[ad]
                df.iat[i, p_lon] = longitude
                df.iat[i, p_lat] = latitude
                df.iat[i, p_geo] = geo_address
            else:
                geo = None
                rexc = None
                for cod in geocoder:
                    try:
                        geo = cod.geocode(ad, exactly_one=True, timeout=30)
                        rexc = None
                        if geo is not None:
                            break
                    except (TimeoutError, GeocoderServiceError) as e:
                        geo = None
                        rexc = e

                if geo is not None:
                    df.iat[i, p_lon] = geo.longitude
                    df.iat[i, p_lat] = geo.latitude
                    df.iat[i, p_geo] = geo.address
                elif rexc is not None:
                    no_result += 1
                    errors += 1
                    if exc:
                        # Saves what was done so far before failing.
                        if save_every is not None:
                            df.to_csv(save_every, **options)
                        raise rexc
                    warnings.warn(str(rexc))
                    # The address is not cached, it will be tried again
                    # if it appears in another row.
                    continue
                else:
                    no_result += 1

        if ad not in cache:
            # Also reached for rows which already had coordinates and
            # for addresses without any result (cached as missing values).
            cache[ad] = (df.iat[i, p_lon], df.iat[i, p_lat], df.iat[i, p_geo])

    if fLOG is not None:  # pragma: no cover
        fLOG(
            "geocode place {0}/{1} - errors={2} - no-result={3}"
            "".format(lasti, nb_rows, errors, no_result))
    if save_every is not None:
        df.to_csv(save_every, **options)
    return df