Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf-8 -*-
2"""
3@file
4@brief Various functions to download data about **French** elections.
5"""
6import os
7import warnings
8import pandas
9import numpy
def geocode(df, col_city="city", col_place="place", col_zip="zip", col_address="address",
            col_latitude="latitude", col_longitude="longitude", col_full="full_address",
            col_geo="geo_address", save_every=None, every=100, exc=True, fLOG=None,
            coders=("Nominatim",), country=None, **options):
    """
    Geocodes the addresses stored in a dataframe, one row at a time.

    @param      df              dataframe, it is not modified, the function works on a copy
    @param      col_city        column holding the city
    @param      col_place       column holding the place, used when the address is empty
    @param      col_zip         column holding the zip code, a value which is not
                                a string is formatted with ``%05d``
    @param      col_address     column holding the address
    @param      col_latitude    column receiving the latitude
    @param      col_longitude   column receiving the longitude
    @param      col_full        column receiving the full address (sent to the geocoder)
    @param      col_geo         column receiving the address returned by the geocoder
    @param      save_every      filename used to make regular dumps (None to disable them)
    @param      every           dumps (or logs) every *every* rows, must be >= 1
    @param      exc             raises the geocoder exception (True)
                                or converts it into a warning (False)
    @param      options         options for `read_csv
                                <http://pandas.pydata.org/pandas-docs/stable/
                                generated/pandas.read_csv.html>`_
                                and *to_csv* to do regular dumps,
                                option *index* is only given to *to_csv*
    @param      coders          list of coders to try
    @param      country         appends the country to the address before geocoding
    @param      fLOG            logging function
    @return                     modified dataframe

    If *save_every* is filled, the function saves the dataframe
    every *every* rows. If the file is already present,
    it is loaded and the function continues geocoding where it stopped.
    The saved file must have the same columns as the input dataframe
    followed by the added ones, otherwise a *ValueError* is raised.

    Rows are read and written with ``df.loc[i, ...]`` for *i* in
    ``range(len(df))``, the dataframe is therefore expected to have
    the default index ``0..n-1``.

    The function does not work well if it is called from multiple
    threads or processes.
    Example for *coders*:

    ::

        ["Nominatim", ("bing", <bing_key>)]

    The function tries the first one and then the second one.
    The function also caches the results. If the same address appears twice,
    the geocoder will not be called a second time, it will reuse the cached result
    unless there was no answer on the first call.
    """
    from geopy.geocoders import Nominatim, Bing

    def get_coder(d):
        # Builds a geocoder from its name or from a tuple (name, key).
        if isinstance(d, str):
            if d == "Nominatim":
                return Nominatim(user_agent="actuariat_python")
            raise ValueError(  # pragma: no cover
                "Unknown geocoder '{0}'".format(d))
        if isinstance(d, tuple):
            name, key = d
            if name == "bing":
                return Bing(key)
            raise ValueError(  # pragma: no cover
                "Unknown geocoder '{0}'".format(d))
        raise TypeError(  # pragma: no cover
            "Unexpected type '{0}'".format(type(d)))

    def concat(s1, s2):
        # Returns the first non empty string among s1, s2, "" otherwise.
        if isinstance(s1, str) and len(s1) > 0:
            return s1
        if isinstance(s2, str) and len(s2) > 0:
            return s2
        return ""

    if every < 1:
        raise ValueError("every should be >= 1, not {0}".format(every))
    from geopy.exc import GeocoderServiceError
    geocoder = [get_coder(_) for _ in coders]
    cache = {}
    if len(geocoder) == 0:
        raise ValueError(  # pragma: no cover
            "No geocoder, the function cannot retrieve addresses.")

    class DummyClass:
        # Minimal container exposing longitude, latitude, address
        # the same way a geocoder answer does.

        def __init__(self, **kwargs):
            for k, v in kwargs.items():
                setattr(self, k, v)

    if save_every is not None and os.path.exists(save_every):
        # A previous dump exists: resume from it.
        if "index" in options:
            # *index* is an option of to_csv, not of read_csv.
            options_read = options.copy()
            del options_read["index"]
        else:
            options_read = options
        if fLOG:
            fLOG("load ", save_every)
        read = pandas.read_csv(save_every, **options_read)
        cols = list(read.columns)
        add = [_ for _ in [col_full, col_latitude,
                           col_longitude, col_geo] if _ not in df.columns]
        oris = list(df.columns) + add
        if oris != cols:
            raise ValueError(  # pragma: no cover
                "Unexpected differences in schemas:\nORIGINAL\n{0}\nSAVE"
                "\n{1}".format(oris, cols))
        df = read
    else:
        df = df.copy()
        # The creation order matters: it is the order checked against
        # the saved schema when the function resumes from a dump.
        df[col_full] = numpy.nan
        df[col_latitude] = numpy.nan
        df[col_longitude] = numpy.nan
        df[col_geo] = numpy.nan

    # Both columns receive strings. A column only made of NaN is float64
    # and recent versions of pandas refuse or deprecate writing a string
    # into it, the columns are converted into object beforehand.
    for col_text in (col_full, col_geo):
        df[col_text] = df[col_text].astype(object)

    errors = 0
    no_result = 0
    lasti = 0
    for i in range(0, len(df)):
        lasti = i
        if i % every == 0:
            if save_every is not None:
                if fLOG is not None:
                    fLOG(
                        "saving place {0}/{1} - errors={2} - no-result={3}".format(
                            i, len(df), errors, no_result))
                df.to_csv(save_every, **options)
            elif fLOG is not None:
                fLOG(
                    "geocode place {0}/{1} - errors={2} - no-result={3}".format(
                        i, len(df), errors, no_result))

        place, zips, city, address = df.loc[
            i, [col_place, col_zip, col_city, col_address]]
        if not isinstance(zips, str):
            zips = "%05d" % zips

        ad = "{0} {1} {2}".format(concat(address, place), zips, city).strip()
        if country is not None:
            ad += " " + country
        df.loc[i, col_full] = ad

        # Rows which already have coordinates (loaded from a dump) are skipped.
        if (numpy.isnan(df.loc[i, col_latitude]) or
                numpy.isnan(df.loc[i, col_longitude])):

            rexc = None
            if ad in cache:
                geo = cache[ad]
                if geo is None:
                    raise ValueError(  # pragma: no cover
                        "Do not populate the cache with None values for key "
                        "'{0}'".format(ad))
            else:
                geo = None
                # Tries every geocoder until one of them returns an answer,
                # rexc keeps the exception raised by the last one tried (if any).
                for cod in geocoder:
                    try:
                        geo = cod.geocode(ad, exactly_one=True, timeout=30)
                        rexc = None
                        if geo is not None:
                            break
                    except (TimeoutError, GeocoderServiceError) as e:
                        geo = None
                        rexc = e

            if geo is not None:
                df.loc[i, col_longitude] = geo.longitude
                df.loc[i, col_latitude] = geo.latitude
                df.loc[i, col_geo] = geo.address
            elif rexc:
                no_result += 1
                errors += 1
                if exc:
                    # Saves what was done so far before failing.
                    if save_every is not None:
                        df.to_csv(save_every, **options)
                    raise rexc
                warnings.warn(str(rexc))
                continue
            else:
                no_result += 1

        # Only rows with coordinates are cached. Caching a missing answer
        # would prevent any new attempt for the same address and would
        # hide the following failures from the no-result counter.
        if ad not in cache:
            latitude = df.loc[i, col_latitude]
            longitude = df.loc[i, col_longitude]
            if not (numpy.isnan(latitude) or numpy.isnan(longitude)):
                cache[ad] = DummyClass(longitude=longitude,
                                       latitude=latitude,
                                       address=df.loc[i, col_geo])

    if fLOG is not None:  # pragma: no cover
        fLOG(
            "geocode place {0}/{1} - errors={2} - no-result={3}"
            "".format(lasti, len(df), errors, no_result))
    if save_every is not None:
        df.to_csv(save_every, **options)
    return df