Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding:utf-8 -*- 

2""" 

3@file 

4@brief The file contains a class which collects data coming from :epkg:`Velib`. 

5 

6""" 

7 

8import os 

9import os.path 

10import datetime 

11import json 

12import time 

13import re 

14import math 

15import random 

16import urllib 

17import urllib.error 

18import urllib.request 

19import pandas 

20import numpy 

21 

22 

23class DataCollectJCDecaux: 

24 

25 """ 

26 This class automates data collecting from :epkg:`JCDecaux`. 

27 The service is provided at `JCDecaux developer <https://developer.jcdecaux.com/#/home>`_. 

28 

29 See also `notebook on Velib <http://nbviewer.ipython.org/5520933>`_ 

30 The list of contracts for :epkg:`JCDecaux` can be obtained at: 

31 `Données statiques <https://developer.jcdecaux.com/#/opendata/vls?page=static>`_. 

32 The API provided by :epkg:`JCDecaux` is described 

33 `here <https://developer.jcdecaux.com/#/opendata/vls?page=dynamic>`_. 

34 

35 .. exref:: 

36 :title: Simple code to fetch velib data 

37 

38 :: 

39 

40 private_key = 'your_key' 

41 

42 from manydataapi.velib import DataCollectJCDecaux 

43 DataCollectJCDecaux.run_collection(private_key, contract="besancon", 

44 delayms=30000, single_file=False, stop_datetime=None, 

45 log_every=1) 

46 """ 

47 

48 #: list of available cities = contract (subset) 

49 _contracts_static = {k: 1 for k in [ 

50 'arcueil', 'besancon', 'lyon', 'nancy']} 

51 

52 # api: two substring to replace (contract, apiKey) 

53 _url_api = "https://api.jcdecaux.com/vls/v1/stations?contract=%s&apiKey=%s" 

54 _url_apic = "https://api.jcdecaux.com/vls/v1/contracts?apiKey=%s" 

55 

56 def __init__(self, apiKey, fetch_contracts=False): 

57 """ 

58 @param apiKey api key 

59 @param fetch_contracts if True, it uses a short list of known contracts, 

60 otherwise, it will updated through the website API 

61 """ 

62 self.apiKey = apiKey 

63 self.contracts = DataCollectJCDecaux._contracts_static if not fetch_contracts else self.get_contracts() 

64 

65 # sometimes, lng and lat are null, check if some past retrieving 

66 # returned non null coordinates 

67 self.memoGeoStation = {} 

68 

69 def get_contracts(self): 

70 """ 

71 Returns the list of contracts. 

72 

73 @return dictionary, something like ``{'station': 1}`` 

74 """ 

75 url = DataCollectJCDecaux._url_apic % (self.apiKey) 

76 try: 

77 with urllib.request.urlopen(url) as u: 

78 js = u.read() 

79 except (urllib.error.HTTPError, urllib.error.URLError) as exc: # pragma: no cover 

80 # there was probably a mistake 

81 # We try again after a given amount of time 

82 time.sleep(0.5) 

83 try: 

84 with urllib.request.urlopen(url) as u: 

85 js = u.read() 

86 except (urllib.error.HTTPError, urllib.error.URLError) as exc: 

87 # there was probably a mistake 

88 # we stop 

89 raise Exception("unable to access url: " + url) from exc 

90 

91 js = str(js, encoding="utf8") 

92 js = json.loads(js) 

93 cont = {k["name"]: 1 for k in js} 

94 return cont 

95 

96 def get_json(self, contract): 

97 """ 

98 Returns the data associated to a contract. 

99 

100 @param contract contract name, @see te _contracts 

101 @return :epkg:`json` string 

102 """ 

103 if contract not in self.contracts: 

104 raise RuntimeError( # pragma: no cover 

105 "Unable to find contract '{0}' in:\n{1}".format(contract, "\n".join( 

106 self.contracts.keys()))) 

107 url = DataCollectJCDecaux._url_api % (contract, self.apiKey) 

108 

109 try: 

110 with urllib.request.urlopen(url) as u: 

111 js = u.read() 

112 except (urllib.error.HTTPError, urllib.error.URLError): # pragma: no cover 

113 # there was probably a mistake 

114 # We try again after a given amount of time 

115 time.sleep(0.5) 

116 try: 

117 with urllib.request.urlopen(url) as u: 

118 js = u.read() 

119 except (urllib.error.HTTPError, urllib.error.URLError): 

120 # there was probably a mistake 

121 # we stop 

122 return json.loads("[]") 

123 

124 js = str(js, encoding="utf8") 

125 js = json.loads(js) 

126 now = datetime.datetime.now() 

127 for o in js: 

128 o["number"] = int(o["number"]) 

129 o["banking"] = 1 if o["banking"] == "True" else 0 

130 o["bonus"] = 1 if o["bonus"] == "True" else 0 

131 

132 o["bike_stands"] = int(o["bike_stands"]) 

133 o["available_bike_stands"] = int(o["available_bike_stands"]) 

134 o["available_bikes"] = int(o["available_bikes"]) 

135 o["collect_date"] = now 

136 

137 try: 

138 ds = float(o["last_update"]) 

139 dt = datetime.datetime.fromtimestamp(ds / 1000) 

140 except ValueError: # pragma: no cover 

141 dt = datetime.datetime.now() 

142 except TypeError: # pragma: no cover 

143 dt = datetime.datetime.now() 

144 o["last_update"] = dt 

145 

146 try: 

147 o["lat"] = float( 

148 o["position"]["lat"]) if o["position"]["lat"] is not None else None 

149 o["lng"] = float( 

150 o["position"]["lng"]) if o["position"]["lng"] is not None else None 

151 except TypeError as e: # pragma: no cover 

152 raise TypeError( # pylint: disable=W0707 

153 "Unable to convert geocode for the following row: %s\n%s" % 

154 (str(o), str(e))) 

155 

156 key = contract, o["number"] 

157 if key in self.memoGeoStation: 

158 if o["lat"] == 0 or o["lng"] == 0: 

159 o["lat"], o["lng"] = self.memoGeoStation[key] 

160 elif o["lat"] != 0 and o["lng"] != 0: 

161 self.memoGeoStation[key] = o["lat"], o["lng"] 

162 

163 del o["position"] 

164 

165 return js 

166 

167 def collecting_data(self, contract, delayms=1000, outfile="velib_data.txt", 

168 single_file=True, stop_datetime=None, log_every=10, 

169 fLOG=print): 

170 """ 

171 Collects data for a period of time. 

172 

173 @param contract contract name, @see te _contracts 

174 @param delayms delay between two collections (in ms) 

175 @param outfile write data in this file (json), if single_file is True, outfile is used as a prefix 

176 @param single_file if True, one file, else, many files with timestamp as a suffix 

177 @param stop_datetime if None, never stops, else stops when the date is reached 

178 @param log_every print something every <log_every> times data were collected 

179 @param fLOG logging function (None to disable) 

180 @return list of created file 

181 """ 

182 delay = datetime.timedelta(seconds=delayms / 1000) 

183 now = datetime.datetime.now() 

184 cloc = now 

185 delayms /= 50 

186 delays = delayms / 1000.0 

187 

188 nb = 0 

189 while stop_datetime is None or now < stop_datetime: 

190 now = datetime.datetime.now() 

191 cloc += delay 

192 js = self.get_json(contract) 

193 

194 if single_file: 

195 with open(outfile, "a", encoding="utf8") as f: 

196 f.write("%s\t%s\n" % (str(now), str(js))) 

197 else: 

198 name = outfile + "." + \ 

199 str(now).replace(":", 

200 "-").replace("/", 

201 "-").replace(" ", 

202 "_") + ".txt" 

203 with open(name, "w", encoding="utf8") as f: 

204 f.write(str(js)) 

205 

206 nb += 1 

207 if fLOG and nb % log_every == 0: 

208 fLOG("DataCollectJCDecaux.collecting_data: nb={0} {1} delay={2}".format( 

209 nb, now, delay)) 

210 

211 while now < cloc: 

212 now = datetime.datetime.now() 

213 time.sleep(delays) 

214 

215 @staticmethod 

216 def run_collection(key=None, contract="Paris", delayms=60000, folder_file="velib_data", 

217 stop_datetime=None, single_file=False, log_every=1, fLOG=print): 

218 """ 

219 Runs the collection of the data for velib, data are stored using :epkg:`json` format. 

220 The function creates a file every time a new status is downloaded. 

221 

222 @param key (str|None), not implemented if None 

223 @param contract a city 

224 @param delayms gets a status every delayms milliseconds 

225 @param folder_file prefix used to create one file or several, it depends on single_file) where to place downloaded files) 

226 @param stop_datetime (datetime) stop when this datetime is reached or None for never stops 

227 @param single_file if True, every json status will be stored in a single file, if False, it will be 

228 a different file each time, if True, then folder_file is a file 

229 @param log_every log some information every 1 (minutes) 

230 @param fLOG logging function (None to disable) 

231 

232 .. exref:: 

233 :title: collect Velib data 

234 

235 The following example produces a file every minute in json format about the status of all 

236 Velib stations in Paris. They will be put in a folder call ``velib_data``. 

237 

238 :: 

239 

240 from manydataapi.velib.data_jcdecaux import DataCollectJCDecaux 

241 DataCollectJCDecaux.run_collection(private_key, contract="Paris", 

242 delayms=60000, single_file=False, stop_datetime=None, 

243 log_every=1) 

244 """ 

245 if key is None: 

246 raise NotImplementedError( # pragma: no cover 

247 "key cannot be None") 

248 velib = DataCollectJCDecaux(key, True) 

249 velib.collecting_data(contract, delayms, folder_file, stop_datetime=stop_datetime, 

250 single_file=single_file, log_every=log_every, fLOG=fLOG) 

251 

252 @staticmethod 

253 def to_df(folder, regex="velib_data.*[.]txt"): 

254 """ 

255 Reads all files in a folder (assuming there were produced by this class) and 

256 returns a dataframe with it. 

257 

258 @param folder folder where to find the files 

259 @param regex regular expression which filter the files 

260 @return pandas DataFrame 

261 

262 Each file is a status of all stations, a row per 

263 station will be added to the file. 

264 It produces a table with the following columns: 

265 

266 - address 

267 - available_bike_stands 

268 - available_bikes 

269 - banking 

270 - bike_stands 

271 - bonus 

272 - collect_date 

273 - contract_name 

274 - last_update 

275 - lat 

276 - lng 

277 - name 

278 - number 

279 - status 

280 - file 

281 """ 

282 if regex is None: 

283 regex = ".*" 

284 reg = re.compile(regex) 

285 

286 files_ = os.listdir(folder) 

287 files = [_ for _ in files_ if reg.search(_)] 

288 

289 if len(files) == 0: 

290 raise FileNotFoundError( # pragma: no cover 

291 "No found files in directory: '{}'\nregex: '{}'.".format( 

292 folder, regex)) 

293 

294 rows = [] 

295 for file_ in files: 

296 file = os.path.join(folder, file_) 

297 with open(file, "r", encoding="utf8") as f: 

298 lines = f.readlines() 

299 for i, line in enumerate(lines): 

300 dl = eval(line.strip("\n\r\t ")) # pylint: disable=W0123 

301 if not isinstance(dl, list): 

302 raise TypeError( # pragma: no cover 

303 "Expects a list for line {0} in file {1}".format( 

304 i, 

305 file)) 

306 for d in dl: 

307 d["file"] = file_ 

308 rows.extend(dl) 

309 

310 return pandas.DataFrame(rows) 

311 

312 @staticmethod 

313 def draw(df, use_folium=False, **args): 

314 """ 

315 Draws a graph using four columns: *lng*, *lat*, *available_bike_stands*, *available_bikes*. 

316 

317 @param df dataframe 

318 @param args other parameters to give method ``plt.subplots`` or :epkg:`folium` 

319 @param use_folium use folium to create the map 

320 @return fig, ax, plt, (fig,ax) comes plt.subplot, plt is matplotlib.pyplot 

321 

322 Additional parameters: 

323 

324 * size: change the size of points 

325 """ 

326 size = args.get('size', 1) 

327 if 'size' in args: 

328 del args['size'] 

329 

330 if not use_folium: 

331 import matplotlib.pyplot as plt 

332 fig, ax = plt.subplots(**args) 

333 

334 x = df["lng"] 

335 y = df["lat"] 

336 areaf = df.apply( 

337 lambda r: r["available_bike_stands"] ** 0.5 * size, axis=1) 

338 areab = df.apply( 

339 lambda r: r["available_bikes"] ** 0.5 * size, axis=1) 

340 ax.scatter(x, y, areaf, alpha=0.5, label="place", color="r") 

341 ax.scatter(x, y, areab, alpha=0.5, label="bike", color="g") 

342 ax.grid(True) 

343 ax.legend() 

344 ax.set_xlabel("longitude") 

345 ax.set_ylabel("latitude") 

346 

347 return fig, ax, plt 

348 else: 

349 import folium 

350 x = df["lat"].mean() 

351 y = df["lng"].mean() 

352 map_osm = folium.Map(location=[x, y], zoom_start=13) 

353 

354 def add_marker(row): 

355 "add marker" 

356 t = "+ {0} o {1}".format(row["available_bikes"], 

357 row["available_bike_stands"]) 

358 folium.CircleMarker([row["lat"], row["lng"]], color='#3186cc', fill_color='#3186cc', 

359 popup=t, radius=(row["available_bikes"] / numpy.pi) ** 0.5 * 30 * size).add_to(map_osm) 

360 folium.CircleMarker([row["lat"], row["lng"]], color='#cc8631', fill_color='#cc8631', 

361 popup=t, radius=(row["available_bike_stands"] / numpy.pi) ** 0.5 * 30 * size).add_to(map_osm) 

362 

363 df.apply(lambda row: add_marker(row), axis=1) 

364 return map_osm 

365 

366 @staticmethod 

367 def animation(df, interval=20, module="matplotlib", **args): 

368 """ 

369 Displays a javascript animation, 

370 see `animation.FuncAnimation 

371 <http://matplotlib.org/api/animation_api.html#matplotlib.animation.FuncAnimation>`_. 

372 

373 @param df dataframe 

374 @param interval see `animation.FuncAnimation 

375 <http://matplotlib.org/api/animation_api.html#matplotlib.animation.FuncAnimation>`_ 

376 @param module module to build the animation 

377 @param args other parameters to give method ``plt.figure`` 

378 @return animation 

379 

380 Available modules for animation: 

381 

382 * :epkg:`matplotlib` 

383 * :epkg:`moviepy` 

384 

385 Additional arguments: 

386 

387 * size: size of scatter plots 

388 * duration: if module is 'moviepy', duration of the animation 

389 """ 

390 size = args.get('size', 1) 

391 if 'size' in args: 

392 del args['size'] 

393 duration = args.get('duration', 2) 

394 if 'duration' in args: 

395 del args['duration'] 

396 

397 dates = list(sorted(set(df["file"]))) 

398 datas = [] 

399 for d in dates: 

400 sub = df[df["file"] == d] 

401 x = sub["lng"] 

402 y = sub["lat"] 

403 colp = sub.apply( 

404 lambda r: r["available_bike_stands"] ** 0.5 * size, axis=1) 

405 colb = sub.apply( 

406 lambda r: r["available_bikes"] ** 0.5 * size, axis=1) 

407 x = tuple(x) 

408 y = tuple(y) 

409 colp = tuple(colp) 

410 colb = tuple(colb) 

411 data = (x, y, colp, colb) 

412 datas.append(data) 

413 

414 import matplotlib.pyplot as plt 

415 

416 def scatter_fig(i=0): 

417 "scatter plot" 

418 fig, ax = plt.subplots(**args) 

419 x, y, c, d = datas[i] 

420 

421 scat1 = ax.scatter(x, y, c, alpha=0.5, color="r", label="place") 

422 scat2 = ax.scatter(x, y, d, alpha=0.5, color="g", label="bike") 

423 ax.grid(True) 

424 ax.legend() 

425 ax.set_xlabel("longitude") 

426 ax.set_ylabel("latitude") 

427 return fig, ax, scat1, scat2 

428 

429 if module == "matplotlib": 

430 from matplotlib import animation 

431 

432 def animate(i, datas, scat1, scat2): 

433 "animation" 

434 _, __, c, d = datas[i] 

435 # scat1.set_array(numpy.array(c)) 

436 # scat2.set_array(numpy.array(d)) 

437 #scat1.set_array(numpy.array(x + y)) 

438 #scat2.set_array(numpy.array(x + y)) 

439 scat1._sizes = c 

440 scat2._sizes = d 

441 return scat1, scat2 

442 

443 fig, _, scat1, scat2 = scatter_fig() 

444 anim = animation.FuncAnimation(fig, animate, frames=len(datas), 

445 interval=interval, fargs=(datas, scat1, scat2), blit=True) 

446 plt.close('all') 

447 return anim 

448 

449 elif module == "moviepy": 

450 from moviepy.video.io.bindings import mplfig_to_npimage 

451 import moviepy.editor as mpy 

452 

453 def make_frame_mpl(t): 

454 "mpl=matplotlib" 

455 i = min(int(t * len(datas)), len(datas) - 1) 

456 __, _, c, d = datas[i] 

457 # scat1.set_xdata(x) # <= Update the curve 

458 # scat1.set_ydata(y) # <= Update the curve 

459 scat1._sizes = c 

460 scat2._sizes = d 

461 res = mplfig_to_npimage(fig) 

462 return res 

463 

464 fig, _, scat1, scat2 = scatter_fig(0) 

465 animation = mpy.VideoClip(make_frame_mpl, duration=duration) 

466 return animation 

467 else: 

468 raise ValueError( # pragma: no cover 

469 "Unsupported module '{0}'".format(module)) 

470 

471 @staticmethod 

472 def distance_haversine(lat1, lon1, lat2, lon2): 

473 """ 

474 Computes the `haversine <https://en.wikipedia.org/wiki/Haversine_formula>`_ distance. 

475 

476 @return double 

477 """ 

478 radius = 6371 

479 dlat = math.radians(lat2 - lat1) 

480 dlon = math.radians(lon2 - lon1) 

481 a = math.sin(dlat / 2) * math.sin(dlat / 2) + math.cos(math.radians(lat1)) \ 

482 * math.cos(math.radians(lat2)) * math.sin(dlon / 2) * math.sin(dlon / 2) 

483 c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a)) 

484 d = radius * c 

485 return d 

486 

487 @staticmethod 

488 def simulate(df, nbbike, speed, 

489 period=datetime.timedelta(minutes=1), 

490 iteration=500, min_min=10, delta_speed=2.5, 

491 fLOG=print): 

492 """ 

493 Simulates velibs on a set of stations given by *df*. 

494 

495 @param df dataframe with station information 

496 @param nbbike number of bicycles 

497 @param period period 

498 @param speed average speed (in km/h) 

499 @param iteration number of iterations 

500 @param min_min minimum duration of a trip 

501 @param delta_speed allowed speed difference 

502 @param fLOG logging function 

503 @return simulated paths, data (as DataFrame) 

504 """ 

505 cities = df[["lat", "lng", "name", "number"]] 

506 start = cities.drop_duplicates() 

507 idvelo = 0 

508 

509 current = {} 

510 for row in start.values: 

511 r = [] 

512 for i in range(0, 5): 

513 r.append(idvelo) 

514 idvelo += 1 

515 r.extend([-1, -1, -1, -1, -1]) 

516 ids = tuple(row) 

517 current[ids] = r 

518 

519 running = [] 

520 

521 def free(v): 

522 "free bycicles" 

523 nb = [_ for _ in v if _ == -1] 

524 return len(nb) > 0 

525 

526 def bike(v): 

527 "bicycles" 

528 nb = [_ for _ in v if _ == -1] 

529 return len(nb) < len(v) 

530 

531 def pop(v): 

532 "pop" 

533 for i, _ in enumerate(v): 

534 if _ != -1: 

535 r = v[i] 

536 v[i] = -1 

537 fLOG(" pop", v) 

538 return r 

539 raise RuntimeError("no free bike") # pragma: no cover 

540 

541 def push(v, idv): 

542 "push" 

543 for i, _ in enumerate(v): 

544 if _ == -1: 

545 v[i] = idv 

546 fLOG(" push", v) 

547 return None 

548 raise RuntimeError("no free spot: " + str(v)) # pragma: no cover 

549 

550 def give_status(conf, ti): 

551 "give status" 

552 rows = [] 

553 for k, v in conf.items(): 

554 lat, lng, name, number = k 

555 obs = {"lat": lat, "lng": lng, "name": name, "number": number} 

556 nb = [_ for _ in v if _ == -1] 

557 obs["available_bike_stands"] = len(nb) 

558 obs["available_bikes"] = len(v) - len(nb) 

559 obs["collect_date"] = ti 

560 obs["file"] = str(ti) 

561 rows.append(obs) 

562 return rows 

563 

564 simulation = [] 

565 paths = [] 

566 keys = list(current.keys()) 

567 iter = 0 

568 tim = datetime.datetime.now() 

569 while iter < iteration: 

570 

571 status = give_status(current, tim) 

572 simulation.extend(status) 

573 

574 # a bike 

575 if len(running) < nbbike: 

576 rnd = random.randint(0, len(keys) - 1) 

577 v = current[keys[rnd]] 

578 if bike(v): 

579 v = (tim, pop(v), keys[rnd], "begin") 

580 running.append(v) 

581 lat, lng, name, number = keys[rnd] 

582 dv = { 

583 "lat0": lat, 

584 "lng0": lng, 

585 "name0": name, 

586 "number0": number} 

587 dv.update({"time": v[0], "idvelo": v[1], "beginend": v[-1], 

588 "hours": 0.0, "dist": 0.0}) 

589 paths.append(dv) 

590 

591 # do we put the bike back 

592 rem = [] 

593 for i, r in enumerate(running): 

594 delta = tim - r[0] 

595 h = delta.total_seconds() / 3600 

596 if h * 60 > min_min: 

597 for _ in cities.values: 

598 row = cities.values[random.randint(0, len(cities) - 1)] 

599 keycity = tuple(row) 

600 station = current[keycity] 

601 if free(station): 

602 vlat, vlng = r[2][0], r[2][1] 

603 clat, clng = row[0], row[1] 

604 dist = DataCollectJCDecaux.distance_haversine( 

605 vlat, 

606 vlng, 

607 clat, 

608 clng) 

609 sp = dist / h 

610 dsp = abs(sp - speed) 

611 if (dsp < delta_speed or (sp < speed and h >= 1)) \ 

612 and random.randint(0, 10) == 0: 

613 # we put it back 

614 push(station, r[1]) 

615 rem.append(i) 

616 

617 lat, lng, name, number = r[2] 

618 dv = { 

619 "lat0": lat, 

620 "lng0": lng, 

621 "name0": name, 

622 "number0": number} 

623 lat, lng, name, number = keycity 

624 dv.update({"lat1": lat, 

625 "lng1": lng, 

626 "name1": name, 

627 "number1": number}) 

628 dv.update({"time": tim, 

629 "idvelo": r[1], 

630 "beginend": "end", 

631 "hours": h, 

632 "dist": dist}) 

633 paths.append(dv) 

634 break 

635 

636 running = [r for i, r in enumerate(running) if i not in rem] 

637 

638 if fLOG: 

639 fLOG("[DataCollectJCDecaux.simulate] iter", "time ", tim, " - ", len(running), 

640 "/", nbbike, " paths ", len(paths)) 

641 

642 # end of loop 

643 tim += period 

644 iter += 1 

645 

646 return pandas.DataFrame(paths), pandas.DataFrame(simulation)