Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief Various functions to download data about population 

5""" 

6import os 

7import re 

8from pyquickhelper.loghelper import noLOG 

9from pymyinstall.installcustom import download_page 

10from pyensae import download_data 

11from pyrsslocal.xmlhelper import xml_filter_iterator 

12from .data_exceptions import LinkNotFoundError 

13 

14 

def wolf_xml(url="http://pauillac.inria.fr/~sagot/index.html", temp_folder=".", fLOG=noLOG):
    """
    Retrieves the files of
    `WOLF <http://alpage.inria.fr/~sagot/wolf-en.html>`_
    (Wordnet Libre du Français, Free French Wordnet), a free semantic
    lexical resource (wordnet) for French.

    This data is licensed under `Cecill-C license
    <http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.html>`_.
    Language is French.

    The page *url* is fetched and searched for ``href`` attributes holding
    an ``https`` address which contains ``wolf`` and ends with ``.bz2``.
    Only the first match is used. The function then asks ``download_data``
    for the DTD ``debvisdic-strict.dtd`` and for the archive itself, and
    finally checks that ``wolf-1.0b4.xml`` exists in *temp_folder*.
    If it does not, ``wolf-1.0b4.xml.zip`` is requested as a fallback.

    @param      url             url of the page holding the link to the archive
    @param      temp_folder     where to download
    @param      fLOG            logging function
    @return                     list of files (archive content followed by the DTD);
                                in the fallback case, the value returned by
                                ``download_data`` for ``wolf-1.0b4.xml.zip``

    Raises ``LinkNotFoundError`` if the page contains no matching link,
    ``FileNotFoundError`` if ``wolf-1.0b4.xml`` is still missing after
    the fallback.
    """
    page = download_page(url)
    pattern = re.compile("href=\\\"(https.*?wolf.*?[.]bz2)\\\"")
    links = pattern.findall(page)
    if not links:
        raise LinkNotFoundError(  # pragma: no cover
            "unable to find a link on a .bz2 file on page\n{}".format(page))

    # Only the first link found on the page is considered.
    pieces = links[0].split("/")
    archive_root = "/".join(pieces[:-1]) + "/"
    # The DTD is looked for in folder 31718, next to the archive's folder.
    dtd_root = "/".join(pieces[:-2]) + "/31718/"
    archive_name = pieces[-1].strip('.')

    dtd = download_data("debvisdic-strict.dtd", url=[dtd_root, "xd"],
                        fLOG=fLOG, whereTo=temp_folder)
    local = download_data(
        archive_name, url=[archive_root, "xd"], fLOG=fLOG, whereTo=temp_folder)
    if isinstance(local, str):
        local = [local]

    # We check the file was downloaded.
    expected = os.path.join(temp_folder, "wolf-1.0b4.xml")
    if not os.path.exists(expected):  # pragma: no cover
        # Fallback: request the zip archive without an explicit url.
        res = download_data("wolf-1.0b4.xml.zip",
                            whereTo=temp_folder, fLOG=fLOG)
        if not os.path.exists(expected):
            raise FileNotFoundError(expected)
        return res

    if isinstance(dtd, list):
        return local + dtd
    return local + [dtd]  # pragma: no cover

60 

61 

def enumerate_wolf_xml_row(filename, fLOG=noLOG, xmlformat=False, encoding="utf-8", errors=None):
    """
    Iterates over the rows of an XML file returned by function
    @see fn wolf_xml. Every argument is forwarded unchanged to
    ``xml_filter_iterator`` and its items are yielded as they come.

    @param      filename    filename
    @param      fLOG        logging function
    @param      xmlformat   if True, return the xml, otherwise return the node,
                            see `XMLHandlerDictNode <http://www.xavierdupre.fr/app/pyrsslocal/
                            helpsphinx/pyrsslocal/xmlhelper/xml_tree_node.html#
                            module-pyrsslocal.xmlhelper.xml_tree_node>`_
    @param      encoding    encoding
    @param      errors      what to do with errors
    @return                 elements
    """
    rows = xml_filter_iterator(
        filename, xmlformat=xmlformat, fLOG=fLOG,
        encoding=encoding, errors=errors)
    yield from rows

79 

80 

def enumerate_wolf_synonyms(filename, fLOG=noLOG, encoding="utf-8", errors=None):
    """
    Enumerates lists of synonyms.
    Language is French.

    For every row produced by @see fn enumerate_wolf_xml_row, the values
    of the fields named ``SYNSET/SYNONYM/LITERAL/_`` are gathered;
    the list is yielded only if it holds more than one value.

    @param      filename    xml file
    @param      fLOG        logging function
    @param      encoding    encoding
    @param      errors      what to do with errors
    @return                 iterator on list of words
    """
    literal_key = "SYNSET/SYNONYM/LITERAL/_"
    rows = enumerate_wolf_xml_row(
        filename, fLOG=fLOG, encoding=encoding, errors=errors)
    for node in rows:
        words = [value for key, value in node.iterfields()
                 if key == literal_key]
        # A single literal has no synonym to pair with: skip it.
        if len(words) > 1:
            yield words