# -*- coding: utf-8 -*-
"""
Various functions to download and read the WOLF (Free French Wordnet) data.
:githublink:`%|py|6`
"""
import os
import re
from pyquickhelper.loghelper import noLOG
from pymyinstall.installcustom import download_page
from pyensae.datasource import download_data
from pyrsslocal.xmlhelper import xml_filter_iterator
from .data_exceptions import LinkNotFoundError
def wolf_xml(url="http://pauillac.inria.fr/~sagot/index.html", temp_folder=".", fLOG=noLOG):
    """
    Downloads the `WOLF <http://alpage.inria.fr/~sagot/wolf-en.html>`_
    data (Wordnet Libre du Français, Free French Wordnet), a free semantic
    lexical resource (wordnet) for French.
    This data is licensed under `Cecill-C license
    <http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.html>`_.
    Language is French.

    The function reads the page at *url*, takes the first ``https`` link
    pointing to a ``wolf*.bz2`` file, then downloads both that archive and
    the ``debvisdic-strict.dtd`` file into *temp_folder*. If
    ``wolf-1.0b4.xml`` is not in *temp_folder* afterwards, it falls back to
    downloading ``wolf-1.0b4.xml.zip``.

    :param url: url of the page which contains the link to the archive
    :param fLOG: logging function
    :param temp_folder: where to download
    :return: list of files (archive content followed by the DTD); in the
        fallback case, whatever ``download_data`` returns for the zip file
    :raises LinkNotFoundError: if the page contains no link to a *.bz2* file
    :raises FileNotFoundError: if ``wolf-1.0b4.xml`` is still missing
        after the fallback download

    :githublink:`%|py|29`
    """
    html = download_page(url)
    matches = re.findall("href=\\\"(https.*?wolf.*?[.]bz2)\\\"", html)
    if not matches:
        raise LinkNotFoundError(  # pragma: no cover
            "unable to find a link on a .bz2 file on page\n{}".format(html))

    # Only the first matching link is used; it is split to rebuild
    # the folder of the archive and the folder of the DTD.
    parts = matches[0].split("/")
    archive_folder = "/".join(parts[:-1]) + "/"
    # The DTD is fetched from a sibling folder named "31718".
    dtd_folder = "/".join(parts[:-2]) + "/31718/"

    # NOTE(review): "xd" is passed as a second location to download_data,
    # presumably an alias for a mirror -- confirm against pyensae.
    dtd = download_data("debvisdic-strict.dtd", url=[dtd_folder, "xd"],
                        fLOG=fLOG, whereTo=temp_folder)
    archive_name = parts[-1].strip('.')
    archive_files = download_data(
        archive_name, url=[archive_folder, "xd"],
        fLOG=fLOG, whereTo=temp_folder)
    if isinstance(archive_files, str):
        archive_files = [archive_files]

    # The XML file must be in the destination folder at this point.
    expected = os.path.join(temp_folder, "wolf-1.0b4.xml")
    if os.path.exists(expected):
        if isinstance(dtd, list):
            return archive_files + dtd
        return archive_files + [dtd]  # pragma: no cover

    # Fallback: the expected file is missing, try the zipped version.
    fallback = download_data("wolf-1.0b4.xml.zip",  # pragma: no cover
                             whereTo=temp_folder, fLOG=fLOG)
    if not os.path.exists(expected):  # pragma: no cover
        raise FileNotFoundError(expected)
    return fallback  # pragma: no cover
def enumerate_wolf_xml_row(filename, fLOG=noLOG, xmlformat=False, encoding="utf-8", errors=None):
    """
    Iterates over the elements of an XML file returned by function
    :func:`wolf_xml <actuariat_python.data.wolf.wolf_xml>`.
    Every argument is forwarded to ``xml_filter_iterator``.

    :param filename: filename
    :param fLOG: logging function
    :param xmlformat: if True, return the xml, otherwise return the node,
        see `XMLHandlerDictNode <http://www.xavierdupre.fr/app/pyrsslocal/
        helpsphinx/pyrsslocal/xmlhelper/xml_tree_node.html#
        module-pyrsslocal.xmlhelper.xml_tree_node>`_
    :param encoding: encoding
    :param errors: what to do with errors
    :return: elements

    :githublink:`%|py|76`
    """
    yield from xml_filter_iterator(
        filename, xmlformat=xmlformat, fLOG=fLOG,
        encoding=encoding, errors=errors)
def enumerate_wolf_synonyms(filename, fLOG=noLOG, encoding="utf-8", errors=None):
    """
    Enumerates lists of synonyms.
    For every row of the XML file, the function collects the values of the
    fields named ``SYNSET/SYNONYM/LITERAL/_`` and yields them as a list
    when there are at least two of them.
    Language is French.

    :param filename: xml file
    :param fLOG: logging function
    :param encoding: encoding
    :param errors: what to do with errors
    :return: iterator on list of words

    :githublink:`%|py|91`
    """
    literal_field = "SYNSET/SYNONYM/LITERAL/_"
    rows = enumerate_wolf_xml_row(
        filename, fLOG=fLOG, encoding=encoding, errors=errors)
    for node in rows:
        words = []
        for field, value in node.iterfields():
            if field == literal_field:
                words.append(value)
        # A single word has no synonym, it is skipped.
        if len(words) >= 2:
            yield words