"""
Functions to retrieve data from Wikipedia
:githublink:`%|py|5`
"""
import os
from pyquickhelper.loghelper import noLOG
from pyquickhelper.filehelper import get_url_content_timeout, ungzip_files
from .data_exceptions import DataException
[docs]def download_pageviews(dt, folder=".", unzip=True, timeout=-1,
overwrite=False, fLOG=noLOG):
"""
Downloads wikipedia pagacount for a precise date (up to the hours),
the url follows the pattern::
https://dumps.wikimedia.org/other/pageviews/%Y/%Y-%m/pagecounts-%Y%m%d-%H0000.gz
:param dt: datetime
:param folder: where to download
:param unzip: unzip the file
:param timeout: timeout
:param overwrite: overwrite
:param fLOG: logging function
:return: filename
More information on page
`pageviews <https://dumps.wikimedia.org/other/pageviews/>`_.
:githublink:`%|py|29`
"""
url = "https://dumps.wikimedia.org/other/pageviews/%Y/%Y-%m/pageviews-%Y%m%d-%H0000.gz"
url = dt.strftime(url)
file = url.split("/")[-1]
name = os.path.join(folder, file)
unzipname = os.path.splitext(name)[0]
if overwrite or (not os.path.exists(name) and not os.path.exists(unzipname)):
get_url_content_timeout(url, timeout=timeout,
encoding=None, output=name, chunk=2**20, fLOG=fLOG)
if unzip and not os.path.exists(unzipname):
names = ungzip_files(name, unzip=False, where_to=folder)
os.remove(name)
if isinstance(names, list):
if len(names) != 1:
raise DataException( # pragma: no cover
"Expecting only one file, not '{0}'".format(names))
return names[0]
return names
return name
[docs]def download_dump(country, name, folder=".", unzip=True, timeout=-1,
overwrite=False, fLOG=noLOG):
"""
Downloads *wikipedia dumps* from
`dumps.wikimedia.org/frwiki/latest/
<https://dumps.wikimedia.org/frwiki/latest/>`_.
:param country: country
:param name: name of the stream to download
:param folder: where to download
:param unzip: unzip the file
:param timeout: timeout
:param overwrite: overwrite
:param fLOG: logging function
:githublink:`%|py|64`
"""
url = "https://dumps.wikimedia.org/{0}wiki/latest/{0}wiki-{1}".format(
country, name)
file = url.split("/")[-1]
name = os.path.join(folder, file)
unzipname = os.path.splitext(name)[0]
if overwrite or (not os.path.exists(name) and not os.path.exists(unzipname)):
get_url_content_timeout(url, timeout=timeout,
encoding=None, output=name, chunk=2**20, fLOG=fLOG)
if unzip and not os.path.exists(unzipname):
names = ungzip_files(name, unzip=False, where_to=folder)
os.remove(name)
if isinstance(names, list):
if len(names) != 1:
raise DataException( # pragma: no cover
"Expecting only one file, not '{0}'".format(names))
return names[0]
return names
return name[:-3] if name.endswith('.gz') else name
[docs]def download_titles(country, folder=".", unzip=True, timeout=-1,
overwrite=False, fLOG=noLOG):
"""
Downloads wikipedia titles from
`dumps.wikimedia.org/frwiki/latest/latest-all-titles-in-ns0.gz
<https://dumps.wikimedia.org/frwiki/latest/latest-all-titles-in-ns0.gz>`_.
:param country: country
:param folder: where to download
:param unzip: unzip the file
:param timeout: timeout
:param overwrite: overwrite
:param fLOG: logging function
:githublink:`%|py|98`
"""
return download_dump(country, "latest-all-titles-in-ns0.gz",
folder, unzip=unzip, timeout=timeout,
overwrite=overwrite, fLOG=fLOG)
[docs]def normalize_wiki_text(text):
"""
Normalizes a text such as a wikipedia title.
:param text: text to normalize
:return: normalized text
:githublink:`%|py|110`
"""
return text.replace("_", " ").replace("''", '"')
[docs]def enumerate_titles(filename, norm=True, encoding="utf8"):
"""
Enumerates titles from a file.
:param filename: filename
:param norm: normalize in the function
:param encoding: encoding
:githublink:`%|py|121`
"""
if norm:
with open(filename, "r", encoding=encoding) as f:
for line in f:
yield normalize_wiki_text(line.strip(" \r\n\t"))
else:
with open(filename, "r", encoding=encoding) as f:
for line in f:
yield line.strip(" \r\n\t")