Source code for mlstatpy.data.wikipedia

"""
Functions to retrieve data from Wikipedia


:githublink:`%|py|5`
"""
import os
from pyquickhelper.loghelper import noLOG
from pyquickhelper.filehelper import get_url_content_timeout, ungzip_files
from .data_exceptions import DataException


def download_pageviews(dt, folder=".", unzip=True, timeout=-1,
                       overwrite=False, fLOG=noLOG):
    """
    Downloads wikipedia pageviews for a precise date (up to the hour),
    the url follows the pattern::

        https://dumps.wikimedia.org/other/pageviews/%Y/%Y-%m/pageviews-%Y%m%d-%H0000.gz

    :param dt: datetime
    :param folder: where to download
    :param unzip: unzip the file
    :param timeout: timeout
    :param overwrite: overwrite
    :param fLOG: logging function
    :return: filename

    More information on page
    `pageviews <https://dumps.wikimedia.org/other/pageviews/>`_.
    """
    url = ("https://dumps.wikimedia.org/other/pageviews/%Y/%Y-%m/"
           "pageviews-%Y%m%d-%H0000.gz")
    url = dt.strftime(url)
    file = url.split("/")[-1]
    name = os.path.join(folder, file)
    unzipname = os.path.splitext(name)[0]
    if overwrite or (not os.path.exists(name) and
                     not os.path.exists(unzipname)):
        get_url_content_timeout(url, timeout=timeout, encoding=None,
                                output=name, chunk=2**20, fLOG=fLOG)
    if unzip and not os.path.exists(unzipname):
        names = ungzip_files(name, unzip=False, where_to=folder)
        os.remove(name)
        if isinstance(names, list):
            if len(names) != 1:
                raise DataException(  # pragma: no cover
                    "Expecting only one file, not '{0}'".format(names))
            return names[0]
        return names
    return name
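
# Usage sketch (illustrative, not part of the original module): fetch the
# pageviews file for one specific hour. The date below is an assumption
# made for the example; any hour covered by the dumps would work.
#
#     from datetime import datetime
#     name = download_pageviews(datetime(2020, 1, 1, 14), folder=".")
#     # On a fresh download with unzip=True (the default), 'name' is the
#     # path of the uncompressed pageviews file.
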
def download_dump(country, name, folder=".", unzip=True, timeout=-1,
                  overwrite=False, fLOG=noLOG):
    """
    Downloads *wikipedia dumps* from
    `dumps.wikimedia.org/frwiki/latest/
    <https://dumps.wikimedia.org/frwiki/latest/>`_.

    :param country: country
    :param name: name of the stream to download
    :param folder: where to download
    :param unzip: unzip the file
    :param timeout: timeout
    :param overwrite: overwrite
    :param fLOG: logging function
    :return: filename
    """
    url = "https://dumps.wikimedia.org/{0}wiki/latest/{0}wiki-{1}".format(
        country, name)
    file = url.split("/")[-1]
    name = os.path.join(folder, file)
    unzipname = os.path.splitext(name)[0]
    if overwrite or (not os.path.exists(name) and
                     not os.path.exists(unzipname)):
        get_url_content_timeout(url, timeout=timeout, encoding=None,
                                output=name, chunk=2**20, fLOG=fLOG)
    if unzip and not os.path.exists(unzipname):
        names = ungzip_files(name, unzip=False, where_to=folder)
        os.remove(name)
        if isinstance(names, list):
            if len(names) != 1:
                raise DataException(  # pragma: no cover
                    "Expecting only one file, not '{0}'".format(names))
            return names[0]
        return names
    return name[:-3] if name.endswith('.gz') else name
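
# Usage sketch (illustrative): download one stream of the latest French
# dump. The stream name below is only an example of the kind of file
# listed under https://dumps.wikimedia.org/frwiki/latest/.
#
#     name = download_dump("fr", "latest-abstract.xml.gz")
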
def download_titles(country, folder=".", unzip=True, timeout=-1,
                    overwrite=False, fLOG=noLOG):
    """
    Downloads wikipedia titles from
    `dumps.wikimedia.org/frwiki/latest/latest-all-titles-in-ns0.gz
    <https://dumps.wikimedia.org/frwiki/latest/latest-all-titles-in-ns0.gz>`_.

    :param country: country
    :param folder: where to download
    :param unzip: unzip the file
    :param timeout: timeout
    :param overwrite: overwrite
    :param fLOG: logging function
    :return: filename
    """
    return download_dump(country, "latest-all-titles-in-ns0.gz",
                         folder, unzip=unzip, timeout=timeout,
                         overwrite=overwrite, fLOG=fLOG)
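
# Usage sketch (illustrative): retrieve every article title (namespace 0)
# of the French wikipedia.
#
#     name = download_titles("fr")
#     # On a fresh download with unzip=True, 'name' is the path of the
#     # uncompressed titles file.
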
def normalize_wiki_text(text):
    """
    Normalizes a text such as a wikipedia title.

    :param text: text to normalize
    :return: normalized text
    """
    return text.replace("_", " ").replace("''", '"')
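
# Example (illustrative): normalize_wiki_text("Jeux_olympiques_d''été")
# returns 'Jeux olympiques d"été': underscores become spaces and doubled
# single quotes become a double quote.
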
def enumerate_titles(filename, norm=True, encoding="utf8"):
    """
    Enumerates titles from a file.

    :param filename: filename
    :param norm: normalize each title with :func:`normalize_wiki_text`
    :param encoding: encoding
    """
    if norm:
        with open(filename, "r", encoding=encoding) as f:
            for line in f:
                yield normalize_wiki_text(line.strip(" \r\n\t"))
    else:
        with open(filename, "r", encoding=encoding) as f:
            for line in f:
                yield line.strip(" \r\n\t")
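
# Usage sketch (illustrative): stream the titles previously fetched with
# download_titles. The filename is an assumption matching the default
# download location for the French wikipedia.
#
#     for title in enumerate_titles("frwiki-latest-all-titles-in-ns0"):
#         print(title)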