Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2@file
3@brief Functions to retrieve data from Wikipedia
4"""
5import os
6from pyquickhelper.loghelper import noLOG
7from pyquickhelper.filehelper import get_url_content_timeout, ungzip_files
8from .data_exceptions import DataException
def download_pageviews(dt, folder=".", unzip=True, timeout=-1,
                       overwrite=False, fLOG=noLOG):
    """
    Downloads wikipedia pageviews for a precise date (up to the hour),
    the url follows the pattern::

        https://dumps.wikimedia.org/other/pageviews/%Y/%Y-%m/pageviews-%Y%m%d-%H0000.gz

    @param dt datetime
    @param folder where to download
    @param unzip unzip the file
    @param timeout timeout
    @param overwrite overwrite
    @param fLOG logging function
    @return filename

    More information on page
    `pageviews <https://dumps.wikimedia.org/other/pageviews/>`_.
    """
    url = "https://dumps.wikimedia.org/other/pageviews/%Y/%Y-%m/pageviews-%Y%m%d-%H0000.gz"
    url = dt.strftime(url)
    file = url.split("/")[-1]
    name = os.path.join(folder, file)
    unzipname = os.path.splitext(name)[0]
    # Skip the download when either the archive or its unzipped version
    # is already on disk (unless overwrite is requested).
    if overwrite or (not os.path.exists(name) and not os.path.exists(unzipname)):
        get_url_content_timeout(url, timeout=timeout,
                                encoding=None, output=name, chunk=2**20, fLOG=fLOG)
    if unzip and not os.path.exists(unzipname):
        names = ungzip_files(name, unzip=False, where_to=folder)
        os.remove(name)
        if isinstance(names, list):
            if len(names) != 1:
                raise DataException(  # pragma: no cover
                    "Expecting only one file, not '{0}'".format(names))
            return names[0]
        return names
    # Bug fix, consistent with download_dump: when unzipping was requested
    # and the unzipped file already exists (the .gz may have been removed by
    # a previous call), return the unzipped path, not the archive path.
    return unzipname if unzip else name
def download_dump(country, name, folder=".", unzip=True, timeout=-1,
                  overwrite=False, fLOG=noLOG):
    """
    Downloads *wikipedia dumps* from
    `dumps.wikimedia.org/frwiki/latest/
    <https://dumps.wikimedia.org/frwiki/latest/>`_.

    @param country country
    @param name name of the stream to download
    @param folder where to download
    @param unzip unzip the file
    @param timeout timeout
    @param overwrite overwrite
    @param fLOG logging function
    @return local filename
    """
    url = "https://dumps.wikimedia.org/{0}wiki/latest/{0}wiki-{1}".format(
        country, name)
    filename = url.rsplit("/", 1)[-1]
    dest = os.path.join(folder, filename)
    dest_unzipped = os.path.splitext(dest)[0]
    # Download only when neither the archive nor its unzipped version
    # is already present, unless an overwrite is requested.
    already_there = os.path.exists(dest) or os.path.exists(dest_unzipped)
    if overwrite or not already_there:
        get_url_content_timeout(url, timeout=timeout, encoding=None,
                                output=dest, chunk=2**20, fLOG=fLOG)
    if unzip and not os.path.exists(dest_unzipped):
        produced = ungzip_files(dest, unzip=False, where_to=folder)
        os.remove(dest)
        if isinstance(produced, list):
            if len(produced) != 1:
                raise DataException(  # pragma: no cover
                    "Expecting only one file, not '{0}'".format(produced))
            return produced[0]
        return produced
    return dest[:-3] if dest.endswith('.gz') else dest
def download_titles(country, folder=".", unzip=True, timeout=-1,
                    overwrite=False, fLOG=noLOG):
    """
    Downloads wikipedia titles from
    `dumps.wikimedia.org/frwiki/latest/latest-all-titles-in-ns0.gz
    <https://dumps.wikimedia.org/frwiki/latest/latest-all-titles-in-ns0.gz>`_.

    @param country country
    @param folder where to download
    @param unzip unzip the file
    @param timeout timeout
    @param overwrite overwrite
    @param fLOG logging function
    """
    # Thin wrapper: the titles file is just one particular dump stream.
    stream = "latest-all-titles-in-ns0.gz"
    return download_dump(country, stream, folder,
                         unzip=unzip, timeout=timeout,
                         overwrite=overwrite, fLOG=fLOG)
def normalize_wiki_text(text):
    """
    Normalizes a text such as a wikipedia title.

    @param text text to normalize
    @return normalized text
    """
    # Underscores stand for spaces in wikipedia titles;
    # doubled single quotes are wiki markup for a double quote.
    without_underscores = text.replace("_", " ")
    return without_underscores.replace("''", '"')
def enumerate_titles(filename, norm=True, encoding="utf8"):
    """
    Enumerates titles from a file, one title per line.

    @param filename filename
    @param norm normalize in the function
    @param encoding encoding
    """
    with open(filename, "r", encoding=encoding) as f:
        for raw in f:
            title = raw.strip(" \r\n\t")
            # Normalization is optional so callers can keep raw titles.
            yield normalize_wiki_text(title) if norm else title