Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf-8 -*-
2"""
3@file
4@brief Defines a set of modules to try
5"""
7from ..installhelper.module_install import ModuleInstall
def scraping_set():
    """
    Build the list of modules useful for scraping the web.

    This set is meant to be installed on top of the modules in set *small*.

    @return     list of ``ModuleInstall`` entries (``None`` entries are dropped)
    """
    candidates = [
        ModuleInstall(
            "langdetect", "pip", usage="WEB",
            purpose="Language detection library ported from Google's language-detection."),
        ModuleInstall(
            "justext", "pip", usage="WEB",
            purpose=(
                "Program jusText is a tool for removing boilerplate content, such as navigation links, headers, "
                "and footers from HTML pages. It is designed to preserve mainly text containing full "
                "sentences and it is therefore well suited for creating linguistic resources such as Web corpora."
            )),
        ModuleInstall(
            "tldextract", "pip", usage="WEB",
            purpose="Accurately separate the TLD from the registered domain and subdomains of a URL, using the Public Suffix List."),
        ModuleInstall(
            "cchardet", "wheel", usage="WEB",
            purpose="Universal encoding detector. This library is faster than chardet."),
        # NOTE(review): the only entry declared without usage="WEB" -- kept
        # as is; confirm whether the omission is intentional.
        ModuleInstall(
            "multidict", "pip",
            purpose="Multidicts are useful for working with HTTP headers, URL query args etc."),
        ModuleInstall(
            "async_timeout", "pip", usage="WEB",
            purpose="Timeout context manager for asyncio programs"),
        ModuleInstall(
            "yarl", "pip", usage="WEB",
            purpose="Yet another URL library"),
        ModuleInstall(
            "idna_ssl", "pip", usage="WEB",
            purpose="Patch ssl.match_hostname for Unicode(idna) domains support"),
        ModuleInstall(
            "aiohttp", "wheel", usage="WEB",
            purpose="http client/server for asyncio"),
        ModuleInstall(
            "sky", "pip", usage="WEB",
            purpose=(
                "sky is a web scraping framework, implemented with the latest python versions in mind (3.4+). "
                "It uses the asynchronous asyncio framework, "
                "as well as many popular modules and extensions."
            )),
    ]

    # Drop any placeholder left as None so callers only receive real entries.
    return [entry for entry in candidates if entry is not None]