Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief Defines a set of modules to try 

5""" 

6 

7from ..installhelper.module_install import ModuleInstall 

8 

9 

def scraping_set():
    """
    Modules which help scraping the web,
    it requires the modules in set *small*.

    @return         list of ModuleInstall objects, one per module of the set
                    (``None`` placeholders, if any, are removed)
    """
    modules = [
        ModuleInstall(
            "langdetect", "pip", usage="WEB",
            purpose="Language detection library ported from Google's language-detection."),
        ModuleInstall(
            "justext", "pip", usage="WEB",
            purpose="Program jusText is a tool for removing boilerplate content, such as navigation links, headers, " +
            "and footers from HTML pages. It is designed to preserve mainly text containing full " +
            "sentences and it is therefore well suited for creating linguistic resources such as Web corpora."),
        ModuleInstall(
            "tldextract", "pip", usage="WEB",
            purpose="Accurately separate the TLD from the registered domain and subdomains of a URL, using the Public Suffix List."),
        ModuleInstall(
            "cchardet", "wheel", usage="WEB",
            purpose="Universal encoding detector. This library is faster than chardet."),
        # Tagged WEB like every other entry of this set; it was previously
        # the only module of the set left without a usage.
        ModuleInstall(
            "multidict", "pip", usage="WEB",
            purpose="Multidicts are useful for working with HTTP headers, URL query args etc."),
        ModuleInstall(
            "async_timeout", "pip", usage="WEB",
            purpose="Timeout context manager for asyncio programs"),
        ModuleInstall(
            "yarl", "pip", usage="WEB",
            purpose="Yet another URL library"),
        ModuleInstall(
            "idna_ssl", "pip", usage="WEB",
            purpose="Patch ssl.match_hostname for Unicode(idna) domains support"),
        ModuleInstall(
            "aiohttp", "wheel", usage="WEB",
            purpose="http client/server for asyncio"),
        ModuleInstall(
            "sky", "pip", usage="WEB",
            purpose="sky is a web scraping framework, implemented with the latest python versions in mind (3.4+). " +
            "It uses the asynchronous asyncio framework, " +
            "as well as many popular modules and extensions."),
    ]

    # Keep the None filter: it lets an entry be replaced by None
    # without the callers ever seeing it.
    return [module for module in modules if module is not None]