Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief Datasets related to sentiment analysis of text. 

5""" 

6import os 

7import pandas 

8 

9 

def load_sentiment_dataset(cache="."):
    """
    Return a set of English sentences, each labelled with a
    positive or negative sentiment.
    Source:
    `Sentiment Labelled Sentences Data Set <https://archive.ics.uci.edu/
    ml/datasets/Sentiment+Labelled+Sentences>`_.
    Notebooks associated with this dataset:

    .. runpython::
        :rst:

        from papierstat.datasets.documentation import list_notebooks_rst_links
        links = list_notebooks_rst_links('lectures', 'text_sentiment_wordvec')
        links = [' * %s' % s for s in links]
        print('\\n'.join(links))

    @param      cache   where to cache or unzip the data if
                        downloaded a second time
    @return             a single :epkg:`pandas:DataFrame` made of every
                        labelled file, with columns *sentence*,
                        *sentiment* and *source* (*source* is the name
                        of the file the row comes from, without its
                        extension)
    @raises ValueError  if the archive does not expand into
                        the expected 9 entries
    """
    from pyensae.datasource import download_data
    # The upstream archive was published under
    # https://archive.ics.uci.edu/ml/machine-learning-databases/00331/
    # No url is passed here, so download_data uses its own default source.
    name = "sentiment_labelled_sentences.zip"
    res = download_data(name, whereTo=cache)
    if len(res) != 9:
        raise ValueError(  # pragma: no cover
            "Unzipping '{0}' failed.".format(name))
    dfs = []
    for fi in res:
        # The extension and readme filters look at the file name only,
        # so that the name of the cache directory cannot accidentally
        # discard (or keep) files. "__MACOSX" is a folder inside the
        # archive, hence it is searched in the whole path.
        base = os.path.basename(fi)
        if (not base.endswith(".txt") or "readme" in base or
                "__MACOSX" in fi):
            continue
        # quoting=3 is csv.QUOTE_NONE: quote characters found in the
        # sentences are kept as regular characters.
        df = pandas.read_csv(fi, sep='\t', quoting=3,
                             names=['sentence', 'sentiment'])
        df["source"] = os.path.splitext(base)[0]
        dfs.append(df)
    return pandas.concat(dfs)