Code source de papierstat.datasets.sentiment

# -*- coding: utf-8 -*-
"""
Datasets related to sentiment analysis.


:githublink:`%|py|6`
"""
import os
import pandas


def load_sentiment_dataset(cache="."):
    """
    Load a set of English sentences, each labelled with a positive or
    negative sentiment. Source:
    `Sentiment Labelled Sentences Data Set
    <https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences>`_.
    Notebooks associated with this dataset:

    .. runpython::
        :rst:

        from papierstat.datasets.documentation import list_notebooks_rst_links
        links = list_notebooks_rst_links('lectures', 'text_sentiment_wordvec')
        links = [' * %s' % s for s in links]
        print('\\n'.join(links))

    :param cache: folder handed to ``download_data`` as ``whereTo``
        (where the archive is stored and unzipped)
    :return: a :class:`pandas.DataFrame` with the columns ``sentence``,
        ``sentiment`` and ``source``; ``source`` holds the name of the file
        (without extension) each row was read from. The per-file frames are
        concatenated as is, so row labels restart for every file.
    :raises ValueError: if ``download_data`` does not return exactly
        9 entries for the archive
    """
    # Imported lazily so that pyensae is only needed when the data is loaded.
    from pyensae.datasource import download_data

    # Upstream location, kept for reference:
    # https://archive.ics.uci.edu/ml/machine-learning-databases/00331/
    name = "sentiment_labelled_sentences.zip"
    extracted = download_data(name, whereTo=cache)
    if len(extracted) != 9:
        raise ValueError(  # pragma: no cover
            "Unzipping '{0}' failed.".format(name))

    def _is_data_file(path):
        # Keep text files only, skipping the readme and macOS metadata entries.
        return (".txt" in path
                and "readme" not in path
                and "__MACOSX" not in path)

    frames = []
    for path in filter(_is_data_file, extracted):
        stem, _ = os.path.splitext(os.path.basename(path))
        # Tab-separated "sentence<TAB>label" lines without a header row;
        # quoting=3 is csv.QUOTE_NONE, so quote characters stay in the text.
        frame = pandas.read_csv(path, sep='\t', quoting=3,
                                names=['sentence', 'sentiment'])
        frames.append(frame.assign(source=stem))
    return pandas.concat(frames)