%matplotlib inline


from jyquickhelper import add_notebook_menu
add_notebook_menu()


import os
# Print every environment variable whose name contains "SPARK", masking the
# current user name in the values.
# USERNAME is only defined on Windows, so fall back to USER; when neither is
# set the values are printed unmasked (str.replace with an empty needle would
# insert the placeholder between every character).
masked_user = os.environ.get("USERNAME") or os.environ.get("USER") or ""
for o, v in sorted(os.environ.items()):
    if "SPARK" in o.upper():
        if masked_user:
            v = v.replace(masked_user, "<username>")
        print("{0:25}= {1}".format(o, v))

LOCAL_PYSPARK            = c:\<username>rdupre\spark-2.2.0-bin-hadoop2.7
PYSPARK_DRIVER_PYTHON    = jupyter-notebook
PYSPARK_PYTHON           = c:\Python36_x64\python
PYSPARK_SUBMIT_ARGS      = "--name" "PySparkShell" "pyspark-shell" 
SPARK_CMD                = set PYSPARK_SUBMIT_ARGS="--name" "PySparkShell" "pyspark-shell" && jupyter-notebook 
SPARK_ENV_LOADED         = 1
SPARK_HIVE               = true
SPARK_HOME               = c:\<username>rdupre\spark-2.2.0-bin-hadoop2.7\bin\..
SPARK_JARS_DIR           = "c:\<username>rdupre\spark-2.2.0-bin-hadoop2.7\bin\..\jars"
SPARK_SCALA_VERSION      = 2.10
_SPARK_CMD_USAGE         = Usage: bin\pyspark.cmd [options]


from pyquickhelper.filehelper import remove_folder
def clean(folder):
    """Remove *folder* when it exists.

    :param folder: path to remove
    :return: the result of ``remove_folder(folder)`` when the path exists,
        an empty list otherwise
    """
    # Nothing on disk: nothing to remove.
    if not os.path.exists(folder):
        return []
    return remove_folder(folder)
clean("fichier.out.txt")

[]


# Word count over the notebook file itself.
# NOTE(review): sc is not defined in this notebook; presumably the
# SparkContext provided by the pyspark shell -- confirm.
text_file = sc.textFile("spark_first_steps.ipynb")
# Split every line on single spaces, emit one (word, 1) pair per token and
# add up the counts per distinct word.
counts = text_file.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
# Despite the .txt suffix, "fichier.out.txt" is used as a folder afterwards
# (it is listed with os.listdir and contains part-* files).
counts.saveAsTextFile("fichier.out.txt")


os.listdir("fichier.out.txt/")

['.part-00000.crc',
 '.part-00001.crc',
 '._SUCCESS.crc',
 'part-00000',
 'part-00001',
 '_SUCCESS']


%load_ext pyensae
%head fichier.out.txt/part-00000 -n 3

('', 11686)
('[collect](http://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.collect)', 1)
('SQL](http://spark.apache.org/docs/latest/sql-programming-guide.html)\\n",', 1)


import os
# Extract the dataset from data_adult.zip into the current folder, but only
# when data_adult.txt is not already on disk.
if not os.path.exists("data_adult.txt"):
    from pyquickhelper.filehelper import unzip_files
    unzip_files("data_adult.zip", where_to=".")
# Stop here if the file is still missing after the extraction attempt.
assert os.path.exists("data_adult.txt")


import pandas
df = pandas.read_csv("data_adult.txt", sep="\t", encoding="utf-8")
df.head()


df.to_csv("adult.txt", sep="\t", encoding="utf-8", index=False, header=None)


%head adult.txt -n 2

39	 State-gov	77516	 Bachelors	13	 Never-married	 Adm-clerical	 Not-in-family	 White	 Male	2174	0	40	 United-States	 <=50K
50	 Self-emp-not-inc	83311	 Bachelors	13	 Married-civ-spouse	 Exec-managerial	 Husband	 White	 Male	0	0	13	 United-States	 <=50K


rdd = sc.textFile("adult.txt")


import os
# Create the "out" folder on first run; later cells write below it.
if not os.path.exists("out"):
    os.mkdir("out")


# Remove the output of a previous run before saving again.
# NOTE(review): presumably needed because Spark refuses to write into an
# existing output directory -- confirm.
clean("out/copy_adult.txt")
# The destination is passed as an absolute path.
rdd.saveAsTextFile(os.path.abspath("out/copy_adult.txt"))


%head out/copy_adult.txt/part-00000 -n 2

39	 State-gov	77516	 Bachelors	13	 Never-married	 Adm-clerical	 Not-in-family	 White	 Male	2174	0	40	 United-States	 <=50K
50	 Self-emp-not-inc	83311	 Bachelors	13	 Married-civ-spouse	 Exec-managerial	 Husband	 White	 Male	0	0	13	 United-States	 <=50K


import glob
import pandas
def read_rdd(path, **options):
    """Load the part files of a saved RDD into a single pandas DataFrame.

    Every file matching ``part*`` inside *path* is read with
    ``pandas.read_csv(..., header=None)`` and the pieces are concatenated
    in file-name order under a fresh 0..n-1 index.

    :param path: folder containing the ``part*`` files
    :param options: extra keyword arguments forwarded to ``pandas.read_csv``
    :return: the concatenated DataFrame
    :raises FileNotFoundError: if *path* contains no ``part*`` file
    :raises Exception: if one of the files cannot be read; the original
        error is chained as the cause
    """
    # glob gives no ordering guarantee; sort by name so that part-00000,
    # part-00001, ... are always concatenated in the same order.
    part_files = sorted(glob.glob(os.path.join(path, "part*")))
    if not part_files:
        raise FileNotFoundError("No file to read in '{0}'".format(path))
    frames = []
    for name in part_files:
        try:
            frames.append(pandas.read_csv(name, header=None, **options))
        except Exception as e:
            # Report which file failed while keeping the underlying error.
            raise Exception("Unable to read '{0}'".format(name)) from e
    return pandas.concat(frames, ignore_index=True)

data = read_rdd("out/copy_adult.txt", sep="\t", encoding="utf-8")
data.head(n=2)


res = rdd.collect()


res[:2]

['39\t State-gov\t77516\t Bachelors\t13\t Never-married\t Adm-clerical\t Not-in-family\t White\t Male\t2174\t0\t40\t United-States\t <=50K',
 '50\t Self-emp-not-inc\t83311\t Bachelors\t13\t Married-civ-spouse\t Exec-managerial\t Husband\t White\t Male\t0\t0\t13\t United-States\t <=50K']


import pandas
df = pandas.DataFrame([_.split("\t") for _ in res])
df.head(2)


def extract_column(cols, row):
    """Pick fields out of a tab-separated row.

    :param cols: positions of the fields to keep (negative indices allowed)
    :param row: one line of text with tab-separated fields
    :return: list of the selected fields, stripped, in the order of *cols*
    """
    fields = row.split("\t")
    selected = []
    for position in cols:
        selected.append(fields[position].strip())
    return selected

res = rdd.map(lambda row: extract_column([1,3], row))
res.collect()[:2]

[['State-gov', 'Bachelors'], ['Self-emp-not-inc', 'Bachelors']]


def filter_column(row):
    """Tell whether a tab-separated row should be kept.

    :param row: one line of text with tab-separated fields
    :return: True when the last field, once stripped, differs from "<=50K"
    """
    # Everything after the last tab (the whole row when there is no tab).
    _, _, last_field = row.rpartition("\t")
    return last_field.strip() != "<=50K"

res = rdd.filter(lambda row: filter_column(row))
res.collect()[:2]

['52\t Self-emp-not-inc\t209642\t HS-grad\t9\t Married-civ-spouse\t Exec-managerial\t Husband\t White\t Male\t0\t0\t45\t United-States\t >50K',
 '31\t Private\t45781\t Masters\t14\t Never-married\t Prof-specialty\t Not-in-family\t White\t Female\t14084\t0\t50\t United-States\t >50K']


def filter_column_split(row):
    """Same test as on a raw line, but for a row already split into fields.

    :param row: sequence of string fields
    :return: True when the last field, once stripped, differs from "<=50K"
    """
    target = row[-1].strip()
    return target != "<=50K"

res = rdd.map(lambda row: extract_column([1,3,-1], row)) \
         .filter(lambda row: filter_column_split(row))
res.collect()[:2]

[['Self-emp-not-inc', 'HS-grad', '>50K'], ['Private', 'Masters', '>50K']]


def extract_column_and_multiply_row(n, row):
    """Turn a tab-separated row into a tuple of stripped fields, repeated.

    :param n: number of times the tuple appears in the result
    :param row: one line of text with tab-separated fields
    :return: list holding the same tuple *n* times
    """
    record = tuple(map(str.strip, row.split("\t")))
    return [record for _ in range(n)]

res = rdd.flatMap(lambda row: extract_column_and_multiply_row(2, row))
res.collect()[:3]

[('39',
  'State-gov',
  '77516',
  'Bachelors',
  '13',
  'Never-married',
  'Adm-clerical',
  'Not-in-family',
  'White',
  'Male',
  '2174',
  '0',
  '40',
  'United-States',
  '<=50K'),
 ('39',
  'State-gov',
  '77516',
  'Bachelors',
  '13',
  'Never-married',
  'Adm-clerical',
  'Not-in-family',
  'White',
  'Male',
  '2174',
  '0',
  '40',
  'United-States',
  '<=50K'),
 ('50',
  'Self-emp-not-inc',
  '83311',
  'Bachelors',
  '13',
  'Married-civ-spouse',
  'Exec-managerial',
  'Husband',
  'White',
  'Male',
  '0',
  '0',
  '13',
  'United-States',
  '<=50K')]


def extract_age_rich(row):
    """Reduce a tab-separated row to its first and last fields.

    :param row: one line of text with tab-separated fields
    :return: tuple (first field converted to float, last field stripped)
    """
    fields = row.split("\t")
    return float(fields[0]), fields[-1].strip()

def custom_agg(aggset):
    """Count the items of a group and add up their first components.

    :param aggset: iterable of indexable items (item[0] is the value summed)
    :return: tuple (number of items, sum of item[0] over all items)
    """
    count = 0
    total = 0
    for item in aggset:
        count += 1
        total += item[0]
    return count, total

# Map every row to (age, target), group the pairs by target, then reduce each
# group to (number of rows, sum of ages) with custom_agg.
ave = rdd.map(extract_age_rich).groupBy(lambda row: row[1]).mapValues(custom_agg)
# Bring the per-target aggregates back to the driver and display them.
fin = ave.collect()
fin

[('>50K', (7841, 346963.0)), ('<=50K', (24720, 909294.0))]


# Split every row on tabs and key it by its stripped last field, the same
# key ave is grouped on; the fields kept as value are not stripped.
add_key = rdd.map(lambda row: row.split("\t")).map(lambda row: (row[-1].strip(), row))
# Attach the per-target (count, sum of ages) aggregate to every row.
join = add_key.join(ave)
join.collect()[:2]

[('>50K',
  (['52',
    ' Self-emp-not-inc',
    '209642',
    ' HS-grad',
    '9',
    ' Married-civ-spouse',
    ' Exec-managerial',
    ' Husband',
    ' White',
    ' Male',
    '0',
    '0',
    '45',
    ' United-States',
    ' >50K'],
   (7841, 346963.0))),
 ('>50K',
  (['31',
    ' Private',
    '45781',
    ' Masters',
    '14',
    ' Never-married',
    ' Prof-specialty',
    ' Not-in-family',
    ' White',
    ' Female',
    '14084',
    '0',
    '50',
    ' United-States',
    ' >50K'],
   (7841, 346963.0)))]


from pyspark.context import SparkContext
ages = sc.broadcast([20, 30, 40])
ages.value

[20, 30, 40]


# Keep the rows whose first field, parsed as an integer, is one of the
# values stored in the broadcast variable ages.
subset = rdd.filter(lambda row: int(row.split("\t")[0]) in ages.value )
subset.collect()[:2]

['30\t State-gov\t141297\t Bachelors\t13\t Married-civ-spouse\t Prof-specialty\t Husband\t Asian-Pac-Islander\t Male\t0\t0\t40\t India\t >50K',
 '40\t Private\t121772\t Assoc-voc\t11\t Married-civ-spouse\t Craft-repair\t Husband\t Asian-Pac-Islander\t Male\t0\t0\t40\t ?\t >50K']


simple_rdd = sc.parallelize([2, 3, 4])
simple_rdd.collect()

[2, 3, 4]


simple_rdd.flatMap(lambda x: range(1, x)).collect()

[1, 1, 2, 1, 2, 3]


import pandas
data = pandas.read_csv("data_adult.txt", sep="\t", encoding="utf-8")
data.head(2)


# Create the SparkSession only when no variable named spark exists yet.
if "spark" not in locals():
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.appName("nimportequoi").getOrCreate()  # only needs to be done once


# sdf = spark.createDataFrame(data)  # this works
# No header option is passed here: the displayed result has columns named
# _c0.._c14 and the header line of the file appears as the first data row.
sdf = spark.read.csv("data_adult.txt", sep="\t", encoding="utf-8")


sdf.show()

+---+-----------------+------+-------------+-------------+--------------------+------------------+--------------+-------------------+-------+------------+------------+--------------+--------------+------+
|_c0|              _c1|   _c2|          _c3|          _c4|                 _c5|               _c6|           _c7|                _c8|    _c9|        _c10|        _c11|          _c12|          _c13|  _c14|
+---+-----------------+------+-------------+-------------+--------------------+------------------+--------------+-------------------+-------+------------+------------+--------------+--------------+------+
|age|        workclass|fnlwgt|    education|education_num|      marital_status|        occupation|  relationship|               race|    sex|capital_gain|capital_loss|hours_per_week|native_country|target|
| 39|        State-gov| 77516|    Bachelors|           13|       Never-married|      Adm-clerical| Not-in-family|              White|   Male|        2174|           0|            40| United-States| <=50K|
| 50| Self-emp-not-inc| 83311|    Bachelors|           13|  Married-civ-spouse|   Exec-managerial|       Husband|              White|   Male|           0|           0|            13| United-States| <=50K|
| 38|          Private|215646|      HS-grad|            9|            Divorced| Handlers-cleaners| Not-in-family|              White|   Male|           0|           0|            40| United-States| <=50K|
| 53|          Private|234721|         11th|            7|  Married-civ-spouse| Handlers-cleaners|       Husband|              Black|   Male|           0|           0|            40| United-States| <=50K|
| 28|          Private|338409|    Bachelors|           13|  Married-civ-spouse|    Prof-specialty|          Wife|              Black| Female|           0|           0|            40|          Cuba| <=50K|
| 37|          Private|284582|      Masters|           14|  Married-civ-spouse|   Exec-managerial|          Wife|              White| Female|           0|           0|            40| United-States| <=50K|
| 49|          Private|160187|          9th|            5| Married-spouse-a...|     Other-service| Not-in-family|              Black| Female|           0|           0|            16|       Jamaica| <=50K|
| 52| Self-emp-not-inc|209642|      HS-grad|            9|  Married-civ-spouse|   Exec-managerial|       Husband|              White|   Male|           0|           0|            45| United-States|  >50K|
| 31|          Private| 45781|      Masters|           14|       Never-married|    Prof-specialty| Not-in-family|              White| Female|       14084|           0|            50| United-States|  >50K|
| 42|          Private|159449|    Bachelors|           13|  Married-civ-spouse|   Exec-managerial|       Husband|              White|   Male|        5178|           0|            40| United-States|  >50K|
| 37|          Private|280464| Some-college|           10|  Married-civ-spouse|   Exec-managerial|       Husband|              Black|   Male|           0|           0|            80| United-States|  >50K|
| 30|        State-gov|141297|    Bachelors|           13|  Married-civ-spouse|    Prof-specialty|       Husband| Asian-Pac-Islander|   Male|           0|           0|            40|         India|  >50K|
| 23|          Private|122272|    Bachelors|           13|       Never-married|      Adm-clerical|     Own-child|              White| Female|           0|           0|            30| United-States| <=50K|
| 32|          Private|205019|   Assoc-acdm|           12|       Never-married|             Sales| Not-in-family|              Black|   Male|           0|           0|            50| United-States| <=50K|
| 40|          Private|121772|    Assoc-voc|           11|  Married-civ-spouse|      Craft-repair|       Husband| Asian-Pac-Islander|   Male|           0|           0|            40|             ?|  >50K|
| 34|          Private|245487|      7th-8th|            4|  Married-civ-spouse|  Transport-moving|       Husband| Amer-Indian-Eskimo|   Male|           0|           0|            45|        Mexico| <=50K|
| 25| Self-emp-not-inc|176756|      HS-grad|            9|       Never-married|   Farming-fishing|     Own-child|              White|   Male|           0|           0|            35| United-States| <=50K|
| 32|          Private|186824|      HS-grad|            9|       Never-married| Machine-op-inspct|     Unmarried|              White|   Male|           0|           0|            40| United-States| <=50K|
| 38|          Private| 28887|         11th|            7|  Married-civ-spouse|             Sales|       Husband|              White|   Male|           0|           0|            50| United-States| <=50K|
+---+-----------------+------+-------------+-------------+--------------------+------------------+--------------+-------------------+-------+------------+------------+--------------+--------------+------+
only showing top 20 rows


df = sdf.toPandas()


df.head()


sdf.rdd

MapPartitionsRDD[59] at javaToPython at null:-2


sdf.schema

StructType(List(StructField(_c0,StringType,true),StructField(_c1,StringType,true),StructField(_c2,StringType,true),StructField(_c3,StringType,true),StructField(_c4,StringType,true),StructField(_c5,StringType,true),StructField(_c6,StringType,true),StructField(_c7,StringType,true),StructField(_c8,StringType,true),StructField(_c9,StringType,true),StructField(_c10,StringType,true),StructField(_c11,StringType,true),StructField(_c12,StringType,true),StructField(_c13,StringType,true),StructField(_c14,StringType,true)))


sdf.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)
 |-- _c12: string (nullable = true)
 |-- _c13: string (nullable = true)
 |-- _c14: string (nullable = true)


import pandas
df = pandas.read_csv("data_adult.txt", sep="\t", encoding="utf-8")
df.head(n=2)


sdf = spark.createDataFrame(df)


sdf.printSchema()

root
 |-- age: long (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: long (nullable = true)
 |-- education: string (nullable = true)
 |-- education_num: long (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital_gain: long (nullable = true)
 |-- capital_loss: long (nullable = true)
 |-- hours_per_week: long (nullable = true)
 |-- native_country: string (nullable = true)
 |-- target: string (nullable = true)


fullsdf = spark.createDataFrame(sdf.rdd, sdf.schema)


fullsdf.printSchema()

root
 |-- age: long (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: long (nullable = true)
 |-- education: string (nullable = true)
 |-- education_num: long (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital_gain: long (nullable = true)
 |-- capital_loss: long (nullable = true)
 |-- hours_per_week: long (nullable = true)
 |-- native_country: string (nullable = true)
 |-- target: string (nullable = true)


fullsdf.write.parquet("data_adult.schema.parquet")


newsdf = spark.read.parquet("data_adult.schema.parquet/")


newsdf.printSchema()

root
 |-- age: long (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: long (nullable = true)
 |-- education: string (nullable = true)
 |-- education_num: long (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital_gain: long (nullable = true)
 |-- capital_loss: long (nullable = true)
 |-- hours_per_week: long (nullable = true)
 |-- native_country: string (nullable = true)
 |-- target: string (nullable = true)


fifty = fullsdf [fullsdf.age > 50]


fifty.show()

+---+-----------------+------+-------------+-------------+-------------------+------------------+---------------+-------------------+-------+------------+------------+--------------+--------------+------+
|age|        workclass|fnlwgt|    education|education_num|     marital_status|        occupation|   relationship|               race|    sex|capital_gain|capital_loss|hours_per_week|native_country|target|
+---+-----------------+------+-------------+-------------+-------------------+------------------+---------------+-------------------+-------+------------+------------+--------------+--------------+------+
| 53|          Private|234721|         11th|            7| Married-civ-spouse| Handlers-cleaners|        Husband|              Black|   Male|           0|           0|            40| United-States| <=50K|
| 52| Self-emp-not-inc|209642|      HS-grad|            9| Married-civ-spouse|   Exec-managerial|        Husband|              White|   Male|           0|           0|            45| United-States|  >50K|
| 54|          Private|302146|      HS-grad|            9|          Separated|     Other-service|      Unmarried|              Black| Female|           0|           0|            20| United-States| <=50K|
| 59|          Private|109015|      HS-grad|            9|           Divorced|      Tech-support|      Unmarried|              White| Female|           0|           0|            40| United-States| <=50K|
| 56|        Local-gov|216851|    Bachelors|           13| Married-civ-spouse|      Tech-support|        Husband|              White|   Male|           0|           0|            40| United-States|  >50K|
| 54|                ?|180211| Some-college|           10| Married-civ-spouse|                 ?|        Husband| Asian-Pac-Islander|   Male|           0|           0|            60|         South|  >50K|
| 53| Self-emp-not-inc| 88506|    Bachelors|           13| Married-civ-spouse|    Prof-specialty|        Husband|              White|   Male|           0|           0|            40| United-States| <=50K|
| 57|      Federal-gov|337895|    Bachelors|           13| Married-civ-spouse|    Prof-specialty|        Husband|              Black|   Male|           0|           0|            40| United-States|  >50K|
| 53|          Private|144361|      HS-grad|            9| Married-civ-spouse| Machine-op-inspct|        Husband|              White|   Male|           0|           0|            38| United-States| <=50K|
| 53|          Private|169846|      HS-grad|            9| Married-civ-spouse|      Adm-clerical|           Wife|              White| Female|           0|           0|            40| United-States|  >50K|
| 79|          Private|124744| Some-college|           10| Married-civ-spouse|    Prof-specialty| Other-relative|              White|   Male|           0|           0|            20| United-States| <=50K|
| 67|                ?|212759|         10th|            6| Married-civ-spouse|                 ?|        Husband|              White|   Male|           0|           0|             2| United-States| <=50K|
| 52|          Private|276515|    Bachelors|           13| Married-civ-spouse|     Other-service|        Husband|              White|   Male|           0|           0|            40|          Cuba| <=50K|
| 59|          Private|159937|      HS-grad|            9| Married-civ-spouse|             Sales|        Husband|              White|   Male|           0|           0|            48| United-States| <=50K|
| 53|          Private|346253|      HS-grad|            9|           Divorced|             Sales|      Own-child|              White| Female|           0|           0|            35| United-States| <=50K|
| 57|          Private|249977|    Assoc-voc|           11| Married-civ-spouse|    Prof-specialty|        Husband|              White|   Male|           0|           0|            40| United-States| <=50K|
| 76|          Private|124191|      Masters|           14| Married-civ-spouse|   Exec-managerial|        Husband|              White|   Male|           0|           0|            40| United-States|  >50K|
| 56| Self-emp-not-inc|335605|      HS-grad|            9| Married-civ-spouse|     Other-service|        Husband|              White|   Male|           0|        1887|            50|        Canada|  >50K|
| 53|          Private| 95647|          9th|            5| Married-civ-spouse| Handlers-cleaners|        Husband|              White|   Male|           0|           0|            50| United-States| <=50K|
| 56|     Self-emp-inc|303090| Some-college|           10| Married-civ-spouse|             Sales|        Husband|              White|   Male|           0|           0|            50| United-States| <=50K|
+---+-----------------+------+-------------+-------------+-------------------+------------------+---------------+-------------------+-------+------------+------------+--------------+--------------+------+
only showing top 20 rows

Premiers pas avec Spark¶

Deux ou trois petites choses à ne pas oublier¶

Local et cluster¶

Spark et RDD¶

Les partitions¶

Spark et Python¶

Librairies sur Spark¶

Erreur : Cannot run program "python"¶

Erreur : Output directory file:/... already exists¶

Vérifier que Spark en local fonctionne¶

Sortie en plusieurs fichiers¶

Les opérations de base¶

déclaration d'un RDD¶

enregistrement d'un RDD¶

lecture locale d'un RDD avec pandas¶

collect¶

map¶

filter¶

flatMap¶

group / reduce + mapValues¶

sort¶

join¶

le choix existentiel du join : le petit join¶

les trucs qui servent parfois parce que ... à l'usage ça sert¶

le truc à retenir¶

Spark DataFrame¶

Conversion à pandas¶

Retour aux RDD¶

Récupérer le schéma¶

Utiliser pandas pour spécifier le format¶

Enregistrement au format parquet¶

Relecture du format parquet¶

Dataframe Spark VS Dataframe pandas¶

	age	workclass	fnlwgt	education	education_num	marital_status	occupation	relationship	race	sex	capital_gain	hours_per_week	native_country	target
0	39	State-gov	77516	Bachelors	13	Never-married	Adm-clerical	Not-in-family	White	Male	2174	40	United-States	<=50K
1	50	Self-emp-not-inc	83311	Bachelors	13	Married-civ-spouse	Exec-managerial	Husband	White	Male	0	13	United-States	<=50K
2	38	Private	215646	HS-grad	9	Divorced	Handlers-cleaners	Not-in-family	White	Male	0	40	United-States	<=50K
3	53	Private	234721	11th	7	Married-civ-spouse	Handlers-cleaners	Husband	Black	Male	0	40	United-States	<=50K
4	28	Private	338409	Bachelors	13	Married-civ-spouse	Prof-specialty	Wife	Black	Female	0	40	Cuba	<=50K

	_c0	_c1	_c2	_c3	_c4	_c5	_c6	_c7	_c8	_c9	_c10	_c11	_c12	_c13	_c14
0	age	workclass	fnlwgt	education	education_num	marital_status	occupation	relationship	race	sex	capital_gain	capital_loss	hours_per_week	native_country	target
1	39	State-gov	77516	Bachelors	13	Never-married	Adm-clerical	Not-in-family	White	Male	2174	0	40	United-States	<=50K
2	50	Self-emp-not-inc	83311	Bachelors	13	Married-civ-spouse	Exec-managerial	Husband	White	Male	0	0	13	United-States	<=50K
3	38	Private	215646	HS-grad	9	Divorced	Handlers-cleaners	Not-in-family	White	Male	0	0	40	United-States	<=50K
4	53	Private	234721	11th	7	Married-civ-spouse	Handlers-cleaners	Husband	Black	Male	0	0	40	United-States	<=50K