%%bash
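# Bring up HDFS, YARN, and the MapReduce job history server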
$HADOOP_HOME/sbin/start-dfs.sh
$HADOOP_HOME/sbin/start-yarn.sh
$HADOOP_HOME/bin/mapred --daemon start historyserver
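A quick sanity check (optional; the daemon names below assume a single-node setup): `jps` should list NameNode, DataNode, ResourceManager, NodeManager, and JobHistoryServer.

! jps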
# Upload the input file to the HDFS home directory (-f overwrites on re-runs)
! $HADOOP_HOME/bin/hdfs dfs -put -f salaries.csv
%%file salaries_spark.py
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import col, length, lit, desc

spark = SparkSession.builder.appName("Top 10 salaries").getOrCreate()

# Read every column as a string; the salary column is cleaned up below.
mySchema = StructType([
    StructField("name", StringType(), False),
    StructField("position", StringType(), False),
    StructField("id", StringType(), False),
    StructField("department", StringType(), False),
    StructField("date", StringType(), False),
    StructField("salary", StringType(), False),
    StructField("value", StringType(), False),
])

df = spark.read.format("csv").schema(mySchema).load("salaries.csv")

# Drop the leading character of each salary (e.g. a currency symbol),
# cast the remainder to float, and show the ten highest values.
df.select(col("salary"), (length("salary") - 1).alias("strlen")) \
    .select(col("salary").substr(lit(2), col("strlen")).cast("float").alias("salary")) \
    .orderBy(desc("salary")) \
    .select(col("salary").alias("Top_10_salaries")) \
    .limit(10).show()

spark.stop()
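Before submitting to the cluster, the script can be smoke-tested in local mode (a minimal sketch: if the Hadoop configuration is visible to Spark, the relative path still resolves to the HDFS home directory; otherwise salaries.csv must sit in the current local directory).

! $SPARK_HOME/bin/spark-submit --master "local[*]" salaries_spark.py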
# Submit to YARN (client deploy mode by default, so show() prints here)
! $SPARK_HOME/bin/spark-submit --master yarn salaries_spark.py
%%bash
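# Stop the daemons in the reverse order of startup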
$HADOOP_HOME/bin/mapred --daemon stop historyserver
$HADOOP_HOME/sbin/stop-yarn.sh
$HADOOP_HOME/sbin/stop-dfs.sh