#Set up
import re
from statistics import mean
# Load every file under the data directory as an RDD of (path, content)
# pairs, one pair per file: the sections below read element [0] as the file
# path and element [1] as the whole file text.
# NOTE(review): `sc` is not defined in this file -- presumably the
# SparkContext injected by the notebook runtime; confirm before running
# this as a standalone script.
AllFile = sc.wholeTextFiles("/FileStore/tables/data/")
#Q1
# Mean of the numeric second field over every record of every file.
# Each file's text is split into records (one per line); a record is a
# whitespace-separated line whose second field parses as a float
# (presumably the income for that record -- confirm against the data).
avgMincomeS = (
    # splitlines() copes with "\r\n" and "\n" alike, unlike split("\r\n").
    AllFile.flatMap(lambda path_text: path_text[1].splitlines())
    # A trailing newline yields an empty record; skip it instead of letting
    # the field lookup below fail with IndexError.
    .filter(lambda record: record.strip())
    .map(lambda record: float(record.split()[1]))
    .mean()
)
avgMincomeS
#Q2
# Build (group key, records) per file. The group key is the part of the file
# name before the first "." or "_" (presumably the city -- confirm against
# the file naming scheme). The file name is the last path component, so the
# code no longer depends on the exact "dbfs:/..." prefix of the path.
tmp = AllFile.map(
    lambda path_text: (
        # Raw string: "\." / "\_" in a plain literal are invalid escapes.
        re.split(r"[._]", path_text[0].rsplit("/", 1)[-1])[0],
        # splitlines() handles "\r\n" and "\n"; blank records (e.g. from a
        # trailing newline) are dropped so float() never sees an empty field.
        [record for record in path_text[1].splitlines() if record.strip()],
    )
)
# Per file: mean of the last field of its records. Per group key: mean of
# those per-file means, collected to the driver as (key, mean) pairs.
# NOTE: a mean of per-file means equals the mean over all records of the
# group only when every file in the group has the same number of records.
avgMincomeC = (
    tmp.mapValues(
        lambda records: mean([float(record.split()[-1]) for record in records])
    )
    .groupByKey()
    .mapValues(mean)
    .collect()
)
avgMincomeC
#Q3
# Build (group key, records) per file. The group key is the part of the file
# name before the first "." or "_" (presumably the city -- confirm against
# the file naming scheme). The file name is the last path component, so the
# code no longer depends on the exact "dbfs:/..." prefix of the path.
tmp = AllFile.map(
    lambda path_text: (
        # Raw string: "\." / "\_" in a plain literal are invalid escapes.
        re.split(r"[._]", path_text[0].rsplit("/", 1)[-1])[0],
        # splitlines() handles "\r\n" and "\n"; blank records (e.g. from a
        # trailing newline) are dropped so float() never sees an empty field.
        [record for record in path_text[1].splitlines() if record.strip()],
    )
)
# Per file: mean of the last field of its records. Per group key: the sum of
# those per-file means, collected to the driver as (key, total) pairs.
avgSincomeC = (
    tmp.mapValues(
        lambda records: mean([float(record.split()[-1]) for record in records])
    )
    # reduceByKey combines values before the shuffle, so it avoids
    # materialising each group the way groupByKey().mapValues(sum) does.
    .reduceByKey(lambda left, right: left + right)
    .collect()
)
avgSincomeC
#Q4
# Build (file key, records) per file. The key is the file name up to its
# first "." (presumably identifying one store -- confirm against the file
# naming scheme). The file name is the last path component, so the code no
# longer depends on the exact "dbfs:/..." prefix of the path.
tmp = AllFile.map(
    lambda path_text: (
        path_text[0].rsplit("/", 1)[-1].split(".", 1)[0],
        # splitlines() handles "\r\n" and "\n"; blank records (e.g. from a
        # trailing newline) are dropped so float() never sees an empty field.
        [record for record in path_text[1].splitlines() if record.strip()],
    )
)
# Per file: sum of the last field of its records. Files that share a key
# (same name, different extension) are added together. Collected to the
# driver as (key, total) pairs.
avgSincomeS = (
    tmp.mapValues(
        lambda records: sum(float(record.split()[-1]) for record in records)
    )
    # reduceByKey combines values before the shuffle, so it avoids
    # materialising each group the way groupByKey().mapValues(sum) does.
    .reduceByKey(lambda left, right: left + right)
    .collect()
)
avgSincomeS
#Q5
# Build (file key, records) per file. The key is the file name up to its
# first "." (presumably identifying one store -- confirm against the file
# naming scheme). The file name is the last path component, so the code no
# longer depends on the exact "dbfs:/..." prefix of the path.
tmp = AllFile.map(
    lambda path_text: (
        path_text[0].rsplit("/", 1)[-1].split(".", 1)[0],
        # splitlines() handles "\r\n" and "\n"; blank records (e.g. from a
        # trailing newline) are dropped so the field lookups below are safe.
        [record for record in path_text[1].splitlines() if record.strip()],
    )
)
# Re-key every record by its first field (presumably the month/period --
# confirm) with the value (second field as float, file key). Each record is
# split into fields once instead of once per field.
tmp = tmp.flatMap(
    lambda key_records: [
        (fields[0], (float(fields[1]), key_records[0]))
        for fields in (record.split() for record in key_records[1])
    ]
)
# For each first-field label keep the largest (value, file key) tuple: tuples
# compare by value first, then by file key on ties. reduceByKey(max) yields
# the same maximum as groupByKey().mapValues(max) without building the groups.
bestPerf = tmp.reduceByKey(max).collect()
bestPerf