# Load the text file as an RDD of lines.
fileRDD = sc.textFile('/FileStore/tables/long_text/What_is_Lorem_Ipsum.txt')
fileRDD.collect()
# Count the space-separated tokens whose first character is "a", "A", "b"
# or "B"; the result is keyed by that first character.
(
    fileRDD
    .flatMap(lambda line: line.split(" "))
    # token[:1] is the first character, or "" when the token is empty.
    .map(lambda token: (token[:1], 1))
    .filter(lambda pair: pair[0] in ("a", "A", "b", "B"))
    .reduceByKey(lambda left, right: left + right)
    .collect()
)
import re
# Matches every character that is neither an ASCII letter nor an apostrophe.
regex = re.compile("[^a-zA-Z']")
# Word frequencies of the text, most frequent first.
# Lines are lower-cased and split on any whitespace, then every character
# matched by `regex` is stripped from each token.
(
    fileRDD
    .flatMap(lambda line: line.lower().split())
    .map(lambda token: regex.sub('', token))
    # Tokens made only of digits/punctuation become "" after stripping;
    # drop them so the empty string is not counted as a word.
    .filter(lambda word: word)
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda pair: pair[1], ascending=False)
    .collect()
)
# Read every file of the TPShop folder as (file name, list of lines).
# The file name is whatever follows the folder prefix in the path reported
# by wholeTextFiles.
# splitlines() handles "\r\n" as well as "\n" line endings, and blank lines
# (e.g. after a trailing newline) are dropped so that later per-line parsing
# never receives an empty record.
folderRDD = sc.wholeTextFiles("/FileStore/tables/TPShop").map(
    lambda entry: (
        entry[0].split("dbfs:/FileStore/tables/TPShop/")[1],
        [line for line in entry[1].splitlines() if line.strip()],
    )
)
folderRDD.collect()
## Average of shops in France
# Mean of the second space-separated field, taken over every line of every
# file in folderRDD.
# NOTE(review): presumably each line is "<label> <amount>" - confirm against
# the data files.
meanFrance = (
    folderRDD
    .flatMap(lambda entry: entry[1])
    .map(lambda line: float(line.split(" ")[1]))
    .mean()
)
meanFrance
## Average of each city
from statistics import mean
# The grouping key is the part of the file name before the first "_" or "."
# (presumably the city; file names look like "<city>_<shop>.<ext>" - confirm).
# Each file is first reduced to the mean of the last field of its lines, then
# those per-file means are averaged per key, i.e. every file weighs the same
# regardless of how many lines it holds.
# The pattern is a raw string: the previous "\_|\." relied on invalid string
# escape sequences, which newer Python versions warn about.
meanShop = (
    folderRDD
    .map(lambda entry: (
        re.split(r"[_.]", entry[0])[0],
        mean([float(line.split(" ")[-1]) for line in entry[1]]),
    ))
    .groupByKey()
    .mapValues(mean)
)
meanShop.collect()
## Total of each city
from statistics import mean  # kept: later cells may still rely on this name
# The grouping key is the part of the file name before the first "_" or "."
# (presumably the city; file names look like "<city>_<shop>.<ext>" - confirm).
# Each file is reduced to the SUM of the last field of its lines, and the
# per-file sums are then added up per key.  The previous version summed the
# per-file means, which yields a sum of averages rather than a total.
totalCity = (
    folderRDD
    .map(lambda entry: (
        re.split(r"[_.]", entry[0])[0],
        sum(float(line.split(" ")[-1]) for line in entry[1]),
    ))
    .reduceByKey(lambda left, right: left + right)
)
totalCity.collect()
## Total of each shop
from statistics import mean  # kept: later cells may still rely on this name
# The key is the file name up to its first "." (one key per shop file).
# Each file is reduced to the SUM of the last field of its lines.  The
# previous version took the mean of each file and then summed that single
# value, so it returned the shop's average instead of its total.
# reduceByKey still merges files whose names share the same stem.
# NOTE(review): this rebinds totalCity (previously the per-city totals);
# the name is kept so existing references keep working - consider renaming.
totalCity = (
    folderRDD
    .map(lambda entry: (
        entry[0].split(".")[0],
        sum(float(line.split(" ")[-1]) for line in entry[1]),
    ))
    .reduceByKey(lambda left, right: left + right)
)
totalCity.collect()
##Best Performance
def _labelled_amounts(entry):
    """Turn one (file name, lines) pair into (label, (amount, shop)) records.

    The label is the first space-separated field of a line, the amount is the
    second field as a float, and the shop is the file name up to its first ".".
    """
    file_name, lines = entry
    shop = file_name.split(".")[0]
    return [
        (fields[0], (float(fields[1]), shop))
        for fields in (line.split(" ") for line in lines)
    ]


Best = folderRDD.flatMap(_labelled_amounts)
# For each label keep the largest (amount, shop) tuple; tuples compare by
# amount first, so equal amounts are ordered by shop name.
Best.groupByKey().mapValues(max).collect()
Best.collect()