# Provision a Dataproc cluster (1 master n1-standard-4, 2 workers) with the
# Component Gateway and the optional Jupyter component enabled.
# NOTE: the leading '!' is Jupyter-notebook magic (run in a shell), not shell syntax.
!gcloud dataproc clusters create ex-dataproc --enable-component-gateway --region us-central1 --zone us-central1-c --master-machine-type n1-standard-4 --master-boot-disk-size 500 --num-workers 2 --worker-machine-type n1-standard-4 --worker-boot-disk-size 500 --image-version 2.0-debian10 --optional-components JUPYTER --project dataproc-334718
# Open an SSH session to the master node that acts as a SOCKS proxy on local
# port 1080; -N keeps the tunnel open without running a remote command.
# NOTE(review): this blocks the notebook cell until interrupted — run it in a
# terminal or background it if the next cells must execute.
!gcloud compute ssh ex-dataproc-m --project=dataproc-334718 --zone=us-central1-c -- -D 1080 -N
# Launch Chrome (macOS path) through the SOCKS5 tunnel with a throwaway
# profile so it can reach the cluster web UI on master port 8088.
# This is run on the local workstation, not inside the notebook.
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" --proxy-server="socks5://localhost:1080" --user-data-dir="/tmp/ex-dataproc-m" http://ex-dataproc-m:8088
# Create a GCS bucket in the project with uniform bucket-level access (-b on).
!gsutil mb -p dataproc-334718 -b on gs://ex-dataproc-bucket
# Upload the tweets dataset from the local working directory into the bucket.
!gsutil cp nlpDisasterTweets.csv gs://ex-dataproc-bucket
# Load the disaster-tweets CSV from GCS into a Spark DataFrame.
# header/inferSchema pick up column names and types from the file itself;
# multiLine is needed because tweet text can contain embedded newlines
# inside quoted fields.
data = (
    spark.read
    .options(header='true', inferSchema='true', multiLine=True)
    .csv("gs://ex-dataproc-bucket/nlpDisasterTweets.csv")
)
# Trigger the read and report the total record count.
print('Number of row in Data:', data.count())
[Stage 2:> (0 + 1) / 1]
Number of row in Data: 7613
# Preview the first five rows; long cell values (the 'text' column) are
# truncated by show()'s default column-width behavior.
data.show(5)
+---+-------+--------+--------------------+------+
| id|keyword|location| text|target|
+---+-------+--------+--------------------+------+
| 1| null| null|Our Deeds are the...| 1|
| 4| null| null|Forest fire near ...| 1|
| 5| null| null|All residents ask...| 1|
| 6| null| null|13,000 people rec...| 1|
| 7| null| null|Just got sent thi...| 1|
+---+-------+--------+--------------------+------+
only showing top 5 rows