pip install xlrd
pip install openpyxl
pip install --upgrade pip
import pandas as pd
import xlrd as xlrd
import matplotlib.pyplot as plt
# Load the movie dataset; the relative path is resolved against the current
# working directory.
movies = pd.read_csv("Movies_dataset.csv", sep=",")
# DataFrame.info is a method: without the call parentheses the column/dtype
# summary is never produced.
movies.info()
# Preview of the first five rows (only rendered when this is the last
# expression of a notebook cell).
movies.head(5)
1.
# Write the movies DataFrame to standard output.
print(movies)
2.
2.1 Filter "titleType" for "movie" and "tvMovie"
# Number of rows per titleType, to see which categories the data contains.
movies.value_counts("titleType")

# movies_restricted_1: only cinema movies and TV movies.
# isin() compares against the two category names exactly. Unlike
# str.contains() it does no regex/substring matching and it yields False
# (instead of NaN) for missing titleType values, so the mask is always a
# valid boolean index.
is_movie_type = movies["titleType"].isin(["movie", "tvMovie"])
movies_restricted_1 = movies.loc[is_movie_type]
print(movies_restricted_1)
movies_restricted_1.head(10)

# Preview of the year restriction that is applied in the next step.
movies_restricted_1.loc[movies_restricted_1["startYear"] >= 1980]
movies_restricted_2: apply the filter startYear >= 1980
# movies_restricted_2: rows of movies_restricted_1 whose startYear is 1980
# or later.
movies_restricted_2 = movies_restricted_1[
    movies_restricted_1["startYear"].ge(1980)
]
# Sorted view by release year. The result is not stored, so
# movies_restricted_2 itself keeps its original row order.
movies_restricted_2.sort_values(by="startYear")
2.3 Exclude rows with missing ratings or genres
# Boolean masks that are True where a value is present.
# NOTE: despite the names ("leere Werte" = "empty values"), True marks rows
# that are NOT missing. The names are kept unchanged in case later code
# refers to them.
leere_Werte_genres = movies_restricted_2["genres"].notna()
leere_Werte_rating = movies_restricted_2["averageRating"].notna()

# movies_restricted_3: rows that have both a genre and an average rating.
movies_restricted_3 = movies_restricted_2[leere_Werte_genres & leere_Werte_rating]
movies_restricted_3.head(8)
3.
# movies_restricted_4: mean averageRating per startYear, as a two-column
# frame (startYear, averageRating) rounded to two decimals.
movies_restricted_4 = (
    movies_restricted_3
    .groupby("startYear")["averageRating"]
    .mean()
    .reset_index()
    .round(decimals=2)
)
# Scatter plot of the yearly mean rating against the release year.
# NOTE(review): no plt.show() is issued here; outside a notebook the figure
# presumably needs an explicit plt.show() to appear - confirm.
movies_restricted_4.plot(kind="scatter", x="startYear", y="averageRating")
4.
4.1 Ratings
# ratings: title identifier together with its average rating.
ratings = movies_restricted_3.loc[:, ["tconst", "averageRating"]]
# Bare expression: only rendered as a preview when run as a notebook cell.
ratings
# Write the table to ratings.csv. The DataFrame index is written as the
# first, unnamed column because index=False is not passed.
ratings.to_csv(path_or_buf="ratings.csv")
4.2 Titles
# title: descriptive columns for each title (identifier, name, release
# year, runtime and genres).
title = movies_restricted_3.loc[
    :, ["tconst", "primaryTitle", "startYear", "runtimeMinutes", "genres"]
]
# Bare expression: only rendered as a preview when run as a notebook cell.
title
# Write the table to title.csv. The DataFrame index is written as the
# first, unnamed column because index=False is not passed.
title.to_csv(path_or_buf="title.csv")