imdb_analysis

import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import sqlite3

import opendatasets as od od.download("https://www.kaggle.com/datasets/priy998/imdbsqlitedataset")

db = '/work/Imdb_dataset_analysis/imdbsqlitedataset/movie.sqlite' conn = sqlite3.connect(db) cur = conn.cursor()

cur.execute("SELECT * FROM movies") movies = cur.fetchall()

movie_df = pd.DataFrame(movies, columns=['id', 'original_title', 'budget', 'popularity', 'release_date', 'revenue', 'title', 'vote_average', 'vote_count', 'overview', 'tagline', 'uid', 'director_id'])

movie_df

movie_df.info()

cur.execute("SELECT * FROM directors") directors = cur.fetchall()

directors_df = pd.DataFrame(directors, columns=['name', 'id', 'gender', 'uid', 'department'])

directors_df.head()

cur.execute('SELECT COUNT(TITLE) FROM movies') count = cur.fetchall()

print(f"Number of the movies present in IMDB database is {count[0]}")

cur.execute('select * from directors where name in ("James Cameron", "Luc Besson", "John Woo")') three_directors = cur.fetchall()

print(f'these three directors data are : {three_directors}')

cur.execute('select * from directors where name like "Steven%"') name_like = cur.fetchall()

print(f"The directors whose names are starting with the word 'Steven' are : {name_like}")

cur.execute('select count(id) from directors where gender=="1"') females = cur.fetchall()

print(f"The number of female directors is {females[0][0]}")

cur.execute('select name from directors where gender==1') tenth = cur.fetchall()

print(f"The tenth first women is {tenth[9][0]}")

movie_df

cur.execute('select original_title from movies order by popularity desc limit 3') most_popular = cur.fetchall()

print(f"The 3 most popular movies are : {most_popular[0][0]}, {most_popular[1][0]} and {most_popular[2][0]}")

movie_df.head()

cur.execute('select original_title from movies order by budget desc limit 3') most_bankable = cur.fetchall()

print(f"the 3 most bankable movies are {most_bankable[0][0]}, {most_bankable[1][0]} and {most_bankable[2][0]}")

movie_df

cur.execute('select original_title from movies where release_date > "2000-01-01" order by vote_average desc limit 1') most_awarded_avg = cur.fetchall()

print(f"the most awarded average vote movie is {most_awarded_avg[0][0]}")

cur.execute( 'select original_title from movies join directors on directors.id= movies.director_id where directors.name = "Brenda Chapman"') directed_by = cur.fetchall()

print(f"The movie(e) directed by Brendan Chapman is {directed_by[0]}")

cur.execute( 'select name from directors join movies on directors.id= movies.director_id group by director_id order by count(name) desc limit 1') director_movie = cur.fetchall()

print(f"The director who has made the most movies is {director_movie[0][0]}")

cur.execute( 'select name from directors join movies on directors.id= movies.director_id group by director_id order by sum(budget) desc limit 1') most_bankable = cur.fetchall()

print(f"The director who is most bankable is {most_bankable[0][0]}")

top_10 = pd.DataFrame(cur.execute('select * from movies order by budget desc limit 10').fetchall(), columns=['id', 'original_title', 'budget', 'popularity', 'release_date', 'revenue', 'title', 'vote_average', 'vote_count', 'overview', 'tagline', 'uid', 'director_id'])

top_10

cur.execute('select original_title, revenue from movies order by revenue desc limit 10') top10_movies = cur.fetchall() most_revenue = pd.DataFrame(top10_movies, columns=['original_title', 'revenue'])

most_revenue

cur.execute('select original_title,vote_average from movies order by vote_average desc limit 10') data = cur.fetchall() highest_vote_average = pd.DataFrame(data, columns=['original_title', 'vote_average'])

highest_vote_average

cur.execute( 'SELECT name, COUNT(original_title), SUM(revenue)FROM directors JOIN movies ON directors.id = movies.director_id GROUP BY name ORDER BY SUM(revenue) DESC') mostProfitableDirector = cur.fetchall() mostProfitableDirector = pd.DataFrame(mostProfitableDirector, columns=['director_name', 'movies', 'revenue']) mostProfitableDirector = mostProfitableDirector.head(10)

mostProfitableDirector

cur.execute( 'SELECT original_title, release_date, budget, revenue, popularity, vote_average FROM directors JOIN movies ON directors.id = movies.director_id WHERE name = "Steven Spielberg" ORDER BY release_date DESC') moviesByStevenSpielberg = cur.fetchall() moviesByStevenSpielberg = pd.DataFrame(moviesByStevenSpielberg, columns=['original_title', 'release_date', 'budget', 'revenue', 'popularity', 'vote_average']) moviesByStevenSpielberg