import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
import opendatasets as od
od.download("https://www.kaggle.com/datasets/priy998/imdbsqlitedataset")
db = '/work/Imdb_dataset_analysis/imdbsqlitedataset/movie.sqlite'
conn = sqlite3.connect(db)
cur = conn.cursor()
cur.execute("SELECT * FROM movies")
movies = cur.fetchall()
movie_df = pd.DataFrame(movies,
columns=['id', 'original_title', 'budget', 'popularity', 'release_date', 'revenue', 'title',
'vote_average', 'vote_count', 'overview', 'tagline', 'uid', 'director_id'])
movie_df
movie_df.info()
cur.execute("SELECT * FROM directors")
directors = cur.fetchall()
directors_df = pd.DataFrame(directors, columns=['name', 'id', 'gender', 'uid', 'department'])
directors_df.head()
cur.execute('SELECT COUNT(TITLE) FROM movies')
count = cur.fetchall()
print(f"Number of the movies present in IMDB database is {count[0]}")
cur.execute('select * from directors where name in ("James Cameron", "Luc Besson", "John Woo")')
three_directors = cur.fetchall()
print(f'these three directors data are : {three_directors}')
cur.execute('select * from directors where name like "Steven%"')
name_like = cur.fetchall()
print(f"The directors whose names are starting with the word 'Steven' are : {name_like}")
cur.execute('select count(id) from directors where gender=="1"')
females = cur.fetchall()
print(f"The number of female directors is {females[0][0]}")
cur.execute('select name from directors where gender==1')
tenth = cur.fetchall()
print(f"The tenth first women is {tenth[9][0]}")
movie_df
cur.execute('select original_title from movies order by popularity desc limit 3')
most_popular = cur.fetchall()
print(f"The 3 most popular movies are : {most_popular[0][0]}, {most_popular[1][0]} and {most_popular[2][0]}")
movie_df.head()
cur.execute('select original_title from movies order by budget desc limit 3')
most_bankable = cur.fetchall()
print(f"the 3 most bankable movies are {most_bankable[0][0]}, {most_bankable[1][0]} and {most_bankable[2][0]}")
movie_df
cur.execute('select original_title from movies where release_date > "2000-01-01" order by vote_average desc limit 1')
most_awarded_avg = cur.fetchall()
print(f"the most awarded average vote movie is {most_awarded_avg[0][0]}")
cur.execute(
'select original_title from movies join directors on directors.id= movies.director_id where directors.name = "Brenda Chapman"')
directed_by = cur.fetchall()
print(f"The movie(e) directed by Brendan Chapman is {directed_by[0]}")
cur.execute(
'select name from directors join movies on directors.id= movies.director_id group by director_id order by count(name) desc limit 1')
director_movie = cur.fetchall()
print(f"The director who has made the most movies is {director_movie[0][0]}")
cur.execute(
'select name from directors join movies on directors.id= movies.director_id group by director_id order by sum(budget) desc limit 1')
most_bankable = cur.fetchall()
print(f"The director who is most bankable is {most_bankable[0][0]}")
top_10 = pd.DataFrame(cur.execute('select * from movies order by budget desc limit 10').fetchall(),
columns=['id', 'original_title', 'budget', 'popularity', 'release_date', 'revenue', 'title',
'vote_average', 'vote_count', 'overview', 'tagline', 'uid', 'director_id'])
top_10
cur.execute('select original_title, revenue from movies order by revenue desc limit 10')
top10_movies = cur.fetchall()
most_revenue = pd.DataFrame(top10_movies, columns=['original_title', 'revenue'])
most_revenue
cur.execute('select original_title,vote_average from movies order by vote_average desc limit 10')
data = cur.fetchall()
highest_vote_average = pd.DataFrame(data, columns=['original_title', 'vote_average'])
highest_vote_average
cur.execute(
'SELECT name, COUNT(original_title), SUM(revenue)FROM directors JOIN movies ON directors.id = movies.director_id GROUP BY name ORDER BY SUM(revenue) DESC')
mostProfitableDirector = cur.fetchall()
mostProfitableDirector = pd.DataFrame(mostProfitableDirector, columns=['director_name', 'movies', 'revenue'])
mostProfitableDirector = mostProfitableDirector.head(10)
mostProfitableDirector
cur.execute(
'SELECT original_title, release_date, budget, revenue, popularity, vote_average FROM directors JOIN movies ON directors.id = movies.director_id WHERE name = "Steven Spielberg" ORDER BY release_date DESC')
moviesByStevenSpielberg = cur.fetchall()
moviesByStevenSpielberg = pd.DataFrame(moviesByStevenSpielberg,
columns=['original_title', 'release_date', 'budget', 'revenue', 'popularity',
'vote_average'])
moviesByStevenSpielberg