import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import opendatasets as od
import time, datetime
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas/seaborn deprecation warnings, so API
# drift in the calls below will go unnoticed until something breaks.
warnings.filterwarnings("ignore")

# Download the dataset from Kaggle; the Kaggle user name and API key have to
# be provided as input when asked.
od.download("https://www.kaggle.com/datasets/zusmani/uberdrives")

# NOTE(review): absolute path -- presumably the download above lands in
# /work/uber-analysis/uberdrives; confirm the working directory before reuse.
df = pd.read_csv('/work/uber-analysis/uberdrives/My Uber Drives - 2016.csv')
# Bare expressions such as the ones below only render when run as notebook
# cells; in a plain script their value is discarded.
df.head()
df.info()

# Remove every '*' character from the column names. regex=False makes the
# literal match explicit: a lone '*' is not a valid regular expression, and
# the default of the `regex` argument differs between pandas versions.
df.columns = df.columns.str.replace('*', '', regex=False)
df

# Drop the row labelled 1155.
# NOTE(review): presumably a summary/footer row of the CSV -- confirm, the
# label is hard-coded and tied to this exact file.
df.drop(index=1155, inplace=True)
df

df.isnull().sum()
# Forward-fill missing purposes from the previous trip. The result is assigned
# back to the column instead of calling fillna(..., inplace=True) on the
# selected column, which is a chained in-place update that newer pandas
# versions deprecate and that does not modify `df` under copy-on-write.
df['PURPOSE'] = df['PURPOSE'].ffill()
df.isnull().sum()
df.describe()

# Parse both timestamp columns; values that cannot be parsed become NaT.
for date_column in ('START_DATE', 'END_DATE'):
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')
df.info()
# Frequency of all start locations
start = df['START'].value_counts()
start[start > 10]
# Frequency of all end locations
stop = df['STOP'].value_counts()
stop
# Frequency of each distinct trip distance
miles = df['MILES'].value_counts()
frequent_miles = miles[miles > 10]
frequent_miles
plt.figure(figsize=(12, 8))
frequent_miles.plot(kind='bar')
plt.xlabel('miles')
plt.ylabel('frequency')
plt.title("most frequent booked miles")
df['PURPOSE'].value_counts()
df[df['MILES'] > 10]
df.groupby(["PURPOSE"])['MILES'].mean()
# Aggregations are named by string: passing the builtins `max`/`min` is
# deprecated in recent pandas. The resulting column labels are unchanged.
df.groupby(["PURPOSE"]).agg({'MILES': ['mean', 'max', 'min']})
# Open a fresh figure so the boxplot is not drawn on top of the bar chart
# above when this file runs as one script.
plt.figure()
sns.boxplot(data=df, x='PURPOSE', y='MILES')
plt.xticks(rotation=45)
plt.figure(figsize=(15, 6))
# `data` is the whole frame and `x` a column name; a bare Series is not a
# valid `data` source for seaborn.
sns.countplot(data=df, x='PURPOSE', order=df['PURPOSE'].value_counts().index, palette='viridis')
# Trip duration: stored first as a timedelta (end minus start) ...
trip_duration = df['END_DATE'] - df['START_DATE']
df['minutes'] = trip_duration
df
# ... then replaced by the same duration expressed in minutes.
df['minutes'] = trip_duration.dt.total_seconds().div(60)
df.head()
df.info()

# Two figures with the same layout: miles per purpose on the left, minutes per
# purpose on the right. The first keeps seaborn's default outlier markers, the
# second hides them via showfliers=False.
for extra_kwargs in ({}, {'showfliers': False}):
    plt.figure(figsize=(16, 7))
    for position, column in enumerate(('MILES', 'minutes'), start=1):
        plt.subplot(1, 2, position)
        sns.boxplot(data=df, x='PURPOSE', y=column, **extra_kwargs)
        plt.xticks(rotation=45)
### month column
# Month number of each trip's start, taken through the .dt accessor so the
# values stay aligned with df's own index.
df['month'] = df['START_DATE'].dt.month
df
# Month number -> label used in the plots and filters below.
dic = {
    1: 'jan', 2: 'feb', 3: 'mar', 4: 'apr', 5: 'may', 6: 'june',
    7: 'july', 8: 'aug', 9: 'sep', 10: 'oct', 11: 'nov', 12: 'dec',
}
df['month'] = df['month'].map(dic)
df
plt.figure(figsize=(12, 7))
# Bars ordered from the busiest month to the quietest. `data` is the whole
# frame and `x` a column name; a bare Series is not a valid `data` source.
sns.countplot(data=df, x='month', order=df['month'].value_counts().index)
def round(x):
    """Label a trip as a round trip.

    Args:
        x: A row-like object indexable by ``'START'`` and ``'STOP'``
            (this file passes DataFrame rows via ``df.apply(..., axis=1)``).

    Returns:
        ``'yes'`` when the start and stop values compare equal, otherwise
        ``'no'``.

    NOTE(review): the name shadows the builtin ``round`` for the rest of the
    module. It is kept because the file calls it by this name; consider a
    rename together with its call site.
    """
    return 'yes' if x['START'] == x['STOP'] else 'no'
# Flag each trip 'yes'/'no' with the row classifier `round` defined in this
# file (it shadows the builtin of the same name).
df['Round_trip'] = df.apply(round, axis=1)

# Open a fresh figure so this chart is not drawn on top of the previous plot
# when the file runs as one script.
plt.figure()
sns.countplot(data=df, x='Round_trip', order=df['Round_trip'].value_counts().index)
df['Round_trip'].value_counts()

# Round-trip counts split by month.
plt.figure(figsize=(17, 7))
sns.countplot(data=df, x='Round_trip', hue='month', palette='Paired')

# Number of trips per (month, Round_trip) pair as a flat table with the
# columns 'month', 'Round_trip' and 'countdata'.
a = (
    df.groupby(['month', 'Round_trip'])['Round_trip']
    .count()
    .reset_index(name='countdata')
)
a.sort_values(by=['month', 'countdata'], ascending=False)
a[a['Round_trip'] == 'yes'].sort_values(by=['countdata'], ascending=False)

# December trips counted per (purpose, month, round-trip flag). A new figure
# keeps the bars off the countplot drawn above.
plt.figure()
df[df['month'] == 'dec'].groupby(['PURPOSE', 'month', 'Round_trip'])['month'].count().plot(kind='bar')
# Miles against minutes, drawn twice side by side: as a line plot on the left
# and as a scatter plot on the right.
plt.figure(figsize=(12, 7))
for position, draw in enumerate((sns.lineplot, sns.scatterplot), start=1):
    plt.subplot(1, 2, position)
    draw(data=df, x='minutes', y='MILES')