# uber-analysis

import datetime
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import opendatasets as od
import pandas as pd
import seaborn as sns

# Silence all library warnings for the rest of the script.
# NOTE(review): this also hides deprecation warnings from pandas/seaborn.
warnings.filterwarnings("ignore")

# For downloading you need to provide your Kaggle user and key as input.
# NOTE(review): the CSV is read from an absolute path below; presumably the
# script is run from /work/uber-analysis so the download lands there — confirm.
od.download("https://www.kaggle.com/datasets/zusmani/uberdrives")

# Load the raw trip log into the module-level DataFrame used by every later step.
df = pd.read_csv('/work/uber-analysis/uberdrives/My Uber Drives - 2016.csv')

# First rows of the raw data (displayed when run as a notebook cell).
df.head()

# Column dtypes and non-null counts of the raw data.
df.info()

# Strip the literal '*' characters from the column names. regex=False forces
# a plain-string match: the default of `regex` differs between pandas
# versions, and a lone '*' is not a valid regular expression.
df.columns = df.columns.str.replace('*', '', regex=False)

# Drop the row labelled 1155.
# NOTE(review): presumably the trailing summary row of the CSV — confirm.
df.drop(index=1155, inplace=True)

# Missing values per column before filling.
df.isnull().sum()

# Forward-fill missing PURPOSE values from the previous row. The result is
# assigned back to the column: fillna(method=..., inplace=True) on a column
# selection is deprecated and does not update df under copy-on-write.
df['PURPOSE'] = df['PURPOSE'].ffill()

# Missing values per column after filling.
df.isnull().sum()

# Summary statistics of the numeric columns.
df.describe()

# Convert both trip timestamp columns to datetimes; values that cannot be
# parsed are coerced to NaT instead of raising.
for date_col in ('START_DATE', 'END_DATE'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

# Re-check dtypes after the conversion.
df.info()

# Frequency of all start locations
start = df['START'].value_counts()

# Start locations that occur more than 10 times.
start[start > 10]

# Frequency of all end locations
stop = df['STOP'].value_counts()

# Full frequency table of end locations.
stop

# miles, column, purpose, minute
# Frequency of each distinct MILES value.
miles = df.MILES.value_counts()

# Distances that were booked more than 10 times.
miles[miles > 10]

# Bar chart of those most frequently booked distances.
plt.figure(figsize=(12, 8))
miles[miles > 10].plot(kind='bar')
plt.xlabel('miles')
plt.ylabel('frequency')
plt.title("most frequent booked miles")

# Number of trips per purpose.
df['PURPOSE'].value_counts()

# Trips longer than 10 miles.
df[df['MILES'] > 10]

# Mean trip distance per purpose.
df.groupby(["PURPOSE"])['MILES'].mean()

# Mean, maximum and minimum trip distance per purpose. The aggregations are
# named by string: passing the builtin max/min callables is deprecated in
# recent pandas and the strings produce the same 'max'/'min' columns.
df.groupby(["PURPOSE"]).agg({'MILES': ['mean', 'max', 'min']})

# Distribution of trip distance per purpose. A fresh figure is opened so the
# boxplot does not draw on top of whatever figure is currently active.
plt.figure()
sns.boxplot(data=df, x='PURPOSE', y='MILES')
plt.xticks(rotation=45)

# Number of trips per purpose, bars ordered from most to least frequent.
plt.figure(figsize=(15, 6))
sns.countplot(
    data=df,
    x='PURPOSE',
    order=df['PURPOSE'].value_counts().index,
    palette='viridis',
)

# Trip duration in minutes: end minus start timestamp, converted from
# seconds to minutes.
trip_duration = df['END_DATE'] - df['START_DATE']
df['minutes'] = trip_duration.dt.total_seconds() / 60

# Inspect the new column.
df.head()

df.info()

# Miles (left) and minutes (right) per purpose, outliers included.
plt.figure(figsize=(16, 7))
plt.subplot(1, 2, 1)
sns.boxplot(data=df, x='PURPOSE', y='MILES')
plt.xticks(rotation=45)
plt.subplot(1, 2, 2)
sns.boxplot(data=df, x='PURPOSE', y='minutes')
plt.xticks(rotation=45)

# Same two panels with outlier points hidden (showfliers=False).
plt.figure(figsize=(16, 7))
plt.subplot(1, 2, 1)
sns.boxplot(data=df, x='PURPOSE', y='MILES', showfliers=False)
plt.xticks(rotation=45)
plt.subplot(1, 2, 2)
sns.boxplot(data=df, x='PURPOSE', y='minutes', showfliers=False)
plt.xticks(rotation=45)

### month column
# Month number (1-12) of each trip's start timestamp.
df['month'] = df['START_DATE'].dt.month

# Month number -> short lowercase month label.
dic = {1: 'jan', 2: 'feb', 3: 'mar', 4: 'apr', 5: 'may', 6: 'june', 7: 'july', 8: 'aug', 9: 'sep', 10: 'oct', 11: 'nov', 12: 'dec'}

# Replace the month numbers with their labels.
df['month'] = df['month'].map(dic)

# Number of trips per month, bars ordered from busiest to quietest month.
plt.figure(figsize=(12, 7))
sns.countplot(data=df, x='month', order=df['month'].value_counts().index)

def round(x):
    """Return 'yes' if a trip starts and stops at the same location, else 'no'.

    Args:
        x: A mapping-like row (e.g. a DataFrame row) with 'START' and
            'STOP' entries.

    Returns:
        The string 'yes' when x['START'] equals x['STOP'], otherwise 'no'.
    """
    # NOTE(review): this name shadows the builtin round(); it is kept because
    # the row-wise apply later in this file refers to it by this name.
    return 'yes' if x['START'] == x['STOP'] else 'no'

# Flag each trip 'yes'/'no' by applying the file's row-wise round() helper.
df['Round_trip'] = df.apply(round, axis=1)

# Round trips vs. one-way trips. A fresh figure is opened so the bars do not
# land on whatever figure is currently active.
plt.figure()
sns.countplot(data=df, x='Round_trip', order=df['Round_trip'].value_counts().index)

# The same counts as a table.
df['Round_trip'].value_counts()

# Round-trip counts split by month.
plt.figure(figsize=(17, 7))
sns.countplot(data=df, x='Round_trip', hue='month', palette='Paired')

# Number of trips for every (month, Round_trip) combination.
a = df.groupby(['month', 'Round_trip']).agg({'Round_trip': 'count'})
# Rename the count column so reset_index() does not collide with the
# 'Round_trip' index level.
a.columns = ['countdata']
a = a.reset_index()
a.sort_values(by=['month', 'countdata'], ascending=False)

# Months ranked by their number of round trips.
a[a['Round_trip'] == 'yes'].sort_values(by=['countdata'], ascending=False)

# December only: trip counts per purpose and round-trip flag. A fresh figure
# is opened so the bars do not land on whatever figure is currently active.
plt.figure()
df[df['month'] == 'dec'].groupby(['PURPOSE', 'month', 'Round_trip'])['month'].count().plot(kind='bar')

# line and scatter plt
# Trip distance against trip duration: line plot (left) and scatter (right).
plt.figure(figsize=(12, 7))
plt.subplot(1, 2, 1)
sns.lineplot(data=df, x='minutes', y='MILES')
plt.subplot(1, 2, 2)
sns.scatterplot(data=df, x='minutes', y='MILES')