import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import io
import requests
from PIL import Image
# Default size for every figure created from here on. A bare
# plt.figure(figsize=...) call would only create a single empty figure and
# leave figures created later at matplotlib's built-in default size.
plt.rcParams['figure.figsize'] = (15, 8)

# Load the dataset from a CSV file expected in the working directory.
df = pd.read_csv('AB_NYC_2019.csv')

# Quick structural overview. The bare expressions only display a value when
# evaluated interactively (e.g. as the last line of a notebook cell).
df.columns  # column labels
df.shape  # (number of rows, number of columns)
df.dtypes  # dtype of each column
df.info()  # prints index range, non-null counts, dtypes and memory usage
# Output of df.info(), pasted from an interactive run:
#
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 48895 entries, 0 to 48894
# Data columns (total 16 columns):
#  #   Column                          Non-Null Count  Dtype
# ---  ------                          --------------  -----
#  0   id                              48895 non-null  int64
#  1   name                            48879 non-null  object
#  2   host_id                         48895 non-null  int64
#  3   host_name                       48874 non-null  object
#  4   neighbourhood_group             48895 non-null  object
#  5   neighbourhood                   48895 non-null  object
#  6   latitude                        48895 non-null  float64
#  7   longitude                       48895 non-null  float64
#  8   room_type                       48895 non-null  object
#  9   price                           48895 non-null  int64
#  10  minimum_nights                  48895 non-null  int64
#  11  number_of_reviews               48895 non-null  int64
#  12  last_review                     38843 non-null  object
#  13  reviews_per_month               38843 non-null  float64
#  14  calculated_host_listings_count  48895 non-null  int64
#  15  availability_365                48895 non-null  int64
# dtypes: float64(3), int64(7), object(6)
# memory usage: 6.0+ MB
# Peek at the data and its missing values. These bare expressions only display
# a result when evaluated interactively.
df.head()  # first rows
df.tail()  # last rows
df.describe()  # summary statistics of the numeric columns
df.isnull().sum()  # number of missing values per column

# Columns removed from the analysis.
columnsToDrop = ['id', 'host_name', 'last_review']

# inplace=True mutates df itself and returns None; no new DataFrame is created.
df.drop(columns=columnsToDrop, inplace=True)

# Replace missing reviews_per_month values with 0.
# The dict maps column name -> fill value.
df.fillna({'reviews_per_month': 0}, inplace=True)
# --- Column selection ------------------------------------------------------
# A single label yields a Series; a list of labels yields a DataFrame.
df.loc[:, 'name']
df.loc[:, ['name', 'price']]

# --- Row selection by position ---------------------------------------------
# First 111 rows of the full frame, then of the two-column subset.
df.iloc[:111]
df.loc[:, ['name', 'price']].iloc[:111]

# --- Boolean indexing ------------------------------------------------------
# An element-wise comparison produces a True/False Series that is then used
# to keep only the matching rows.
df['price'].lt(100)
booleanMask = df['price'].lt(100)
df.loc[booleanMask]

# --- Summaries -------------------------------------------------------------
# The ten rows with the highest price.
df.nlargest(n=10, columns='price')

# Distinct neighbourhood groups and how often each one occurs.
df['neighbourhood_group'].unique()
df['neighbourhood_group'].value_counts()

# The ten most frequent neighbourhoods.
df['neighbourhood'].value_counts().iloc[:10]
# Bar chart of the ten most frequent neighbourhoods.
df['neighbourhood'].value_counts().iloc[:10].plot.bar()

# Row count per neighbourhood group, in seaborn's default category order.
sns.countplot(x="neighbourhood_group", data=df)

# Same chart with the groups ordered from most to least frequent
# (value_counts() sorts by count, descending).
order = df['neighbourhood_group'].value_counts().index
sns.countplot(x='neighbourhood_group', order=order, data=df)

# Ordered chart again, with each group split by room type.
sns.countplot(x='neighbourhood_group', hue="room_type", order=order, data=df)

# Histogram of price over all rows.
sns.histplot(data=df['price'])
# Restrict the analysis to rows priced at 500 or less.
mask = df['price'].le(500)
df.loc[mask]

# Price histogram of the filtered rows, first plain, then with a KDE curve.
sns.histplot(data=df.loc[mask, 'price'])
sns.histplot(data=df.loc[mask, 'price'], kde=True)

# Mean price of the filtered rows.
df.loc[mask, 'price'].mean()

# Keep the filtered rows under their own name for the plots that follow.
affordableDf = df[mask]

# Price distribution per neighbourhood group on a fresh 15x8 figure.
plt.figure(figsize=(15, 8))
sns.violinplot(x="neighbourhood_group", y="price", data=affordableDf)
# Scatter plot of the filtered rows by coordinates: longitude on x, latitude
# on y, each point colored by its price using the 'inferno' colormap.
affordableDf.plot.scatter(
    x='longitude',
    y='latitude',
    c='price',
    cmap='inferno',
    colorbar=True,
    alpha=0.8,  # slight transparency so overlapping points stay visible
    figsize=(12, 8),
)
# Background map, read from a local file. Image source:
# https://upload.wikimedia.org/wikipedia/commons/e/ec/Neighbourhoods_New_York_City_Map.PNG
im = Image.open('Neighbourhoods_New_York_City_Map.png')

# Create a dedicated figure/axes for the overlay instead of relying on the
# implicit "current axes": when this runs after another plot in the same
# process, plt.imshow()/plt.gca() would draw onto that earlier plot.
fig, ax = plt.subplots(figsize=(12, 8))

# extent = [left, right, bottom, top] in data coordinates, i.e. the
# longitude/latitude box the image is stretched over so that it lines up with
# the scatter points.
ax.imshow(im, zorder=0, extent=[-74.258, -73.7, 40.49, 40.92])

# Draw the filtered rows on top of the map (higher zorder), colored by price.
affordableDf.plot(
    ax=ax,
    zorder=1,
    kind='scatter',
    x='longitude',
    y='latitude',
    c='price',
    cmap='inferno',
    colorbar=True,
    alpha=0.8,
)