#importing modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
data = pd.read_csv('desktop/AB_NYC_2019.csv')
data.head()
#viewing the number of rows and columns
data.shape
#Descriptive statistics for numerical values
data.describe()
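# Optional check (an addition, not part of the original walkthrough): data.info()
# summarizes dtypes and non-null counts per column, which helps confirm which
# columns will need imputation or dropping in the cleaning steps below.
data.info()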
#Dropping columns that are unnecessary for our analysis
data.drop(columns=['id', 'host_name', 'calculated_host_listings_count', 'last_review'], inplace=True)
#Renaming columns
data = data.rename(columns={'neighbourhood_group': 'borough',
                            'neighbourhood': 'neighborhood'})
data.columns
#checking for duplicated values
print(data.duplicated().sum(), 'rows are duplicated.')
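# A minimal follow-up sketch (my addition): if any duplicated rows had been found,
# they could be dropped before continuing.
if data.duplicated().any():
    data = data.drop_duplicates().reset_index(drop=True)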
#counting null values for each column
missing_values_count = np.sum(data.isnull())
missing_values_count
total_cells = np.prod(data.shape)
total_missing = missing_values_count.sum()
#calculating the percentage of missing values
percentage_missing = np.round((total_missing / total_cells) * 100, 2)
print("Percentage of missing values in the dataset:", percentage_missing)
#Imputation of missing values with median values
data['reviews_per_month'] = data.reviews_per_month.fillna(data.reviews_per_month.median())
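# Quick sanity check (my addition): confirm the imputation left no nulls in
# reviews_per_month before dropping the remaining incomplete rows.
assert data['reviews_per_month'].isnull().sum() == 0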
#Dropping null values and counting the dropped rows
cleaned_df = data.dropna(axis=0, how='any') #Dropping rows with at least 1 null value
print(len(data) - len(cleaned_df), 'rows with missing values were dropped.')
np.sum(cleaned_df.isnull())
#Excluding prices that are equal to zero
df = cleaned_df[cleaned_df['price'] != 0]
df.head()
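# Optional sanity check (an addition): report how many zero-price listings were excluded.
print(len(cleaned_df) - len(df), 'listings with a price of 0 were excluded.')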
# calculate interquartile range
q25, q75 = np.percentile(df.price, 25), np.percentile(df.price, 75)
iqr = q75 - q25
#calculate the outlier cutoff
cut_off = iqr * 1.5
lower, upper = q25 - cut_off, q75 + cut_off
print('Percentiles: 25th=%.3f, 75th=%.3f, IQR=%.3f' % (q25, q75, iqr))
# identify outliers
outliers = [x for x in df.price if x < lower or x > upper]
print('Outlier observations:', len(outliers))
#non outliers
non_outliers = [x for x in df.price if x >= lower and x <= upper]
print('Non-outlier observations:', len(non_outliers))
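# The cells above only count the outliers. If a trimmed frame is wanted, the same
# IQR bounds can be applied as shown below; this is a sketch of one option (my
# addition), not a step from the original analysis, and the plots that follow
# keep using df.
df_no_outliers = df[(df['price'] >= lower) & (df['price'] <= upper)]
print(len(df) - len(df_no_outliers), 'listings fall outside the IQR bounds.')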
#Viewing distribution of numeric data
_ = df.hist(figsize=(20,12), grid = False, color = '#FF5A5F')
#Viewing correlation of the numerical values
plt.figure(figsize=(20,12))
abnb_corr = df.corr(numeric_only=True)
_ = sns.heatmap(abnb_corr ,cbar=True,annot=True, cmap="Greens")
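# Optional refinement (my addition): identifier and coordinate columns such as
# host_id, latitude and longitude are not meaningful in a correlation matrix,
# so a version excluding them can be easier to read.
plt.figure(figsize=(20,12))
abnb_corr_subset = df.drop(columns=['host_id', 'latitude', 'longitude']).corr(numeric_only=True)
_ = sns.heatmap(abnb_corr_subset, cbar=True, annot=True, cmap="Greens")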
#Univariate analysis
from wordcloud import WordCloud
text = ' '.join(df['name'].astype(str))
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110, background_color="white").generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
print('There are {} words across the listing names.'.format(len(text.split())))
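# A small complementary sketch (my addition): the most common words in the listing
# names, counted directly instead of rendered as a word cloud. Note this includes
# stopwords, unlike WordCloud's default filtering.
from collections import Counter
word_counts = Counter(text.lower().split())
word_counts.most_common(10)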
#Median Listing Price by borough
median_listing_price = df.groupby('borough')['price'].agg('median').sort_values(ascending = False)
np.round(median_listing_price, 2)
#Plotting median listing price
median_listing_price.plot(kind='bar', title = "Median Listing Price by Borough", figsize=(10,8), color = '#FF5A5F')
_ = plt.xlabel('Borough')
_ = plt.ylabel('Listing Price (USD)')
listings_boroughs = df.groupby('borough')['name'].count().sort_values(ascending = False)
listings_boroughs
#plotting
listings_boroughs.plot(kind = 'bar', title = 'Total Listings by Borough', figsize=(10,8), color = '#FF5A5F')
_ = plt.ylabel('Count')
#Median Listing Price By Room Type
median_price_room_type = df.groupby('room_type')['price'].agg('median').sort_values(ascending = False)
np.round(median_price_room_type,2)
#Plotting
median_price_room_type.plot(kind = 'bar', title = 'Median Listing Price By Room Type', figsize=(10,8), color = '#FF5A5F')
_ = plt.xlabel('Room Type')
_ = plt.ylabel('Listing Price (USD)')
reviews_room_type = df.groupby('room_type')['number_of_reviews'].agg('sum')
reviews_room_type
_ = reviews_room_type.plot(kind = 'bar', title = 'Number of Reviews by Room Type', figsize=(10,8), color = '#FF5A5F')
top_ten_neighborhoods = df.groupby('neighborhood')['price'].agg('median').nlargest(n=10).sort_values(ascending = True)
top_ten_neighborhoods
_ = top_ten_neighborhoods.plot(kind = 'barh', title = 'Top Ten Neighborhoods by Median Listing Price', figsize=(10,8), color = '#FF5A5F')
bottom_ten_neighborhoods = df.groupby('neighborhood')['price'].agg('median').nsmallest(n=10).sort_values(ascending = True)
bottom_ten_neighborhoods
_ = bottom_ten_neighborhoods.plot(kind = 'barh', title = 'Bottom Ten Neighborhoods by Median Listing Price', figsize=(10,8), color = '#FF5A5F')
import urllib.request
#initializing the figure size
plt.figure(figsize=(20,12))
#loading the NYC neighborhoods map image directly from Wikimedia Commons
i=urllib.request.urlopen('https://upload.wikimedia.org/wikipedia/commons/e/ec/Neighbourhoods_New_York_City_Map.PNG')
nyc_img=plt.imread(i)
#scaling the image based on the latitude and longitude max and mins for proper output
plt.imshow(nyc_img,zorder=0,extent=[-74.258, -73.7, 40.49,40.92])
ax=plt.gca()
fig = df.plot(kind = 'scatter', x="longitude", y="latitude", label='availability_365', c = 'availability_365',
ax = ax, cmap=plt.get_cmap('jet'), colorbar=True, alpha=0.4, figsize=(20,12), grid = False)
plt.legend()
plt.show()
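# A variation on the same map plot (my addition, not part of the original notebook):
# coloring the scatter by price instead of availability, clipped at the IQR upper
# bound computed earlier so a handful of extreme prices do not wash out the color scale.
plt.figure(figsize=(20,12))
plt.imshow(nyc_img, zorder=0, extent=[-74.258, -73.7, 40.49, 40.92])
ax = plt.gca()
df.plot(kind='scatter', x='longitude', y='latitude', c=df['price'].clip(upper=upper),
        ax=ax, cmap=plt.get_cmap('jet'), colorbar=True, alpha=0.4, grid=False)
plt.show()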