#importing modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
data = pd.read_csv('desktop/AB_NYC_2019.csv')
data.head()
#viewing the number of rows and columns
data.shape
#Descriptive statistics for numerical values
data.describe()
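# Optional check (an addition, not part of the original walkthrough): data.info()
# summarizes dtypes and non-null counts per column, which helps confirm which
# columns will need imputation or dropping in the cleaning steps below.
data.info()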
#Dropping columns that are unnecessary for our analysis
data.drop(columns=['id', 'host_name', 'calculated_host_listings_count', 'last_review'], inplace=True)
#Renaming columns
data = data.rename(columns={'neighbourhood_group': 'borough',
                            'neighbourhood': 'neighborhood'})
data.columns
#checking for duplicated values
print(data.duplicated().sum(), 'rows are duplicated.')
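# A minimal follow-up sketch (my addition): if any duplicated rows had been found,
# they could be dropped before continuing.
if data.duplicated().any():
    data = data.drop_duplicates().reset_index(drop=True)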
#counting null values for each column
missing_values_count = np.sum(data.isnull())
missing_values_count
total_cells = np.prod(data.shape)
total_missing = missing_values_count.sum()
#calculating the percentage of missing values
percentage_missing = np.round((total_missing / total_cells) * 100, 2)
print("Percentage of missing values in the dataset:", percentage_missing)
#Imputation of missing values with median values
data['reviews_per_month'] = data.reviews_per_month.fillna(data.reviews_per_month.median())
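# Quick sanity check (my addition): confirm the imputation left no nulls in
# reviews_per_month before dropping the remaining incomplete rows.
assert data['reviews_per_month'].isnull().sum() == 0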
#Dropping null values and counting the dropped rows
cleaned_df = data.dropna(axis=0, how='any') #Dropping rows with at least 1 null value
print(len(data) - len(cleaned_df), 'rows with missing values were dropped.')
np.sum(cleaned_df.isnull())
#Excluding prices that are equal to zero
df = cleaned_df[cleaned_df['price'] != 0]
df.head()
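# Optional sanity check (an addition): report how many zero-price listings were excluded.
print(len(cleaned_df) - len(df), 'listings with a price of 0 were excluded.')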
# calculate interquartile range
q25, q75 = np.percentile(df.price, 25), np.percentile(df.price, 75)
iqr = q75 - q25
#calculate the outlier cutoff
cut_off = iqr * 1.5
lower, upper = q25 - cut_off, q75 + cut_off
print('Percentiles: 25th=%.3f, 75th=%.3f, IQR=%.3f' % (q25, q75, iqr))
# identify outliers
outliers = [x for x in df.price if x < lower or x > upper]
print('Outlier observations:', len(outliers))
#non outliers
non_outliers = [x for x in df.price if x >= lower and x <= upper]
print('Non-outlier observations:', len(non_outliers))
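# The cells above only count the outliers. If a trimmed frame is wanted, the same
# IQR bounds can be applied as shown below; this is a sketch of one option (my
# addition), not a step from the original analysis, and the plots that follow
# keep using df.
df_no_outliers = df[(df['price'] >= lower) & (df['price'] <= upper)]
print(len(df) - len(df_no_outliers), 'listings fall outside the IQR bounds.')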
#Viewing distribution of numeric data
_ = df.hist(figsize=(20,12), grid = False, color = '#FF5A5F')
#Viewing correlation of the numerical values
plt.figure(figsize=(20,12))
abnb_corr = df.corr(numeric_only=True)
_ = sns.heatmap(abnb_corr ,cbar=True,annot=True, cmap="Greens")
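# Optional refinement (my addition): identifier and coordinate columns such as
# host_id, latitude and longitude are not meaningful in a correlation matrix,
# so a version excluding them can be easier to read.
plt.figure(figsize=(20,12))
abnb_corr_subset = df.drop(columns=['host_id', 'latitude', 'longitude']).corr(numeric_only=True)
_ = sns.heatmap(abnb_corr_subset, cbar=True, annot=True, cmap="Greens")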
#Univariate analysis
from wordcloud import WordCloud
text = ' '.join(df['name'].astype(str))
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110, background_color="white").generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
print('There are {} words across the listing names.'.format(len(text.split())))
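# A small complementary sketch (my addition): the most common words in the listing
# names, counted directly instead of rendered as a word cloud. Note this includes
# stopwords, unlike WordCloud's default filtering.
from collections import Counter
word_counts = Counter(text.lower().split())
word_counts.most_common(10)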
#Median Listing Price by borough
median_listing_price = df.groupby('borough')['price'].agg('median').sort_values(ascending = False)
np.round(median_listing_price, 2)
#Plotting median listing price
median_listing_price.plot(kind='bar', title = "Median Listing Price by Borough", figsize=(10,8), color = '#FF5A5F')
_ = plt.xlabel('Borough')
_ = plt.ylabel('Listing Price (USD)')
listings_boroughs = df.groupby('borough')['name'].count().sort_values(ascending = False)
listings_boroughs
#plotting
listings_boroughs.plot(kind = 'bar', title = 'Total Listings by Borough', figsize=(10,8), color = '#FF5A5F')
_ = plt.ylabel('Count')
#Median Listing Price By Room Type
median_price_room_type = df.groupby('room_type')['price'].agg('median').sort_values(ascending = False)
np.round(median_price_room_type,2)
#Plotting
median_price_room_type.plot(kind = 'bar', title = 'Median Listing Price By Room Type', figsize=(10,8), color = '#FF5A5F')
_ = plt.xlabel('Room Type')
_ = plt.ylabel('Listing Price (USD)')
reviews_room_type = df.groupby('room_type')['number_of_reviews'].agg('sum')
reviews_room_type
_ = reviews_room_type.plot(kind = 'bar', title = 'Number of Reviews by Room Type', figsize=(10,8), color = '#FF5A5F')
top_ten_neighborhoods = df.groupby('neighborhood')['price'].agg('median').nlargest(n=10).sort_values(ascending = True)
top_ten_neighborhoods
_ = top_ten_neighborhoods.plot(kind = 'barh', title = 'Top Ten Neighborhoods by Median Listing Price', figsize=(10,8), color = '#FF5A5F')
bottom_ten_neighborhoods = df.groupby('neighborhood')['price'].agg('median').nsmallest(n=10).sort_values(ascending = True)
bottom_ten_neighborhoods
_ = bottom_ten_neighborhoods.plot(kind = 'barh', title = 'Bottom Ten Neighborhoods by Median Listing Price', figsize=(10,8), color = '#FF5A5F')
import urllib.request
#initializing the figure size
plt.figure(figsize=(20,12))
#loading the NYC neighborhoods map image directly from Wikimedia Commons
i=urllib.request.urlopen('https://upload.wikimedia.org/wikipedia/commons/e/ec/Neighbourhoods_New_York_City_Map.PNG')
nyc_img=plt.imread(i)
#scaling the image based on the latitude and longitude max and mins for proper output
plt.imshow(nyc_img,zorder=0,extent=[-74.258, -73.7, 40.49,40.92])
ax=plt.gca()
fig = df.plot(kind = 'scatter', x="longitude", y="latitude", label='availability_365', c = 'availability_365',
ax = ax, cmap=plt.get_cmap('jet'), colorbar=True, alpha=0.4, figsize=(20,12), grid = False)
plt.legend()
plt.show()
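# A variation on the same map plot (my addition, not part of the original notebook):
# coloring the scatter by price instead of availability, clipped at the IQR upper
# bound computed earlier so a handful of extreme prices do not wash out the color scale.
plt.figure(figsize=(20,12))
plt.imshow(nyc_img, zorder=0, extent=[-74.258, -73.7, 40.49, 40.92])
ax = plt.gca()
df.plot(kind='scatter', x='longitude', y='latitude', c=df['price'].clip(upper=upper),
        ax=ax, cmap=plt.get_cmap('jet'), colorbar=True, alpha=0.4, grid=False)
plt.show()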