import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Load the raw Airbnb listings; the first CSV row holds the column names.
df_airbnb = pd.read_csv('/work/Airbnbdata/Airbnb_dataset/Airbnb_Open_Data.csv', header=0, sep=',')
df_airbnb.head()
# Which columns contain null values?
df_airbnb.isnull().any()
df_airbnb.info()
# Author's note: `license` only has two non-null values, so it is dropped as irrelevant.
df_airbnb.drop(['license'], axis=1, inplace=True)
# Author's note: the dataset is about New York listings, so every missing country /
# country code is filled with 'United States' / 'US'.
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column accessor: the chained in-place form warns on recent pandas and has no
# effect once Copy-on-Write is enabled.
df_airbnb['country'] = df_airbnb['country'].fillna('United States')
df_airbnb['country code'] = df_airbnb['country code'].fillna('US')
# Share of missing values in the house_rules column.
total_data = df_airbnb.shape[0]
null_data = int(df_airbnb['house_rules'].isnull().sum())
# Scale to a percentage first and round last, so the printed value carries no
# floating-point artefacts (e.g. 51.470000000000006).
percentage = np.round(null_data / total_data * 100, 2)
print('More than {}% of data in house_rules is null-data'.format(percentage))
# Drop house_rules from the frame, but keep its non-null values in a separate
# Series for the text analysis further down.
house_rules = df_airbnb['house_rules'].dropna()
df_airbnb.drop(['house_rules'], axis=1, inplace=True)
# Remaining columns still contain nulls: drop every row that has at least one.
df_airbnb.dropna(inplace=True)
# Identifier columns are not used in the analysis.
df_airbnb.drop(['id', 'host id'], axis=1, inplace=True)
def cast_int(x):
    """Convert a currency-formatted value such as ``'$1,234 '`` to an ``int``.

    Strings have every ``$`` and ``,`` removed and surrounding whitespace
    stripped before conversion. Values that are not strings are passed
    straight to ``int`` so that applying the function a second time to an
    already-converted column does not fail.

    Raises:
        ValueError: if the cleaned text is not a valid integer literal.
    """
    if not isinstance(x, str):
        # Already numeric (e.g. the cell was re-run): nothing to strip.
        return int(x)
    return int(x.replace('$', '').replace(',', '').strip())
# Turn the two money columns from text into integers.
df_airbnb['price'] = df_airbnb['price'].apply(cast_int)
df_airbnb['service fee'] = df_airbnb['service fee'].apply(cast_int)
df_airbnb[['price', 'service fee']].dtypes
df_airbnb.columns
# Shorter, underscore-style labels for the columns used below.
# Keys that match no existing column are ignored by rename.
df_airbnb.rename(
    columns={
        'host_identity_verified': 'host_verified',
        'neighbourhood group': 'Boroughs',
        'room type': 'room_type',
        'Construction year': 'year_built',
        'service fee': 'service_fee',
        'minimum nights': 'min_nights',
        'number of reviwes': '#reviews',
        'last review': 'last_review',
        'number of reviews': '#reviews',
        'reviews per month': 'reviews_month',
        'review rate number': 'review_rate',
        'calculated host listings count': '#guests',
        'availability 365': 'availability_365',
    },
    inplace=True,
)
# Colours: c_1 marks the highlighted category, c_2 every other one.
c_1 = sns.color_palette('coolwarm')[-1]
c_2 = sns.color_palette('bone')[-2]
palette = sns.color_palette('Set2')[:4]
# Per-category palette for the bar chart, with 'Entire home/apt' highlighted.
my_pltt = {x: c_2 for x in df_airbnb['room_type'].unique()}
my_pltt['Entire home/apt'] = c_1
frequency = df_airbnb.room_type.value_counts()
# Name the counts explicitly: the column label produced by value_counts()
# differs between pandas versions ('room_type' before 2.0, 'count' after), so
# renaming by the old label would silently leave no 'freq' column.
df_room_type = frequency.rename('freq').to_frame()
# Pull out every wedge after the two largest; sized to the actual number of
# room types so the pie does not fail when there are not exactly four.
explode = [0 if i < 2 else 0.3 for i in range(len(df_room_type))]
# Canvas
sns.set_theme(style='white')
fig = plt.figure(figsize=(16, 8))
# Bar plot: number of listings per room type
ax1 = fig.add_subplot(121)
ax1 = sns.barplot(data=df_room_type, x=df_room_type.index, y='freq', palette=my_pltt, ax=ax1)
ax1.set_title('The most common\nroom type\n', fontdict={'size': 'x-large', 'weight': 'bold'})
ax1.set_xlabel('room_type')
# Pie chart: share of each room type
ax2 = fig.add_subplot(122)
ax2.pie(data=df_room_type, x='freq', autopct='%.01f%%', pctdistance=1.12, explode=explode, colors=palette)
ax2.legend(labels=df_room_type.index, bbox_to_anchor=(0.75, 0))
ax2.set_title('Composition of ads\nin Airbnb New York\n', fontdict={'size': 'x-large', 'weight': 'bold'})
fig.set_facecolor('#FFFFCB')
fig.show()
# Average price per room type, highest first, rounded to cents.
df_mean_price = (
    df_airbnb.pivot_table(index='room_type', aggfunc={'price': 'mean'})
    .sort_values('price', ascending=False)
    .rename(columns={'price': 'mean_price'})
)
df_mean_price['mean_price'] = df_mean_price['mean_price'].round(2)
# Move the highlight colour from 'Entire home/apt' to 'Hotel room'.
my_pltt['Entire home/apt'] = c_2
my_pltt['Hotel room'] = c_1
# Canvas
fig = plt.figure(figsize=(18, 8))
# Left panel: price distribution per room type
ax1 = fig.add_subplot(121)
ax1 = sns.boxplot(data=df_airbnb, x='room_type', y='price', palette='Set2', ax=ax1)
ax1.set_title('Distribution of price\naccording to room type\n', fontdict={'size': 'large', 'weight': 'bold'})
plt.xticks(fontsize=11)
plt.yticks(fontsize=11)
ax1.set_xlabel('room_type', weight='bold', fontsize=13)
ax1.set_ylabel('price', weight='bold', fontsize=13)
# Right panel: mean price per room type
ax2 = fig.add_subplot(122)
ax2 = sns.barplot(data=df_mean_price, x=df_mean_price.index, y='mean_price', palette=my_pltt, ax=ax2)
ax2.set_title('Average price\naccording room type\n', fontdict={'size': 'large', 'weight': 'bold'})
plt.xticks(fontsize=11)
plt.yticks(fontsize=11)
ax2.set_xlabel('room_type', weight='bold', fontsize=13)
ax2.set_ylabel('mean_price', weight='bold', fontsize=13)
fig.set_facecolor('#FFFFCB')
fig.show()
df_mean_price
# Scatter plot of listing price against service fee on a full-figure axes.
sns.set_theme(style='white')
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax = sns.scatterplot(data=df_airbnb, x='price', y='service_fee', color='#000080', ax=ax)
ax.set_title('Price vs Service fee', fontdict={'size': 'large', 'weight': 'bold'})
fig.set_facecolor('#FFFFCB')
fig.show()
# Side-by-side boxplots of the review_rate and '#guests' columns.
sns.set_theme(style='white')
fig = plt.figure(figsize=(18, 8))
# Left panel: review rate
ax1 = fig.add_subplot(121)
ax1 = sns.boxplot(data=df_airbnb.review_rate, palette='Set2', ax=ax1)
ax1.set_ylabel('review_rate', weight='bold', size=12)
ax1.set_title('Distribution of\nreview rate', fontdict={'size': 'large', 'weight': 'bold'})
# Right panel: '#guests'
ax2 = fig.add_subplot(122)
ax2 = sns.boxplot(data=df_airbnb['#guests'], palette='Set3', ax=ax2)
ax2.set_title('Distribution of\nguests', fontdict={'size': 'large', 'weight': 'bold'})
ax2.set_ylabel('guests', weight='bold', size=12)
fig.set_facecolor('#FFFFCB')
fig.show()
# Mean review rate per host verification status, highest first, to 4 decimals.
df_review = (
    df_airbnb.groupby('host_verified')[['review_rate']]
    .mean()
    .sort_values('review_rate', ascending=False)
)
df_review['review_rate'] = df_review['review_rate'].round(4)
# Canvas and axes
fig = plt.figure()
ax = fig.add_axes([0, 0, 1.2, 0.9])
# Bar plot
ax = sns.barplot(data=df_review, x=df_review.index, y='review_rate', palette='bone', ax=ax)
ax.set_title('Review rate:\nVerified host vs Unconfirmed hosts', fontdict={'size': 'large', 'weight': 'bold'})
fig.set_facecolor('#FFFFCB')
fig.show()
# Upper outlier fence for '#guests': Q3 + 1.5 * IQR.
q_3 = np.percentile(df_airbnb['#guests'], 75)
q_1 = np.percentile(df_airbnb['#guests'], 25)
iqr = q_3 - q_1
outlier = q_3 + (1.5 * iqr)
# Rows strictly below the fence, then total and mean '#guests' per host status.
guests_1 = df_airbnb[df_airbnb['#guests'] < outlier]
df_guests = guests_1.groupby('host_verified').agg({'#guests': ['sum', 'mean']})
# Canvas
fig = plt.figure(figsize=(18, 8))
# Left panel: total guests (first aggregated column)
ax1 = fig.add_subplot(121)
ax1 = sns.barplot(x=df_guests.index, y=df_guests.iloc[:, 0], palette='Set2', ax=ax1)
ax1.set_title('Total guests according to\nkind of host', fontdict={'size': 'x-large', 'weight': 'bold'})
ax1.set_ylabel('Total_guests')
# Right panel: average guests (second aggregated column)
ax2 = fig.add_subplot(122)
ax2 = sns.barplot(x=df_guests.index, y=df_guests.iloc[:, 1], palette='Set2', ax=ax2)
ax2.set_title('Average guests according to\nkind of host', fontdict={'size': 'x-large', 'weight': 'bold'})
ax2.set_ylabel('avg_guests')
fig.set_facecolor('#FFFFCB')
fig.show()
df_guests.iloc[:, 1] = df_guests.iloc[:, 1].round(4)
df_guests
# Share of rows that survive the outlier filter.
total = df_airbnb.shape[0]
no_outlier = guests_1.shape[0]
percentage = np.round((no_outlier / total) * 100, 2)
# Report the computed share; the message previously hard-coded a number and
# had no placeholder, so the formatted value was never shown.
print("After having removed the outliers, we've kept {}% of the data".format(percentage))
# Inspect the borough labels, fix the misspelt 'brookln', and inspect again.
df_airbnb.Boroughs.unique()
df_airbnb.loc[df_airbnb['Boroughs'] == 'brookln', 'Boroughs'] = 'Brooklyn'
df_airbnb.Boroughs.unique()
# Mean review rate per borough, highest first.
review_borough = (
    df_airbnb.groupby('Boroughs')[['review_rate']]
    .mean()
    .sort_values('review_rate', ascending=False)
)
# Palette: every borough in c_2, 'Staten Island' highlighted in c_1.
my_pltt2 = {x: c_2 for x in df_airbnb.Boroughs.unique()}
my_pltt2['Staten Island'] = c_1
# Canvas and axes
sns.set_theme(style='white')
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
# Bar plot
ax = sns.barplot(data=review_borough, x=review_borough.index, y='review_rate', palette=my_pltt2, ax=ax)
ax.set_title('Review rate according\nto borough', fontdict={'size': 'large', 'weight': 'bold'})
fig.set_facecolor('#FFFFCB')
fig.show()
review_borough
# Boxplot of the raw min_nights column (author's note: it has many outliers).
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax = sns.boxplot(data=df_airbnb, y='min_nights', palette='Set2', ax=ax)
ax.set_title('Distribution of\nminimum nights variable', fontdict={'size': 'large', 'weight': 'bold'})
ax.set_ylabel('min_nights')
fig.set_facecolor('#FFFFCB')
fig.show()
# Author's note: the column holds values below zero and very large
# quantities, so those rows are removed.
df_airbnb.min_nights.describe()
# Upper fence: Q3 + 1.5 * IQR.
q_25, q_75 = np.percentile(df_airbnb.min_nights, [25, 75])
iqr = q_75 - q_25
outlier = q_75 + (1.5*iqr)
# Work on a copy; keep rows with 0 < min_nights < fence.
df_min_nights = df_airbnb.copy()
cond_1 = df_min_nights.min_nights > 0
cond_2 = df_min_nights.min_nights < outlier
df_min_nights = df_min_nights.loc[cond_1 & cond_2]
# Canvas and axes
sns.set_theme(style='white')
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
# Boxplot of the filtered column
ax = sns.boxplot(data=df_min_nights, y='min_nights', palette='Set3', ax=ax)
ax.set_title('New distribution of\nminmum nights')
fig.set_facecolor('#FFFFCB')
fig.show()
df_min_nights.min_nights.describe()
# Keep listings with at most 6 minimum nights. Take an explicit copy so the
# column assignment below writes to an independent frame rather than to a
# slice of the previous one (which triggers SettingWithCopyWarning).
df_min_nights = df_min_nights[df_min_nights.min_nights <= 6].copy()
df_min_nights['min_nights'] = df_min_nights['min_nights'].astype(int)
# Palette: every value in c_2, the value 2 highlighted in c_1.
my_pltt3 = {x: c_2 for x in df_min_nights.min_nights.unique()}
my_pltt3[2] = c_1
# Canvas
sns.set_style(style='white')
fig = plt.figure(figsize=(18, 9))
# Left panel: boxplot of the remaining values
ax1 = fig.add_subplot(121)
ax1 = sns.boxplot(data=df_min_nights, y='min_nights', palette='Set2', ax=ax1)
ax1.set_title('Distribution without\noutliers', fontdict={'size': 'large', 'weight': 'bold'})
# Right panel: number of listings per minimum-nights value
ax2 = fig.add_subplot(122)
ax2 = sns.countplot(data=df_min_nights, x='min_nights', palette=my_pltt3, ax=ax2)
ax2.set_title('Minimum nights offered\nby hosts', fontdict={'size': 'large', 'weight': 'bold'})
fig.set_facecolor('#FFFFCB')
fig.show()
# Boxplot of the raw availability_365 column to look for outliers.
fig = plt.figure()
ax = fig.add_axes([0, 0, 0.8, 0.8])
ax = sns.boxplot(data=df_airbnb, y='availability_365', ax=ax)
ax.set_title('Distribution of\navailability_365 variable', fontdict={'size': 'large', 'weight': 'bold'})
fig.set_facecolor('#FFFFCB')
fig.show()
df_airbnb.availability_365.describe()
# Copy of the data restricted to rows with 0 < availability_365 <= 365.
df_availability = df_airbnb.copy()
cond_1 = df_availability.availability_365 > 0
cond_2 = df_availability.availability_365 <= 365
df_availability = df_availability.loc[cond_1 & cond_2]
# Canvas and axes
fig = plt.figure()
ax = fig.add_axes([0, 0, 0.8, 0.8])
# Boxplot of the filtered column
ax = sns.boxplot(data=df_availability, y='availability_365', palette='Set3', ax=ax)
ax.set_title('Consistent data', fontdict={'size': 'large', 'weight': 'bold'})
fig.set_facecolor('#FFFFCB')
fig.show()
df_availability.availability_365.describe()
# Split availability into two half-year bins: (0, 182] and (182, 365].
# The lower edge is 0, not 1: pd.cut bins are open on the left, so an edge of 1
# would turn every listing with availability_365 == 1 into NaN and drop it.
intervals = pd.cut(df_availability.availability_365, [0, 182, 365])
# assign() returns a new frame, which avoids writing a column onto a slice.
df_availability = df_availability.assign(intervals=intervals)
# observed=False keeps both bins in the result even if one of them is empty.
df_intervals_365 = df_availability.groupby('intervals', observed=False).agg({'availability_365': 'count'})
# Canvas and axes
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
# Pie chart: share of listings per availability bin
ax.pie(data=df_intervals_365, x='availability_365', autopct='%.01f%%', pctdistance=1.12,
       colors=[c_1, c_2])
ax.legend(labels=df_intervals_365.index, bbox_to_anchor=(0.65, 0))
ax.set_title('Availability - 365', fontdict={'size': 'x-large', 'weight': 'bold'})
fig.set_facecolor('#FFFFCB')
fig.show()
df_intervals_365
# Pearson correlation between price and construction year, computed by hand:
# sample covariance divided by the product of the sample standard deviations.
price = df_airbnb['price']
year_built = df_airbnb['year_built']
covariance = np.cov(price, year_built)[0, 1]
coefficient_corr = np.round(covariance / (price.std() * year_built.std()), 4)
coefficient_corr
# Most frequent construction years among the cheapest listings ...
price_min = df_airbnb['price'].min()
df_airbnb.loc[df_airbnb['price'] == price_min, ['year_built']].value_counts().head(5)
# ... and among the most expensive ones.
price_max = df_airbnb['price'].max()
df_airbnb.loc[df_airbnb['price'] == price_max, ['year_built']].value_counts().head(5)
# Boxplot of price per borough.
sns.set_theme(style='white')
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax = sns.boxplot(data=df_airbnb, x='Boroughs', y='price', palette='Set2', ax=ax)
ax.set_title('Distribution of prices\naccording to borough', fontdict={'size': 'large', 'weight': 'bold'})
fig.set_facecolor('#FFFFCB')
fig.show()
# Mean price per borough, highest first, rounded to cents.
df_price_mean = (
    df_airbnb.groupby('Boroughs')[['price']]
    .mean()
    .sort_values('price', ascending=False)
)
df_price_mean['price'] = df_price_mean['price'].round(2)
# Palette: every borough in c_2, 'Queens' highlighted in c_1.
my_pltt4 = {x: c_2 for x in df_airbnb.Boroughs.unique()}
my_pltt4['Queens'] = c_1
# Canvas and axes
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
# Bar plot
ax = sns.barplot(data=df_price_mean, x=df_price_mean.index, y='price', palette=my_pltt4, ax=ax)
ax.set_title('Average price according\nto borough', fontdict={'size': 'large', 'weight': 'bold'})
fig.set_facecolor('#FFFFCB')
fig.show()
df_price_mean
# Frequency of each instant_bookable value. The counts are named 'freq'
# explicitly because the label value_counts() gives its result depends on the
# pandas version ('instant_bookable' before 2.0, 'count' after); renaming by
# the old label would leave no 'freq' column for the pie chart.
df = df_airbnb.instant_bookable.value_counts().rename('freq').to_frame()
# Canvas and axes
sns.set_theme(style='white')
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
# Pie chart
ax.pie(data=df, x='freq', autopct='%.01f%%', pctdistance=1.16, colors=[c_1, c_2], startangle=90)
ax.legend(labels=df.index, bbox_to_anchor=(1, 1))
ax.set_title('Instat bookable\nin Airbnb New York', fontdict={'size': 'x-large', 'weight': 'bold'})
fig.set_facecolor('#FFFFCB')
fig.show()
df
# Horizontal count plot: listings per borough, split by cancellation policy.
sns.set_theme(style='white')
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax = sns.countplot(
    data=df_airbnb,
    y='Boroughs',
    hue='cancellation_policy',
    palette='rocket',
    orient='h',
    ax=ax,
)
ax.set_title('The most common cancellation\naccording to borough', fontdict={'size': 'large', 'weight': 'bold'})
fig.set_facecolor('#FFFFCB')
fig.show()
# Table of listing counts with one row per borough and one column per
# cancellation policy.
# NOTE(review): the aggregated column is also the `columns` grouping key;
# confirm this call behaves the same on the pandas version in use.
df_cancel = df_airbnb.pivot_table(index='Boroughs', columns='cancellation_policy', aggfunc={'cancellation_policy':'count'})
# Relabel the columns positionally; assumes exactly three policies come out in
# this order - TODO confirm against df_airbnb.cancellation_policy.unique().
df_cancel.columns = ['flexible', 'moderate', 'strict']
df_cancel
from wordcloud import WordCloud
# Build the cloud from the full text of every rule. str(house_rules) would only
# give the truncated Series repr (a handful of rows plus the name/length/dtype
# footer), so the rules are joined into one corpus instead.
wordcloud = WordCloud(background_color='white', width=800, height=400).generate(
    ' '.join(house_rules.astype(str))
)
plt.figure(figsize=(10, 8))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
# Rule: no smoking.
# Both counts are case-insensitive and count each rule text once. Previously
# the denominator matched only lowercase 'smoking' while the numerator also
# matched 'NO SMOKING', and a text containing several spellings was counted
# once per spelling, so the share could be overstated.
smoking = int(house_rules.str.contains('smoking', case=False).sum())
no_smoking = int(house_rules.str.contains('no smoking', case=False).sum())
# Guard against an empty denominator when no rule mentions smoking at all.
pct = np.round((no_smoking / smoking)*100, 2) if smoking else 0.0
print("""
In the column rules, {}% of values containing the word 'smoking' indicate that smoking is not
allowed inside property.
""".format(pct))
#Rule: Pets
def no_pets(rules):
    """Count the entries of ``rules`` that forbid pets.

    An entry counts when it contains 'No Pets' or 'NO Pets' (case-sensitive,
    literal match). Each entry is counted at most once even if it contains
    both spellings, and missing values are treated as non-matching.

    Args:
        rules: pandas Series of rule texts.

    Returns:
        Number of matching entries as an ``int``.
    """
    key_words = ['No Pets', 'NO Pets']
    # Accumulate one boolean mask over all keywords so a row that matches
    # several of them is not counted more than once.
    matches = pd.Series(False, index=rules.index)
    for word in key_words:
        matches |= rules.str.contains(word, regex=False, na=False)
    return int(matches.sum())
def allow_pets(rules):
    """Count the entries of ``rules`` that allow pets.

    An entry counts when it contains any of 'Pets Allowed', 'Pets are allowed',
    'Pet Friendly' or 'Pets allowed' (case-sensitive, literal match). Each
    entry is counted at most once even if it contains several of the phrases,
    and missing values are treated as non-matching.

    Args:
        rules: pandas Series of rule texts.

    Returns:
        Number of matching entries as an ``int``.
    """
    key_words = ['Pets Allowed', 'Pets are allowed', 'Pet Friendly', 'Pets allowed']
    # Accumulate one boolean mask over all keywords so a row that matches
    # several of them is not counted more than once.
    matches = pd.Series(False, index=rules.index)
    for word in key_words:
        matches |= rules.str.contains(word, regex=False, na=False)
    return int(matches.sum())
# Store the counts under their own names. Rebinding `no_pets` / `allow_pets`
# to the integer results would shadow the helper functions, so running this
# part a second time would fail with "'int' object is not callable".
no_pets_count = no_pets(house_rules)
allow_pets_count = allow_pets(house_rules)
# One row per rule kind, single 'freq' column.
df_pets = pd.DataFrame({
    'No pets': no_pets_count,
    'Allow pets': allow_pets_count
}, index=['freq'])
df_pets = df_pets.T
df_pets
# Canvas and axes
sns.set_theme(style='white')
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
# Pie chart
ax.pie(data=df_pets, x='freq', colors=[c_1, c_2], autopct='%.01f%%', pctdistance=1.12, startangle=90)
ax.legend(labels=df_pets.index, bbox_to_anchor=(1, 1))
ax.set_title('Rules about\npets', fontdict={'size': 'x-large', 'weight': 'bold'})
fig.set_facecolor('#FFFFCB')
fig.show()
# Share of the rule texts mentioning 'Pet' that the keyword lists classified.
pets = house_rules[house_rules.str.contains('Pet')].shape[0]
# Guard against an empty denominator when no rule mentions 'Pet'.
pct = np.round(((no_pets_count + allow_pets_count) / pets)*100, 2) if pets else 0.0
print("""
It has only been collected {}% of values that contain the word 'Pet'.
Result:
* The most common is that the pets are not allowed.
""".format(pct))