import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import pingouin as pg
import altair as alt
alt.data_transformers.disable_max_rows()
from sklearn.feature_extraction.text import TfidfVectorizer
airbnb = pd.read_csv('AB_NYC_2019.csv')
airbnb = airbnb.dropna(subset=['name'])
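# Quick sanity check on the raw data (the public 2019 snapshot has roughly 49k listings)
airbnb.shape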
background = gpd.read_file('https://raw.githubusercontent.com/dwillis/nyc-maps/master/boroughs.geojson')
background = alt.Chart(background).mark_geoshape().encode(
    color=alt.Color('BoroName', legend=None, scale=alt.Scale(scheme='pastel1')),
    tooltip=['BoroName'],
)
# Filter the data so the price map stays clean and readable
data = airbnb.groupby('neighbourhood_group').filter(lambda x: len(x) > 10)
data = data.loc[(data.price < 400) & (data.price > 20)]  # drop price extremes that would wash out the colour scale
data = data.sample(3000, random_state=0)
data.sort_values('price', inplace=True)  # plot cheap listings first so expensive ones draw on top
listings = alt.Chart(data).mark_circle(size=10).encode(
    longitude='longitude:Q',
    latitude='latitude:Q',
    color=alt.Color('price:Q', scale=alt.Scale(scheme='plasma')),
    tooltip=['price', 'neighbourhood'],
    opacity=alt.value(0.5),
).properties(
    width=600,
    height=400
)
background+listings
airbnb.groupby('neighbourhood_group').price.describe()[['count','mean','50%','max']]
a = airbnb.loc[airbnb.neighbourhood_group == 'Manhattan'].groupby('neighbourhood').price.describe()
a.loc[a['count'] > 10].sort_values('50%')[['count','mean','50%','max']]
airbnb.groupby('room_type').price.describe()[['count','mean','50%','max']]
mta = pd.read_csv('mta_station_locations.csv')
mta.rename(columns={
    'GTFS Latitude': 'latitude',
    'GTFS Longitude': 'longitude'
}, inplace=True)
mta.drop_duplicates(subset=['Stop Name'], inplace=True)  # drop repeated stations (one station can serve multiple lines)
data = airbnb.loc[airbnb.price < 300]
listings = alt.Chart(data.sample(3000, random_state=0)).mark_circle(size=4).encode(
    longitude='longitude:Q',
    latitude='latitude:Q',
    color=alt.Color('price:Q', scale=alt.Scale(scheme='plasma'), legend=None),
    tooltip=['price', 'neighbourhood'],
).properties(
    width=600,
    height=400
)
stations = alt.Chart(mta).mark_point(size=10).encode(
    longitude='longitude:Q',
    latitude='latitude:Q',
    tooltip=['Stop Name'],
    color=alt.value('green'),
)
background+stations+listings
# Convert to GeoDataFrames. Shapely points are (x, y) = (longitude, latitude),
# so build them with points_from_xy rather than Point(lat, lon)
airbnb['geometry'] = gpd.points_from_xy(airbnb.longitude, airbnb.latitude)
mta['geometry'] = gpd.points_from_xy(mta.longitude, mta.latitude)
airbnb = gpd.GeoDataFrame(airbnb, crs='EPSG:4326')
mta = gpd.GeoDataFrame(mta, crs='EPSG:4326')
airbnb = airbnb.to_crs('EPSG:32116')
mta = mta.to_crs('EPSG:32116')
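# Sanity check: after reprojection, coordinates are planar eastings/northings in
# metres (EPSG:32116 is a metre-based NAD83 New York State Plane zone), so the
# .distance() calls below return metres
airbnb.total_bounds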
sample = airbnb.loc[airbnb.price < 300]
sample = sample.sample(3000, random_state=0)
dists = {}
# Get the distance from every listing to every station (one column per station)
for _, station in mta.iterrows():
    g = station.geometry
    name = station['Stop Name']
    d = sample.distance(g)
    dists[name] = d
# For each listing, only keep the distance to the nearest station
sample['d_station'] = pd.DataFrame(dists).min(axis=1)
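# The loop above scales as listings x stations. A vectorized alternative is
# sketched here, assuming geopandas >= 0.10 (which provides sjoin_nearest; note
# that exact distance ties would duplicate rows):
nearest = gpd.sjoin_nearest(sample, mta[['Stop Name', 'geometry']], distance_col='d_nearest')
nearest[['Stop Name', 'd_nearest']].head()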
# Drop the geometry column so Altair serializes plain rows, and seed the sample
alt.Chart(sample.sample(1000, random_state=0).drop(columns='geometry')).mark_circle(size=10).encode(
    x=alt.X('d_station', scale=alt.Scale(type='log')),
    y='price',
    color='neighbourhood_group',
    opacity=alt.value(1)
)
sample.groupby(sample.d_station > 500).price.describe()  # within vs. beyond 500 m of a station
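# Whether the near/far price distributions actually differ can be checked with a
# nonparametric Mann-Whitney U test via pingouin (imported as pg above); a
# sketch, assuming a recent pingouin where mwu() takes an `alternative` argument
near = sample.loc[sample.d_station <= 500, 'price']
far = sample.loc[sample.d_station > 500, 'price']
pg.mwu(near, far, alternative='two-sided')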
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(airbnb.name)
# Keep the listings' index so the later column assignment aligns row-for-row;
# note that get_feature_names() was removed in scikit-learn 1.2
df_tfidf = pd.DataFrame(tfidf.toarray(), columns=vectorizer.get_feature_names_out(), index=airbnb.index)
# Filter out place names. These are less relevant to this analysis.
places = ['greenpoint','harlem','upper','soho','chelsea','bushwick','midtown','williamsburg','prospect','york']
df_tfidf.drop(places, axis=1, inplace=True)
airbnb['tfidf_max'] = df_tfidf.idxmax(axis=1) # For each listing, get the most "important" word
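# Peek at the most common "top words" across all listings
airbnb.tfidf_max.value_counts().head(10)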
data = airbnb.loc[airbnb.room_type == 'Entire home/apt'].copy()
# Call listings at $300+ per night "expensive" and $200 or less "cheap";
# drop the ambiguous middle band
data = data.loc[(data.price >= 300) | (data.price <= 200)]
data['is_expensive'] = data.price >= 300
# Top "most important" words among cheap vs. expensive entire homes/apartments
data.groupby(['room_type', 'is_expensive']).tfidf_max.apply(lambda x: x.value_counts().head(15))