import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import pingouin as pg
import altair as alt
alt.data_transformers.disable_max_rows()
from sklearn.feature_extraction.text import TfidfVectorizer
Run to view results
airbnb = pd.read_csv('AB_NYC_2019.csv')
airbnb = airbnb.dropna(subset=['name'])  # drop listings with no name (needed for the TF-IDF step later)
Run to view results
background = gpd.read_file('https://raw.githubusercontent.com/dwillis/nyc-maps/master/boroughs.geojson')
background = alt.Chart(background).mark_geoshape().encode(
    color=alt.Color('BoroName', legend=None, scale=alt.Scale(scheme='pastel1')),
    tooltip=['BoroName'],
)
# Filter the data so we get a nice clean price map: keep borough groups with more than
# 10 listings, drop extreme prices, then sample 3,000 listings and sort by price so the
# pricier ones are drawn on top
data = airbnb.groupby('neighbourhood_group').filter(lambda x: len(x) > 10)
data = data.loc[(data.price < 400) & (data.price > 20)]
data = data.sample(3000, random_state=0)
data.sort_values('price', inplace=True)
listings = alt.Chart(data).mark_circle(size=10).encode(
    longitude='longitude:Q',
    latitude='latitude:Q',
    color=alt.Color('price', scale=alt.Scale(scheme='plasma')),
    tooltip=['price', 'neighbourhood'],
    opacity=alt.value(0.5),
).properties(
    width=600,
    height=400
)
background+listings
Run to view results
airbnb.groupby('neighbourhood_group').price.describe()[['count','mean','50%','max']]
Run to view results
a = airbnb.loc[airbnb.neighbourhood_group == 'Manhattan'].groupby('neighbourhood').price.describe()
a.loc[a['count'] > 10].sort_values('50%')[['count','mean','50%','max']]
Run to view results
airbnb.groupby('room_type').price.describe()[['count','mean','50%','max']]
Run to view results
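# A possible follow-up to the room-type summaries above (using the pingouin import at the
# top of the notebook; this is just a sketch, not needed for the rest of the analysis):
# a Kruskal-Wallis test, chosen because listing prices are heavily right-skewed.
pg.kruskal(data=airbnb, dv='price', between='room_type')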
mta = pd.read_csv('mta_station_locations.csv')
mta.rename(columns={
    'GTFS Latitude': 'latitude',
    'GTFS Longitude': 'longitude'
}, inplace=True)
mta.drop_duplicates(subset=['Stop Name'], inplace=True)  # Drop repeated stations (one station can appear on multiple lines)
Run to view results
data = airbnb.loc[airbnb.price < 300]
listings = alt.Chart(data.sample(3000, random_state=0)).mark_circle(size=4).encode(
    longitude='longitude:Q',
    latitude='latitude:Q',
    color=alt.Color('price:Q', scale=alt.Scale(scheme='plasma'), legend=None),
    tooltip=['price', 'neighbourhood'],
).properties(
    width=600,
    height=400
)
stations = alt.Chart(mta).mark_point(size=10).encode(
    longitude='longitude:Q',
    latitude='latitude:Q',
    tooltip=['Stop Name'],
    color=alt.value('green'),
)
background+stations+listings
Run to view results
# Convert to GeoDataFrames. Shapely Points take (x, y), i.e. (longitude, latitude).
airbnb['geometry'] = [Point(lon, lat) for lon, lat in zip(airbnb.longitude, airbnb.latitude)]
mta['geometry'] = [Point(lon, lat) for lon, lat in zip(mta.longitude, mta.latitude)]
airbnb = gpd.GeoDataFrame(airbnb, crs='EPSG:4326')
mta = gpd.GeoDataFrame(mta, crs='EPSG:4326')
# Reproject to a New York state plane CRS (metres) so .distance() returns metres
airbnb = airbnb.to_crs('EPSG:32116')
mta = mta.to_crs('EPSG:32116')
Run to view results
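# Quick optional sanity check: the projected CRS should report metres, so the station
# distances computed below come out in metres rather than degrees.
print(airbnb.crs.axis_info[0].unit_name)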
sample = airbnb.loc[airbnb.price < 300]
sample = sample.sample(3000, random_state=0)
dists = {}
# Get the distance from every listing to every station
for i, station in mta.iterrows():
    g = station.geometry
    name = station['Stop Name']
    d = sample.distance(g)
    dists[name] = d
# For each listing, only keep the distance to the nearest station
sample['d_station'] = pd.DataFrame(dists).min(axis=1)
Run to view results
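# Optional alternative to the loop above: with GeoPandas >= 0.10, the nearest-station
# distance can be computed in a single vectorized spatial join. This is a sketch using the
# same columns as above; 'd_station_near' is just an illustrative name for the new column.
nearest = gpd.sjoin_nearest(sample, mta[['Stop Name', 'geometry']], distance_col='d_station_near')
nearest[['d_station', 'd_station_near']].head()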
alt.Chart(sample.sample(1000)).mark_circle(size=10).encode(
    x=alt.X('d_station', scale=alt.Scale(type='log')),
    y='price',
    color='neighbourhood_group',
    opacity=alt.value(1),
)
Run to view results
# Compare price summaries for listings within 500 m of a station vs. farther away
sample.groupby(sample.d_station > 500).price.describe()
Run to view results
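# A possible significance check for the split above (again via pingouin, as a sketch):
# a Mann-Whitney U test on prices for listings within 500 m of a station vs. farther away.
near = sample.loc[sample.d_station <= 500, 'price']
far = sample.loc[sample.d_station > 500, 'price']
pg.mwu(near, far, alternative='two-sided')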
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(airbnb.name)
# get_feature_names_out() replaces get_feature_names(), which was removed in scikit-learn 1.2
df_tfidf = pd.DataFrame(tfidf.toarray(), columns=vectorizer.get_feature_names_out())
# Filter out place names. These are less relevant to this analysis.
places = ['greenpoint','harlem','upper','soho','chelsea','bushwick','midtown','williamsburg','prospect','york']
df_tfidf.drop(places,axis=1, inplace=True)
airbnb['tfidf_max'] = df_tfidf.idxmax(axis=1) # For each listing, get the most "important" word
Run to view results
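# Optional sanity check on the TF-IDF step: the words with the highest average weight
# across all listing names (place names already dropped above).
df_tfidf.mean().sort_values(ascending=False).head(20)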
data = airbnb.loc[airbnb.room_type == 'Entire home/apt'].copy()
# Let's say that prices of $300 and above are "expensive" and $200 or below are not;
# drop the in-between band so the two groups are well separated
data = data.loc[(data.price >= 300) | (data.price <= 200)]
data['is_expensive'] = data.price >= 300
data.groupby(['room_type', 'is_expensive']).tfidf_max.apply(lambda x: x.value_counts().head(15))
Run to view results