import pandas as pd
pd.__version__
# Download a sample file from http://insideairbnb.com/
! wget http://data.insideairbnb.com/united-states/fl/broward-county/2022-06-17/visualisations/listings.csv -O listings.csv
# read the airbnb NYC listings csv file
airbnb = pd.read_csv("listings.csv")
# display the pandas DataFrame
display(airbnb)
# View first few entries
airbnb
# View last few entries
airbnb.tail()
# Results for a single column
airbnb['name']
airbnb.columns
# results for multiple columns
hosts = airbnb[['host_id', 'host_name']]
hosts.head()
# Show the data types for each column
airbnb.dtypes
# Change the type of a column to datetime
airbnb['last_review'] = pd.to_datetime(airbnb['last_review'])
airbnb.dtypes
# extract the year from a datetime series
airbnb['year'] = airbnb['last_review'].dt.year
airbnb['year'].head()
# Strip leading and trailing spaces from a string series
airbnb['name'] = airbnb['name'].str.strip()
airbnb['name'].tail()
# uppercase all strings in a series
airbnb['name_upper'] = airbnb['name'].str.upper()
airbnb['name_upper'].tail()
# lowercase all strings in a series
airbnb['name_lower'] = airbnb['name'].str.lower()
airbnb['name_lower'].tail()
# calculate using two columns
airbnb['min_revenue'] = airbnb['minimum_nights'] * airbnb['price']
airbnb[['minimum_nights', 'price', 'min_revenue']].head()
# get the mean price
airbnb['price'].mean()
# get the median price
airbnb['price'].median()
airbnb['price'].std()
airbnb['price'].var()
# get the mean grouped by type of room
airbnb[['room_type', 'price']].groupby('room_type', as_index=False).mean().round(2)
# get the median grouped by type of room
airbnb[['room_type', 'price']].groupby('room_type', as_index=False).median()
# get all rows with price < 1000
airbnb_under_1000 = airbnb[airbnb['price'] < 1000]
airbnb_under_1000.head()
# get all rows with price < 1000 and year equal to 2020
airbnb_2019_under_1000 = airbnb[(airbnb['price'] < 1000) & (airbnb['year'] == 2020)]
airbnb_2019_under_1000.head()
# distribution of prices under $1000
ax = airbnb_under_1000['price'].plot.hist(bins=40)
import numpy as np
pd.Series([1,3,5,6])
pd.Series([1,3,5,6], index=['A1','A2','A3','A4'])
a = {'A': 5, 'B': 7}
s = pd.Series(a)
s
a = np.random.randn(100)*5+100
date = pd.date_range('20220101',periods=100)
s = pd.Series(a,index=date)
s
a = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
b = pd.Series([4, 3, 2, 1], index=['d', 'c', 'b', 'a'])
a + b # different from Python list
a - b
a * b
a/b
date = pd.date_range('20220101',periods=20)
s = pd.Series(np.random.randn(20),index=date)
# Slice out the data from 2022-01-05 to 2022-01-10?
d = [[1,2],[3,4]]
df = pd.DataFrame(d,index=['r1','r2'],columns=['a','b'])
df
import numpy as np
d = np.arange(24).reshape(6,4)
d
df = pd.DataFrame(d, index=np.arange(1,7), columns=list('ABCD'))
df
pd.DataFrame(
{
'name': ['Ally','Jane','Belinda'],
'height':[160,155,163],
},
columns = ['name','height'],
index = ['A1','A2','A3']
)
date = pd.date_range('20170101',periods=6)
s1 = pd.Series(np.random.randn(6),index=date)
s2 = pd.Series(np.random.randn(6),index=date)
df = pd.DataFrame({'Asia':s1,'Europe':s2})
df
a = [[3,4],[5,6]]
b = [[6,5],[4,3]]
a2 = pd.DataFrame(a,index=[1,2],columns=['d','b'])
a2
b2 = pd.DataFrame(b,index=[3,2],columns=['c','b'])
b2
print(a2+b2)
from pandas import DataFrame
my_df = DataFrame(data = np.random.randn(16).round(2).reshape(4,4),
index = ['r'+str(i) for i in range(1, 5)],
columns = ['c'+str(i) for i in range(1, 5)])
my_df
my_df.T
my_df.loc[['r1', 'r4'], ['c3', 'c4']]
my_df.iloc[[0, 3], [2, 3]]
!head -5 listings.csv
%ls
[x for x in os.listdir(os.getcwd()) if 'csv' in x]
!mkdir data
%ls
airbnb.to_csv('./data/listings.csv')
os.listdir(os.getcwd() + '/data')
%cd data
airbnb1 = pd.read_csv('listings.csv')
airbnb1.sum(axis = 0)
airbnb1.sum(axis = 1)
airbnb1.fillna(0).describe().round(1)
airbnb1.head()
airbnb1.set_index('id', inplace=True)
airbnb1.head()
airbnb1.room_type.value_counts()
airbnb1_grouped = airbnb1.groupby("room_type")
len(airbnb1_grouped)
for i in ['Entire home/apt', 'Private room', 'Shared room', "Hotel room"]:
print(airbnb1_grouped.get_group(i).shape)
#airbnb1_grouped
#.apply(lambda x: x[['host_name', 'price', 'room_type']]
#.sort_values(by = 'price', ascending = False)
#.iloc[:3,:])
airbnb1_grouped.apply(lambda x: x[['host_name', 'price', 'room_type']].sort_values(by = 'price', ascending = False).iloc[:3,:])
airbnb1.groupby('room_type').apply(lambda x: x['price'].describe())
airbnb.groupby('room_type')['price'].mean()
airbnb1.groupby(['room_type', 'neighbourhood'])['price'].mean()
airbnb1.groupby(['room_type', 'neighbourhood'])['price'].mean().unstack()
# pd.pivot_table(data = airbnb1,
# index = 'room_type',
# values = 'price',
# aggfunc = 'mean')
pd.pivot_table(data = airbnb1,
index = 'room_type',
values = 'price',
aggfunc = 'mean')
pd.pivot_table(data = airbnb1,
index = 'room_type',
columns = 'neighbourhood',
values = 'price',
aggfunc = 'mean')
%timeit airbnb1.groupby('room_type')['price'].mean()
%timeit pd.pivot_table(data = airbnb1, index= 'room_type', values='price', aggfunc='mean')
%timeit airbnb1.groupby(['room_type', 'neighbourhood'])['price'].mean().unstack()
%timeit pd.pivot_table(data=airbnb1, index='room_type', columns='neighbourhood', values='price', aggfunc='mean')