Pandas
import pandas as pd
pd.__version__
# Download a sample file from http://insideairbnb.com/
! wget http://data.insideairbnb.com/united-states/fl/broward-county/2022-06-17/visualisations/listings.csv -O listings.csv
# read the airbnb NYC listings csv file
airbnb = pd.read_csv('listings.csv')
# View last few entries
airbnb.tail(10)
# Results fpr a single column
airbnb['name']
# View first few entries
airbnb.head(5)
# Change the type of a column to datetime
airbnb['last_review'] = pd.to_datetime(airbnb['last_review'])
airbnb.dtypes
# extract the year from a datetime series
airbnb['year'] = airbnb['last_review'].dt.year
airbnb['year'].head()
Series String Function
# Strip leading and trailing spaces from a string series
airbnb['name'] = airbnb['name'].str.strip()
airbnb['name'].tail()
# uppercase all strings in a series
airbnb['name_upper'] = airbnb['name'].str.upper()
airbnb['name_upper'].tail()
# lowercase all strings in a series
airbnb['name_lower'] = airbnb['name'].str.lower()
airbnb['name_lower'].tail()
Derived Columns
# calculate using two columns
airbnb['min_revenue'] = airbnb['minimum_nights'] * airbnb['price']
airbnb[['minimum_nights', 'price', 'min_revenue']].head()
Summary statistics
airbnb['price'].var()
airbnb['price'].std()
airbnb['price'].mean()
airbnb['price'].median()
Grouped Statisitcs
# get the mean grouped by type of room
airbnb[['room_type', 'price']].groupby('room_type', as_index=False).mean()
# get the median group by type of room
airbnb[['room_type', 'price']].groupby('room_type', as_index=False).median()
Filtering Data
# get all rows with price < 1000
airbnb_under_1000 = airbnb[airbnb['price'] < 1000]
airbnb_under_1000.head()
# get all rows with price <1000 and year equal to 2020
airbnb_2019_under_1000 = airbnb[(airbnb['price'] < 1000) & (airbnb['year'] == 2020)]
airbnb_2019_under_1000.head()
Plotting
# distrubtion of pices under $1000
ax = airbnb_under_1000['price'].plot.hist(bins=40)
Pandas dataframe
a collection of series
d = [[1,2],[3,4]]
df = pd.DataFrame(d,index=['r1','r2'],columns=['a' , 'b'])
df
import numpy as np
d = np.arange(24).reshape(6,4)
d
pd.DataFrame(
{
'name' : ['Ally', 'Jane', 'Belinda'],
'height' :(160,155,163),
},
columns = ['name', 'height'],
index = ['A1', "A2",'A3']
)
from pandas import DataFrame
my_df = DataFrame(data = np.random.randn(16).round(2).reshape(4,4),
index = ['r'+str(i) for i in range(1,5)],
columns = ['c'+str(i) for i in range(1,5)])
my_df.loc[['r1', 'r4'],['c3', 'c4']]
my_df.iloc[[0,3], [2,3]]
my_df.loc[['r1', 'r4'], ['c3', 'c4']]
import os
[x for x in os.listdir(os.getcwd()) if 'csv' in x]
!mkdir data
Groupby
airbnb_grouped = airbnb.groupby('room_type')
len(airbnb_grouped)
for i in ['Entire home/apt', 'Private room', 'Shared room', "Hotel room"]:
print(airbnb_grouped.get_group(i).shape)
airbnb_grouped
#.apply(lambda x: x[['host_name', 'price', 'room_type']]
#.sort_values(by = 'price', ascending = False)
#.iloc[:3,:])
airbnb_grouped.apply(lambda x: x[['host_name', 'price', 'room_type']].sort_values(by = 'price', ascending = False).iloc[:3,:])
airbnb_grouped .apply(lambda x: x[['host_name', 'price', 'room_type']] .sort_values(by = 'price', ascending = False) .iloc[:3,:])
airbnb.groupby('room_type')['price'].mean()
Pivot Table
pd.pivot_table(data = airbnb,
index = 'room_type',
values = 'price',
aggfunc = 'mean' )