Pandas

import pandas as pd pd.__version__

# Download a sample file from http://insideairbnb.com/ ! wget http://data.insideairbnb.com/united-states/fl/broward-county/2022-06-17/visualisations/listings.csv -O listings.csv

# read the airbnb NYC listings csv file airbnb = pd.read_csv('listings.csv')

# View last few entries airbnb.tail(10)

# Results fpr a single column airbnb['name']

# View first few entries airbnb.head(5)

# Change the type of a column to datetime airbnb['last_review'] = pd.to_datetime(airbnb['last_review']) airbnb.dtypes

# extract the year from a datetime series airbnb['year'] = airbnb['last_review'].dt.year airbnb['year'].head()

Series String Function

# Strip leading and trailing spaces from a string series airbnb['name'] = airbnb['name'].str.strip() airbnb['name'].tail()

# uppercase all strings in a series airbnb['name_upper'] = airbnb['name'].str.upper() airbnb['name_upper'].tail()

# lowercase all strings in a series airbnb['name_lower'] = airbnb['name'].str.lower() airbnb['name_lower'].tail()

Derived Columns

# calculate using two columns airbnb['min_revenue'] = airbnb['minimum_nights'] * airbnb['price'] airbnb[['minimum_nights', 'price', 'min_revenue']].head()

Summary statistics

airbnb['price'].var()

airbnb['price'].std()

airbnb['price'].mean()

airbnb['price'].median()

Grouped Statisitcs

# get the mean grouped by type of room airbnb[['room_type', 'price']].groupby('room_type', as_index=False).mean()

# get the median group by type of room airbnb[['room_type', 'price']].groupby('room_type', as_index=False).median()

Filtering Data

# get all rows with price < 1000 airbnb_under_1000 = airbnb[airbnb['price'] < 1000] airbnb_under_1000.head()

# get all rows with price <1000 and year equal to 2020 airbnb_2019_under_1000 = airbnb[(airbnb['price'] < 1000) & (airbnb['year'] == 2020)] airbnb_2019_under_1000.head()

Plotting

# distrubtion of pices under $1000 ax = airbnb_under_1000['price'].plot.hist(bins=40)

Pandas dataframe

a collection of series

d = [[1,2],[3,4]] df = pd.DataFrame(d,index=['r1','r2'],columns=['a' , 'b']) df

import numpy as np d = np.arange(24).reshape(6,4) d

pd.DataFrame( { 'name' : ['Ally', 'Jane', 'Belinda'], 'height' :(160,155,163), }, columns = ['name', 'height'], index = ['A1', "A2",'A3'] )

from pandas import DataFrame my_df = DataFrame(data = np.random.randn(16).round(2).reshape(4,4), index = ['r'+str(i) for i in range(1,5)], columns = ['c'+str(i) for i in range(1,5)])

my_df.loc[['r1', 'r4'],['c3', 'c4']]

my_df.iloc[[0,3], [2,3]]

my_df.loc[['r1', 'r4'], ['c3', 'c4']]

import os [x for x in os.listdir(os.getcwd()) if 'csv' in x]

!mkdir data

Groupby

airbnb_grouped = airbnb.groupby('room_type') len(airbnb_grouped)

for i in ['Entire home/apt', 'Private room', 'Shared room', "Hotel room"]: print(airbnb_grouped.get_group(i).shape)

airbnb_grouped #.apply(lambda x: x[['host_name', 'price', 'room_type']] #.sort_values(by = 'price', ascending = False) #.iloc[:3,:]) airbnb_grouped.apply(lambda x: x[['host_name', 'price', 'room_type']].sort_values(by = 'price', ascending = False).iloc[:3,:])

airbnb_grouped .apply(lambda x: x[['host_name', 'price', 'room_type']] .sort_values(by = 'price', ascending = False) .iloc[:3,:])

airbnb.groupby('room_type')['price'].mean()

Pivot Table

pd.pivot_table(data = airbnb, index = 'room_type', values = 'price', aggfunc = 'mean' )

.css-15w88e5{color:var(--chakra-colors-fg-neutral-primary);font-weight:inherit;letter-spacing:-0.09px;}Pandas