import numpy as np
import pandas as pd
# Structured array: three records, each with an int field 'foo' and a
# float field 'bar', all initialised to one.
record_dtype = np.dtype([('foo', int), ('bar', float)])
my_array = np.ones(3, dtype=record_dtype)
print(my_array['foo'])
# Record-array view of the same data: fields become attributes.
my_array2 = my_array.view(np.recarray)
print(my_array2.bar)
# A Series: five random values labelled 'a' through 'e'.
s = pd.Series(
    np.random.randn(5),
    index=['a', 'b', 'c', 'd', 'e'],
)
print(s)
# A DataFrame built from a dict: every key becomes a column.
d = {
    'one': [1., 2., 3., 4.],
    'two': [4., 3., 2., 1.],
}
df = pd.DataFrame(data=d)
print(df)
# A 2-D array whose first row holds the column labels and whose first
# column holds the row labels. NumPy stores the mixed literals with one
# common string dtype, so the numbers end up as text.
data = np.array([
    ['', 'Col1', 'Col2'],
    ['Row1', 1, 2],
    ['Row2', 3, 4],
])
# Split the array into its three parts before building the frame.
cell_values = data[1:, 1:]
row_labels = data[1:, 0]
column_labels = data[0, 1:]
df = pd.DataFrame(data=cell_values, index=row_labels, columns=column_labels)
# The individual pieces, for inspection.
cell_values
row_labels
column_labels
# Dict of equal-length lists -> DataFrame with one column per key.
my_dict = {
    'First': ['1', '3'],
    'Second': ['1', '2'],
    'Third': ['2', '4'],
}
df = pd.DataFrame(data=my_dict)
print(df)
# Load the semicolon-separated bank data set.
bankData = pd.read_csv(
    "bank-data.csv",
    sep=";",
)
# Report its dimensions and per-column summary.
print(bankData.shape)
bankData.info()
# Write the frame back out with to_csv's default settings.
bankData.to_csv('test.csv')
# A 2x3 frame built from a plain integer matrix.
matrix = np.array([[1, 2, 3], [4, 5, 6]])
df = pd.DataFrame(matrix)
df
# Dimensions as a (rows, columns) tuple ...
print(df.shape)
# ... or just the row count, taken from the length of the index.
print(len(df.index))
df.index
# --- Quick look at the bank data -----------------------------------------
bankData.head()
bankData.tail()
bankData.columns
bankData.describe()
# Rows at the end of the frame once it is ordered by age.
bankData.sort_values(by='age').tail()

# --- Selecting columns ----------------------------------------------------
# One column, by key and by attribute.
bankData['job'].head()
bankData.job
# Several columns at once: pass a list of names.
bankData[['job', 'age']].head()

# --- Selecting rows -------------------------------------------------------
# Label-based: .loc slices include both end labels.
bankData.loc[1:5, ['job', 'age', 'education']]
# Position-based: first five rows, first three columns.
bankData.iloc[:5, :3]
# Plain slicing on the frame selects rows.
bankData[0:5]

# --- Boolean masks --------------------------------------------------------
bankData.marital.isin(['single'])
bankData[bankData.marital.isin(['single', 'married'])].head()
# Keep only the clients older than 30.
older_than_30 = bankData.age > 30
data = bankData[older_than_30]
data
# Narrow further to single/married clients aged exactly 30.
# NOTE(review): every remaining row already has age > 30, so requiring
# age == 30 can never match and this result is always empty -- confirm
# which of the two age conditions was intended.
status_matches = data.marital.isin(['single', 'married'])
data = data[status_matches & (data.age == 30)]
data
# Build a vector of ones with one entry per row of the bank data.
oneColumn = np.ones(len(bankData))
print(oneColumn)

# Attach it as a new column called 'one' ...
bankData['one'] = oneColumn
bankData.head()

# ... and remove that column again.
del bankData['one']
bankData.head()
# Load the flight data; the first CSV column becomes the row index.
flightData = pd.read_csv('flights.csv', index_col=0)
flightData.head()
flightData.info()

# Rows whose departure delay is missing, and the index labels of those rows.
flightData[flightData.dep_delay.isnull()]
index_nan = flightData.dep_delay.index[flightData.dep_delay.isnull()]
print(index_nan)

# Option 1: drop every row that contains at least one missing value.
flightData.shape
flightData_dropNA = flightData.dropna(how='any')
flightData_dropNA.shape
flightData_dropNA.head()

# Option 2: fill the missing departure delays with the mean delay.
x = np.mean(flightData.dep_delay)
print("%1.1f" % x)
flightData_fillNA = flightData.fillna(value={'dep_delay': x})
# The rows that used to be missing now carry the mean.
flightData_fillNA.loc[index_nan].head()
flightData_fillNA
# Column-wise summary statistics of the bank data.
# numeric_only=True limits each reduction to the numeric columns. The frame
# is used elsewhere with text-valued columns (e.g. marital, job), and since
# pandas 2.0 these reductions raise TypeError on such columns instead of
# silently skipping them.
bankData.mean(numeric_only=True)
bankData.std(numeric_only=True)
bankData.median(numeric_only=True)
# A 3x3 frame holding 1..9 row by row.
df = pd.DataFrame(np.arange(1, 10).reshape(3, 3))
# Apply a function down each column (axis=0): here a running sum.
df.apply(np.cumsum, axis=0)
# Rescale every numeric column of the bank data so that its maximum is 1.
# The numeric sub-frame is selected with the public select_dtypes API
# (previously the private DataFrame._get_numeric_data) and kept in an
# ordinary variable: assigning a frame to a new attribute such as
# `bankData.num` makes pandas emit a UserWarning and is easily mistaken
# for a column.
bank_numeric = bankData.select_dtypes(include='number')
bank_numeric.head()
bank_numeric.columns
# Dividing the frame by its per-column maxima scales column by column.
bankData[bank_numeric.columns] = bank_numeric / bank_numeric.max()
bankData.head()
# Put two columns side by side (axis=1) in a new frame.
pd.concat([bankData['age'], bankData['job']], axis=1).head()
# Two small frames that share the column 'key'.
left = pd.DataFrame({'key': ['A', 'B'], 'lval': [1, 2]})
print(left)
right = pd.DataFrame({'key': ['A', 'B'], 'rval': [4, 5]})
print(right)
# Join them on the shared column.
join_df = pd.merge(left, right, on='key')
join_df
# Group means of the bank data, first per job, then per
# (marital, education) combination.
# numeric_only=True averages only the numeric columns: since pandas 2.0,
# GroupBy.mean raises TypeError when a non-key column holds text instead
# of silently dropping it.
bankData.groupby('job').mean(numeric_only=True)
bankData.groupby(['marital', 'education']).mean(numeric_only=True)
# Wide -> long: melt the third and fourth columns of the cases table into
# 'Year' / 'n' pairs, keeping 'country' as the identifier.
cases = pd.read_csv('cases.csv')
cases
cases.melt(
    id_vars='country',
    value_vars=cases.columns[2:4],
    var_name='Year',
    value_name='n',
)

# Long -> wide: one row per city, one column per size, filled with the
# 'amount' values.
pollution = pd.read_csv('pollution.csv')
pollution
pollution.pivot(index='city', columns='size', values='amount')
# A toy frame with a categorical 'key' column and a running counter.
df = pd.DataFrame({
    'key': list('bbacab'),
    'data1': range(6),
})
df
# One indicator column per distinct value of 'key'.
pd.get_dummies(df['key'])
# The raw education column ...
bankData['education']
# ... and a preview of the bank data with that column replaced by
# indicator columns.
pd.get_dummies(bankData, columns=['education']).head()