import numpy as np
import pandas as pd
# Structured array: three records, each carrying an integer field 'foo'
# and a float field 'bar'; np.ones fills every field with one.
my_array = np.ones(
    3,
    dtype=[('foo', int), ('bar', float)],
)
# Indexing by field name returns that field for all records.
print(my_array['foo'])
[1 1 1]
# Record array: the structured array viewed as np.recarray, which exposes
# each field as an attribute in addition to key-style access.
my_array2 = my_array.view(type=np.recarray)
print(my_array2.bar)
[1. 1. 1.]
# Series of five standard-normal draws labelled 'a' through 'e'.
s = pd.Series(
    np.random.randn(5),
    index=list('abcde'),
)
print(s)
a -0.292054
b 1.295976
c -0.264489
d 0.421115
e 0.359720
dtype: float64
# DataFrame from a dict: each key becomes a column label and each list
# supplies that column's values.
d = {
    'one': [1., 2., 3., 4.],
    'two': [4., 3., 2., 1.],
}
df = pd.DataFrame(data=d)
print(df)
one two
0 1.0 4.0
1 2.0 3.0
2 3.0 2.0
3 4.0 1.0
# 2-D array whose first row holds the column labels and whose first column
# holds the row labels. NumPy stores the mixed str/int literals under a
# single string dtype, so the numbers reach the frame as strings.
data = np.array([
    ['', 'Col1', 'Col2'],
    ['Row1', 1, 2],
    ['Row2', 3, 4],
])
df = pd.DataFrame(
    data=data[1:, 1:],     # cell values
    index=data[1:, 0],     # row labels
    columns=data[0, 1:],   # column labels
)
# The three slices used above, evaluated on their own for inspection.
data[1:, 1:]
data[1:, 0]
data[0, 1:]
# DataFrame from a dict of string lists; the keys become the column labels.
my_dict = {
    'First': ['1', '3'],
    'Second': ['1', '2'],
    'Third': ['2', '4'],
}
df = pd.DataFrame(data=my_dict)
print(df)
First Second Third
0 1 1 2
1 3 2 4
# Load the semicolon-separated bank data set, then report its dimensions
# and a per-column summary (dtype and non-null count).
bankData = pd.read_csv(
    "bank-data.csv",
    sep=";",
)
print(bankData.shape)
bankData.info()
(4521, 17)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 age 4521 non-null int64
1 job 4521 non-null object
2 marital 4521 non-null object
3 education 4521 non-null object
4 default 4521 non-null object
5 balance 4521 non-null int64
6 housing 4521 non-null object
7 loan 4521 non-null object
8 contact 4521 non-null object
9 day 4521 non-null int64
10 month 4521 non-null object
11 duration 4521 non-null int64
12 campaign 4521 non-null int64
13 pdays 4521 non-null int64
14 previous 4521 non-null int64
15 poutcome 4521 non-null object
16 y 4521 non-null object
dtypes: int64(7), object(10)
memory usage: 600.6+ KB
# Write the frame back out as CSV. With to_csv's defaults the row index is
# written as the first, unnamed column.
bankData.to_csv(path_or_buf='test.csv')
# Small 2x3 frame used to demonstrate the size queries.
df = pd.DataFrame(np.array([
    [1, 2, 3],
    [4, 5, 6],
]))
df
# shape is a (rows, columns) tuple.
print(df.shape)
(2, 3)
# Alternatively, the number of rows is the length of the frame's index.
print(
    len(df.index)
)
2
# Row labels of the small demo frame.
df.index

# First and last five rows of the bank data.
bankData.head()
bankData.tail()

# Column labels and summary statistics.
bankData.columns
bankData.describe()

# Rows with the largest ages: sort ascending by age, keep the tail.
bankData.sort_values(by='age').tail()

# One column, via bracket access and via attribute access.
bankData['job'].head()
bankData.job

# Several columns at once: pass a list of labels.
bankData[['job', 'age']].head()

# Label-based selection; a .loc slice includes both endpoints.
bankData.loc[1:5, ['job', 'age', 'education']]

# Position-based selection: first five rows, first three columns.
bankData.iloc[:5, :3]

# A bare slice selects rows.
bankData[0:5]

# Boolean mask from isin, then the mask used as a row filter.
bankData.marital.isin(['single'])
bankData[bankData.marital.isin(['single', 'married'])].head()
# Keep only the rows whose age is strictly greater than 30.
data = bankData.loc[bankData.age > 30]
data
# Combine two conditions with & (each comparison needs its own parentheses).
# NOTE(review): `data` already holds only age > 30, so requiring age == 30
# here can never match and the result is always empty -- confirm whether
# the first filter was meant to be >= 30 or this one a different value.
data = data.loc[
    (data.age == 30) & data.marital.isin(['single', 'married'])
]
data
# Add a new column: an array of ones with one entry per row of bankData,
# assigned under the label 'one'.
oneColumn = np.ones(bankData.shape[0])
print(oneColumn)
bankData['one'] = oneColumn
bankData.head()
[1. 1. 1. ... 1. 1. 1.]
# Remove the 'one' column again, in place, and check the result.
del bankData['one']
bankData.head()
# Load the flight data, using the first CSV column as the row index.
# NOTE(review): index_col=0 presumably replaces the manual removal of an
# 'Unnamed: 0' column that was left here as commented-out code -- confirm.
flightData = pd.read_csv(
    'flights.csv',
    index_col=0,
)
flightData.head()
flightData.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 327346 entries, 0 to 327345
Data columns (total 19 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 year 327346 non-null int64
1 month 327346 non-null int64
2 day 327346 non-null int64
3 dep_time 327346 non-null int64
4 sched_dep_time 327346 non-null int64
5 dep_delay 321132 non-null float64
6 arr_time 327346 non-null int64
7 sched_arr_time 327346 non-null int64
8 arr_delay 327346 non-null int64
9 carrier 327346 non-null object
10 flight 327346 non-null int64
11 tailnum 327346 non-null object
12 origin 327346 non-null object
13 dest 327346 non-null object
14 air_time 327346 non-null int64
15 distance 327346 non-null int64
16 hour 327346 non-null int64
17 minute 327346 non-null int64
18 time_hour 327346 non-null object
dtypes: float64(1), int64(13), object(5)
memory usage: 49.9+ MB
# Rows in which dep_delay is missing.
flightData[flightData.dep_delay.isnull()]
# Index labels of those rows, kept for later look-ups.
index_nan = flightData.index[flightData.dep_delay.isnull()]
print(index_nan)
Int64Index([ 0, 2, 69, 73, 98, 185, 200, 236,
245, 325,
...
326581, 326651, 326660, 326741, 326840, 326863, 327005, 327020,
327065, 327102],
dtype='int64', length=6214)
# Size before and after dropping every row that has at least one missing
# value.
flightData.shape
flightData.dropna(how='any').shape
flightData.dropna(how='any').head()
# Keep the cleaned frame under its own name.
flightData_dropNA = flightData.dropna(how='any')
flightData_dropNA.head()
# Mean departure delay, printed with one decimal place.
x = flightData.dep_delay.mean()
print(f"{x:1.1f}")
12.8
# Fill missing dep_delay values with x and inspect the first few rows
# selected by index_nan.
flightData.fillna(value={'dep_delay': x}).loc[index_nan].head()
# Keep the filled frame under its own name.
flightData_fillNA = flightData.fillna(
    value={'dep_delay': x},
)
flightData_fillNA
# Column-wise mean, standard deviation and median.
# bankData also holds text (object) columns; numeric_only=True restricts
# each reduction to the numeric columns. Without it pandas >= 2.0 raises a
# TypeError, and pandas 1.x drops the text columns with a FutureWarning.
bankData.mean(numeric_only=True)
bankData.std(numeric_only=True)
bankData.median(numeric_only=True)
# 3x3 demo frame.
df = pd.DataFrame(np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
]))
# apply with axis=0 hands each column to the function; np.cumsum returns
# the running total down that column.
df.apply(np.cumsum, axis=0)
# Numeric columns of bankData, kept in their own variable.
# Assigning them to an attribute of the frame (`bankData.num = ...`) makes
# pandas emit "UserWarning: Pandas doesn't allow columns to be created via
# a new attribute name", and the private `_get_numeric_data()` is replaced
# by the public select_dtypes().
bankData_num = bankData.select_dtypes(include='number')
bankData_num.head()
bankData_num.columns
# Scale every numeric column by its own maximum and write the result back
# into the same columns of bankData.
bankData[bankData_num.columns] = bankData_num / bankData_num.max()
bankData.head()
# Place two columns side by side (axis=1) and show the first rows.
pd.concat(
    [bankData['age'], bankData['job']],
    axis=1,
).head()
# Two small frames that share the column 'key'.
left = pd.DataFrame({
    'key': ['A', 'B'],
    'lval': [1, 2],
})
right = pd.DataFrame({
    'key': ['A', 'B'],
    'rval': [4, 5],
})
print(left)
print(right)
key lval
0 A 1
1 B 2
key rval
0 A 4
1 B 5
# Join the two frames on their shared 'key' column.
join_df = left.merge(right, on='key')
join_df
# Per-group means, first by job, then by every (marital, education) pair.
# bankData contains text (object) columns; numeric_only=True averages only
# the numeric ones. Without it pandas >= 2.0 raises a TypeError, and
# pandas 1.x drops the text columns (with a FutureWarning on 1.5).
bankData.groupby('job').mean(numeric_only=True)
bankData.groupby(['marital', 'education']).mean(numeric_only=True)
# Load the cases table and display it.
cases = pd.read_csv('cases.csv')
cases
# Wide -> long: keep 'country' as the identifier and unpivot the columns at
# positions 2 and 3 into a 'Year' label column and an 'n' value column.
pd.melt(
    cases,
    id_vars='country',
    value_vars=cases.columns[2:4],
    var_name='Year',
    value_name='n',
)
# Load the pollution table and display it.
pollution = pd.read_csv('pollution.csv')
pollution
# Long -> wide: one row per 'city', one column per distinct 'size', with
# the cells taken from 'amount'.
pollution.pivot(
    index='city',
    columns='size',
    values='amount',
)
# Demo frame with a categorical 'key' column and a counter column.
df = pd.DataFrame({
    'key': list('bbacab'),
    'data1': range(6),
})
df
# One indicator column per distinct value of 'key'.
pd.get_dummies(df['key'])
# The categorical column to be encoded.
bankData.education
# Replace 'education' with one indicator column per category, leaving the
# remaining columns of the frame as they are.
pd.get_dummies(
    bankData,
    columns=['education'],
).head()