# Fetch the course-modules page (web scraping via requests).
import requests

con = requests.get("https://canvas.colorado.edu/courses/84839/modules")
# BUG FIX: fail loudly on HTTP errors (4xx/5xx) instead of silently
# continuing with an error page.
con.raise_for_status()
# BUG FIX: the original bare `con.content` expression did nothing outside a
# notebook — bind the raw HTML bytes so later code can actually use them.
html = con.content
# Load the Big Mart sales dataset and show the frequency of each outlet type.
import pandas as pd

df = pd.read_csv("bigmartsales.csv")
outlet_counts = df['Outlet_Type'].value_counts()
# NOTE(review): `display` is an IPython/notebook builtin — this script is
# presumably run in a notebook; confirm, or swap for print() in plain Python.
display(outlet_counts)
# Descriptive statistics for the Item_MRP column.
mrp = df['Item_MRP']  # hoist the repeated column lookup
print("For Item_MRP Column:")
print("Max is:" + str(mrp.max()))
print("Min is:" + str(mrp.min()))
print("Mean is:" + str(mrp.mean()))
print("Median is:" + str(mrp.median()))
# mode() returns a Series (there can be ties); take the first mode.
print("Mode is :" + str(mrp.mode()[0]))
print("Range is :" + str(mrp.max() - mrp.min()))
# BUG FIX: the midrange is the midpoint of the extremes, (max + min) / 2.
# The original computed (max - min) / 2, which is half the range.
print("Midrange is :" + str((mrp.max() + mrp.min()) / 2))
# Quartiles: q2 (the median) is computed for the outlier test below,
# though only q1 and q3 are printed here.
q1 = mrp.quantile(0.25)
q2 = mrp.median()
q3 = mrp.quantile(0.75)
print("Quartiles are: " + str(q1) + ", " + str(q3))
IQR = q3 - q1
print("IQR is: " + str(IQR))
var1 = mrp.var()
print("Variance :" + str(var1))
std1 = mrp.std()
print('Standard deviation is :' + str(std1))
# NOTE(review): flags values more than one standard deviation from the
# MEDIAN — a nonstandard rule. The conventional choices are the 1.5*IQR
# fences (q1 - 1.5*IQR, q3 + 1.5*IQR) or mean +/- k*std; kept as written
# to preserve behavior — confirm intent.
outliers = (mrp - q2).abs() > std1
print(mrp[outliers])
# Structural overview of the frame (dtypes, non-null counts, memory).
df.info()

import seaborn as sns

# Price vs. profit scatter, coloured by outlet type.
sns.relplot(data=df, x='Item_MRP', y='Profit', hue='Outlet_Type')
# Per-outlet-type spread of outlet sales.
sns.catplot(data=df, x='Outlet_Type', y='Item_Outlet_Sales')
# Univariate distribution of profit.
sns.displot(data=df, x='Profit')
# Joint kernel-density estimate of weight vs. visibility.
sns.jointplot(data=df, x='Item_Weight', y='Item_Visibility', kind='kde')
# Pairwise relationships across numeric columns, coloured by outlet type.
sns.pairplot(data=df, hue='Outlet_Type')

# Per-column count of missing values (display is IPython/notebook-only).
display(df.isnull().sum())
# Impute missing Item_Weight values with the column mean (kept in a new
# column so the original stays untouched).
mean_value = df['Item_Weight'].mean()
df['Item_Weight_mean'] = df['Item_Weight'].fillna(mean_value)

# Data reduction (discretization): bin weight into 3 equal-width buckets.
# BUG FIX: pd.cut assigns labels to bins in ascending order, so the first
# label goes to the LIGHTEST bin. The original order
# ['Heavy', 'Middleweight', 'Light'] labeled the lightest items 'Heavy'
# and the heaviest 'Light'.
df['IW_bucket'] = pd.cut(df['Item_Weight_mean'], 3,
                         labels=['Light', 'Middleweight', 'Heavy'])
df.head(10)
# Data transformation: standardize Profit with a z-score
# ((value - mean) / standard deviation).
profit = df['Profit']
df['Profit_Zscore'] = (profit - profit.mean()) / profit.std()
# Bare trailing expression: displays the new column when run in a notebook.
df['Profit_Zscore']