# Assignment 2

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Step 1: Import the CSV file into a DataFrame.
df = pd.read_csv('dataset_Superstore (2).csv')

# View the summary table.
# NOTE(review): as a bare expression the result is only displayed by an
# interactive/notebook session; wrap it in print() if this runs as a script.
df.describe()

# Calculate the absolute z-score of every row for the "Profit", "Sales" and
# "Quantity" columns (distance from the column mean in standard deviations).
z_scores_profits = np.abs((df['Profit'] - df['Profit'].mean()) / df['Profit'].std())
z_scores_sales = np.abs((df['Sales'] - df['Sales'].mean()) / df['Sales'].std())
z_scores_Quantity = np.abs((df['Quantity'] - df['Quantity'].mean()) / df['Quantity'].std())

# I chose a threshold of 1, as it gives me the best values.
threshold = 1

# Remove the outliers: keep only the rows whose z-score is below the threshold
# in all three columns. Rows with a missing value in any of them are dropped
# too, because a comparison against NaN is False.
df = df[(z_scores_profits < threshold) & (z_scores_sales < threshold) & (z_scores_Quantity < threshold)]

# Remove anything above 1 in the discount column.
# .copy() makes the filtered result an independent DataFrame, so adding or
# overwriting columns later does not raise SettingWithCopyWarning.
df = df[df['Discount'] <= 1].copy()

# Save the cleaned data to a new file in CSV format.
# NOTE(review): the target name has no ".csv" extension; left unchanged because
# appending it would overwrite the input file 'dataset_Superstore (2).csv'.
df.to_csv('dataset_Superstore (2)', index=False)

# Summary table of the cleaned data (displayed by an interactive/notebook session).
df.describe()

def _show_bar_chart(totals, xlabel, ylabel, title):
    """Draw and show a bar chart with the Series index on x and its values on y."""
    plt.bar(totals.index, totals.values)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()


def _show_scatter(x_values, y_values, xlabel, ylabel, title):
    """Draw and show a scatter plot of y_values against x_values."""
    plt.scatter(x_values, y_values)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()


# Plotting a histogram of sales
plt.hist(df['Sales'], bins=20)
plt.xlabel('Sales')
plt.ylabel('Frequency')
plt.title('Histogram of Sales')
plt.show()

# Calculate the total sales for each sub-category
sales_by_subcategory = df.groupby('Sub-Category_no')['Sales'].sum()

# Calculate the total profits for each sub-category
profits_by_subcategory = df.groupby('Sub-Category_no')['Profit'].sum()

# Plotting a bar chart of sales against sub-category
_show_bar_chart(sales_by_subcategory, 'Sub-Category_no', 'Sales', 'Sales by Sub-Category')

# Plotting a bar chart of profits against sub-category
_show_bar_chart(profits_by_subcategory, 'Sub-Category_no', 'Profit', 'Profits by Sub-Category')

# Calculate the total profits and sales for each region
profits_by_region = df.groupby('Region')['Profit'].sum()
sales_by_region = df.groupby('Region')['Sales'].sum()

# Plotting a bar chart of profits against region
_show_bar_chart(profits_by_region, 'Region', 'Profit', 'Profits by Region')

# Plotting a bar chart of sales against region
_show_bar_chart(sales_by_region, 'Region', 'Sales', 'Sales by Region')

# Parse the date columns so that they can be subtracted from each other
df['Order Date'] = pd.to_datetime(df['Order Date'])
df['Ship Date'] = pd.to_datetime(df['Ship Date'])

# Calculating the daily profits for each order: profit divided by the number
# of days between ordering and shipping. An order shipped on the day it was
# placed has a duration of 0 days; it is counted as one day so the division
# does not produce inf/NaN values.
shipping_days = (df['Ship Date'] - df['Order Date']).dt.days.replace(0, 1)
df['Daily Profit'] = df['Profit'] / shipping_days

# Plotting a scatter plot of daily profits against ship date
_show_scatter(df['Ship Date'], df['Daily Profit'], 'Ship Date', 'Daily Profit', 'Daily Profits by Ship Date')

# Plotting a scatter plot of daily profits against order date
_show_scatter(df['Order Date'], df['Daily Profit'], 'Order Date', 'Daily Profit', 'Daily Profits by Order Date')

# Plotting a bar chart of total profits by ship date (one bar per distinct date)
df_profit_by_ship_date = df.groupby('Ship Date')['Profit'].sum()
df_profit_by_ship_date.plot(kind='bar', figsize=(10, 6))
plt.xlabel('Ship Date')
plt.ylabel('Profit')
plt.title('Profit by Ship Date')
plt.show()

# Linear regression model of profits (target) on sales (single feature)
model = LinearRegression()
X = df[['Sales']]  # double brackets keep X two-dimensional, as fit() expects
y = df['Profit']
model.fit(X, y)

# Print the coefficients of the linear regression model
print(f'Coefficient: {model.coef_[0]}')
print(f'Intercept: {model.intercept_}')

# Plotting a scatter plot of sales against profits
plt.scatter(df['Sales'], df['Profit'])
plt.xlabel('Sales')
plt.ylabel('Profit')
plt.title('Profits vs. Sales')
plt.show()

# Make predictions using the linear regression model.
# NOTE(review): the model is scored on the same rows it was fitted on, so the
# metrics below describe the fit, not performance on unseen data.
y_pred = model.predict(X)

# Convert the predicted values to binary values (1 if profit > 0, 0 otherwise)
y_pred_binary = np.where(y_pred > 0, 1, 0)

# Convert the actual values to binary values (1 if profit > 0, 0 otherwise)
y_true_binary = np.where(df['Profit'] > 0, 1, 0)

# Calculate the confusion matrix and accuracy score. labels=[0, 1] keeps the
# matrix 2x2 (rows: actual, columns: predicted) even when one of the classes
# does not occur in the data.
conf_matrix = confusion_matrix(y_true_binary, y_pred_binary, labels=[0, 1])
accuracy = accuracy_score(y_true_binary, y_pred_binary)

# Print the confusion matrix and accuracy score
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Accuracy Score: {accuracy}')