import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
# Step 1: Import the CSV file.
df = pd.read_csv('dataset_Superstore (2).csv')
# View the summary table of the raw data. A bare `df.describe()` expression is
# discarded when this file runs as a script, so the table is printed explicitly.
print(df.describe())
# Absolute z-score (distance from the column mean, measured in sample standard
# deviations) of every row in the "Profit", "Sales" and "Quantity" columns.
z_scores_profits = np.abs((df['Profit'] - df['Profit'].mean()) / df['Profit'].std())
z_scores_sales = np.abs((df['Sales'] - df['Sales'].mean()) / df['Sales'].std())
z_scores_Quantity = np.abs((df['Quantity'] - df['Quantity'].mean()) / df['Quantity'].std())
# A threshold of 1 was chosen by the author because it gave the best values.
# NOTE(review): one standard deviation is a much tighter cut-off than the
# usual 2-3 -- confirm this is intended.
threshold = 1
# Remove the outliers: a row is kept only if all three z-scores are below the
# threshold.
within_threshold = (
    (z_scores_profits < threshold)
    & (z_scores_sales < threshold)
    & (z_scores_Quantity < threshold)
)
df = df[within_threshold]
# Remove anything above 1 in the discount column. `.copy()` turns the filtered
# result into an independent frame, so columns assigned to `df` later on do not
# trigger pandas' SettingWithCopyWarning.
df = df[df['Discount'] <= 1].copy()
# Save the cleaned data (CSV text, without the index column).
# NOTE(review): the target name has no ".csv" extension; it is left unchanged
# because appending ".csv" would overwrite the input file read above.
df.to_csv('dataset_Superstore (2)', index=False)
# Summary table of the cleaned data.
print(df.describe())
# Histogram of the sales values, split into 20 bins.
sales_values = df['Sales']
plt.hist(sales_values, bins=20)
plt.title('Histogram of Sales')
plt.ylabel('Frequency')
plt.xlabel('Sales')
plt.show()
# Total sales and total profit for each sub-category, aggregated together and
# then split into one series per measure.
subcategory_totals = df.groupby('Sub-Category_no')[['Sales', 'Profit']].sum()
sales_by_subcategory = subcategory_totals['Sales']
profits_by_subcategory = subcategory_totals['Profit']

# Bar chart of total sales per sub-category.
plt.bar(x=sales_by_subcategory.index, height=sales_by_subcategory.values)
plt.title('Sales by Sub-Category')
plt.xlabel('Sub-Category_no')
plt.ylabel('Sales')
plt.show()

# Bar chart of total profit per sub-category.
plt.bar(x=profits_by_subcategory.index, height=profits_by_subcategory.values)
plt.title('Profits by Sub-Category')
plt.xlabel('Sub-Category_no')
plt.ylabel('Profit')
plt.show()
# Total profit and total sales for each region, aggregated together and then
# split into one series per measure.
region_totals = df.groupby('Region')[['Profit', 'Sales']].sum()
profits_by_region = region_totals['Profit']
sales_by_region = region_totals['Sales']

# Bar chart of total profit per region.
plt.bar(x=profits_by_region.index, height=profits_by_region.values)
plt.title('Profits by Region')
plt.xlabel('Region')
plt.ylabel('Profit')
plt.show()

# Bar chart of total sales per region.
plt.bar(x=sales_by_region.index, height=sales_by_region.values)
plt.title('Sales by Region')
plt.xlabel('Region')
plt.ylabel('Sales')
plt.show()
# Parse the order and ship dates into datetimes. `assign` returns a new frame
# instead of writing into `df` in place, so this is safe even when `df` is a
# filtered slice of the originally loaded data.
df = df.assign(**{
    'Order Date': pd.to_datetime(df['Order Date']),
    'Ship Date': pd.to_datetime(df['Ship Date']),
})
# Number of whole days between ordering and shipping. Orders shipped on the
# day they were placed have a duration of 0, which would turn the division
# below into +/-inf; those durations are replaced by NaN so the daily profit
# of such orders is left undefined rather than infinite.
shipping_days = (df['Ship Date'] - df['Order Date']).dt.days.replace(0, np.nan)
# Daily profit of each order: its profit divided by its shipping duration.
df['Daily Profit'] = df['Profit'] / shipping_days
# Scatter plot of daily profits against ship date.
plt.scatter(df['Ship Date'], df['Daily Profit'])
plt.xlabel('Ship Date')
plt.ylabel('Daily Profit')
plt.title('Daily Profits by Ship Date')
plt.show()
# Scatter plot of daily profits against order date.
plt.scatter(df['Order Date'], df['Daily Profit'])
plt.xlabel('Order Date')
plt.ylabel('Daily Profit')
plt.title('Daily Profits by Order Date')
plt.show()
# Bar chart of the total profit per ship date (one bar per distinct date).
df_profit_by_ship_date = df.groupby('Ship Date')['Profit'].sum()
df_profit_by_ship_date.plot(kind='bar', figsize=(10, 6))
plt.xlabel('Ship Date')
plt.ylabel('Profit')
plt.title('Profit by Ship Date')
plt.show()
# Fit a linear regression that predicts Profit from Sales.
X = df[['Sales']]
y = df['Profit']
model = LinearRegression().fit(X, y)

# Report the fitted slope and intercept.
print(f'Coefficient: {model.coef_[0]}')
print(f'Intercept: {model.intercept_}')

# Scatter plot of profit against sales.
plt.scatter(X['Sales'], y)
plt.title('Profits vs. Sales')
plt.xlabel('Sales')
plt.ylabel('Profit')
plt.show()
# Predict profit for every row with the fitted linear regression model.
y_pred = model.predict(X)

# Binarise predicted and actual profits: 1 where profit > 0, otherwise 0.
y_pred_binary = (y_pred > 0).astype(int)
y_true_binary = (df['Profit'] > 0).to_numpy().astype(int)

# Compare the binarised predictions with the binarised actual values.
accuracy = accuracy_score(y_true_binary, y_pred_binary)
conf_matrix = confusion_matrix(y_true_binary, y_pred_binary)

# Report the confusion matrix followed by the accuracy score.
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Accuracy Score: {accuracy}')