# --- Environment setup and data load -----------------------------------------
# FIX: the original imported matplotlib/seaborn/pandas two or three times each;
# the duplicates are collapsed here (every module is still imported).
# FIX: "%matplotlib inline" is an IPython magic — a SyntaxError in a plain .py
# file — so it is kept only as a comment; plots still render via plt.show().
# %matplotlib inline
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns  # color palettes & statistical plots
from sklearn.preprocessing import LabelEncoder  # kept: may be used elsewhere

warnings.filterwarnings('ignore')

# NOTE(review): file_path is assembled but never used — the hard-coded Colab
# path below is what is actually read.  TODO: unify on a single path.
file_path = os.path.join('/full', 'path', 'to', 'your', 'file', 'dataset_Superstore_Data_Analytics (3)-3.csv')

df = pd.read_csv('/content/SuperStore_Data_Analytics (3)-2.csv')
# --- First look at the data --------------------------------------------------
# NOTE: bare expressions such as df.head() only display output in a notebook;
# in a script they are evaluated and discarded.
df.head()
df.shape           # (row count, column count)
df.columns         # attribute names of the table
df.dtypes          # per-column data types
df.isnull().sum()  # missing values per column

# 'Row ID' is a synthetic running index and carries no information — drop it.
df = df.drop(columns='Row ID')
df.head()

# Inspect 'Country'; it is then dropped (presumably constant — verify).
df['Country'].value_counts()
df = df.drop(columns='Country')
df.head()

# Category / sub-category overview.
df['Category'].unique()
df['Category'].value_counts()      # products per category
df['Sub-Category'].nunique()       # number of distinct sub-categories
df['Sub-Category'].value_counts()  # products per sub-category
# --- Sub-category distribution -----------------------------------------------
# BUG FIX: the original called plt.bar('Sub-Category', 'Category', data=df),
# drawing one bar per ROW with the categorical 'Category' strings as "heights",
# and sized the viridis palette to len(df['Sub-Category']) (the row count).
# The title/labels intend a count of products per sub-category, so plot
# value_counts() and size the palette to the number of sub-categories.
plt.figure(figsize=(16, 8))
sub_counts = df['Sub-Category'].value_counts()
colors = sns.color_palette('viridis', len(sub_counts))
plt.bar(sub_counts.index, sub_counts.values, color=colors)
plt.xlabel('Sub-Category')
plt.ylabel('Count')  # was 'Category', which did not describe the y-axis
plt.title('Customized Distribution of Sub-Categories',
          fontdict={'fontsize': 16, 'fontweight': 'bold', 'color': 'purple'})
plt.show()

# Donut chart (pie with wedge width < 1) of the same distribution.
plt.figure(figsize=(12, 10))
pastel = sns.color_palette("pastel")
df['Sub-Category'].value_counts().plot.pie(
    autopct="%1.1f%%", colors=pastel,
    wedgeprops=dict(width=0.4), startangle=90)
plt.title("Distribution of Sub-Categories")
plt.legend(df['Sub-Category'].value_counts().index,
           title="Sub-Categories", loc="center left", bbox_to_anchor=(1, 0.5))
plt.show()
# --- Profit & Sales per sub-category (sample data) ---------------------------
# NOTE(review): this cell REPLACES the Superstore df with a 4-row toy frame;
# later cells that expect the full dataset (e.g. 'Product Name') will fail
# unless the CSV is re-loaded — notebook-ordering artifact, kept as-is.
data = {'Sub-Category': ['A', 'B', 'C', 'D'],
        'Profit': [100, 150, 200, 50],
        'Sales': [500, 800, 1200, 300]}
df = pd.DataFrame(data)

# BUG FIX: groupby(...)['Profit', 'Sales'] (tuple column indexing) was
# deprecated in pandas 1.x and raises TypeError in pandas >= 2.0; a list
# selector [['Profit', 'Sales']] is required.
ax = (df.groupby('Sub-Category')[['Profit', 'Sales']]
        .agg(['sum'])
        .plot.bar(color=['skyblue', 'lightcoral']))
plt.title('Total Profit and Sales per Sub-Category', fontsize=16, color='darkblue')
plt.xlabel('Sub-Category', fontsize=12, color='green')
plt.ylabel('Amount', fontsize=12, color='purple')
plt.legend(['Profit (total)', 'Sales (total)'], loc='upper right')
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Annotate each bar with its total value.
for p in ax.patches:
    ax.annotate(f'{p.get_height():,.0f}',
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', xytext=(0, 10),
                textcoords='offset points', fontsize=10, color='black')
plt.show()
# --- Top products ------------------------------------------------------------
df['Sub-Category']
print(df.columns)
df['Product Name'].nunique()       # distinct product names
df['Product Name'].value_counts()  # order frequency per product

# Pie chart of the ten most frequently ordered products.
palette = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99', '#c2c2f0',
           '#ffb3e6', '#c2f0c2', '#ff6666', '#c2f0f0', '#ffccff']
top_products = df['Product Name'].value_counts().head(10)
top_products.plot.pie(autopct="%1.1f%%", colors=palette)
plt.title('Distribution of Top 10 Products')
plt.show()
# --- Sub-category counts, split by region ------------------------------------
plt.figure(figsize=(15, 8))
sns.set_palette("Set2")

# Thicker, dark bar edges for better visibility.
region_ax = sns.countplot(x="Sub-Category", hue="Region", data=df,
                          linewidth=2, edgecolor="0.2")

plt.title("Count of Sub-Category region wise", fontsize=18)
plt.xlabel("Sub-Category", fontsize=14)
plt.ylabel("Count", fontsize=14)
plt.legend(title="Region", title_fontsize='14', fontsize='12')

# Write each bar's count just above its top edge.
for bar in region_ax.patches:
    x_mid = bar.get_x() + bar.get_width() / 2.
    region_ax.annotate(f'{bar.get_height()}', (x_mid, bar.get_height()),
                       ha='center', va='center', xytext=(0, 10),
                       textcoords='offset points', fontsize=10)

plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
# --- Derived columns: Cost and Profit % --------------------------------------
df['Cost'] = df['Sales'] - df['Profit']
df['Cost'].head()
df['Profit %'] = (df['Profit'] / df['Cost']) * 100

# Profit percentage of the first 5 rows.
# BUG FIX: the original used df.iloc[[0,1,2,3,4], [14, 20]] — positional
# column indices that silently break whenever a column is added or dropped.
# Positions 14 and 20 correspond to 'Product Name' and 'Profit %' (matching
# the cell's comment), so select them by name.
df[['Product Name', 'Profit %']].head()

# Products with the highest profit percentage.
# BUG FIX: sort_values(...).groupby('Profit %').head(10) groups by a
# continuous key, so nearly every group has a single row and head(10)
# returns (almost) the entire frame; take the top 10 sorted rows instead.
df.sort_values(['Profit %', 'Product Name'], ascending=False).head(10)

df['Customer ID'].nunique()  # number of distinct customers

# Top 10 customers by number of orders placed.
df_top10 = df['Customer Name'].value_counts().head(10)
df_top10
# --- Orders per customer segment ---------------------------------------------
segment_palette = sns.color_palette("husl", 3)

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111)
sns.countplot(x='Segment', data=df, palette=segment_palette, ax=ax)

# Label each bar with its count, slightly above the bar top.
for rect in ax.patches:
    bar_height = rect.get_height()
    ax.annotate(f'{bar_height}',
                xy=(rect.get_x() + rect.get_width() / 2, bar_height),
                xytext=(0, 3),
                textcoords="offset points",
                ha='center', va='bottom')

plt.show()
# --- Top 20 customers by profit ----------------------------------------------
sortedTop20 = df.sort_values(['Profit'], ascending=False).head(20)

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111)
top20_plot = sns.barplot(x='Customer Name', y='Profit', hue='State',
                         palette='viridis', data=sortedTop20, ax=ax)

ax.set_title("Top 20 Profitable Customers - Different Style")
# Rotate customer names so they remain readable.
ax.set_xticklabels(top20_plot.get_xticklabels(), rotation=45)
ax.set_facecolor('#f0f0f0')  # light gray plot background

plt.tight_layout()
plt.show()
# --- Shipment duration --------------------------------------------------------
df['Order ID'].nunique()  # number of unique orders

# Days between order date and ship date, as an integer column.
df['Shipment Duration'] = (pd.to_datetime(df['Ship Date'])
                           - pd.to_datetime(df['Order Date'])).dt.days
df['Shipment Duration']

# BUG FIX: df.iloc[:, [0, 3, 21]] addressed columns by position, which breaks
# as soon as the layout changes.  With 'Row ID' and 'Country' dropped and
# Cost/Profit %/Shipment Duration appended, positions 0/3/21 are 'Order ID',
# 'Ship Mode' and 'Shipment Duration' — select them by name instead.
# (TODO(review): confirm 'Ship Mode' was the intended middle column.)
df[['Order ID', 'Ship Mode', 'Shipment Duration']]
# Aggregation helper: summarises one customer's orders for a groupby-apply.
def agg_customer(x):
    """Return a per-customer summary Series.

    Expects *x* to be a group (DataFrame) with the columns 'Order ID',
    'Sales', 'Profit %', 'Order Date', 'Product Name' and 'City'.
    """
    order_dates = pd.to_datetime(x['Order Date'])
    values = [
        x['Order ID'].count(),       # number of purchases
        x['Sales'].sum(),            # total revenue
        x['Profit %'].mean(),        # average profit percentage
        order_dates.min(),           # first purchase
        order_dates.max(),           # latest purchase
        x['Product Name'].unique(),  # distinct products bought
        x['City'].unique(),          # distinct cities ordered from
    ]
    labels = ['#Purchases', 'Total_Sales', 'Average Profit % gained',
              'First_Purchase_Date', 'Latest_Purchase_Date',
              'Products Purchased', 'Location_Count']
    return pd.Series(values, index=labels)
# --- Apply the per-customer aggregation & derive the order year --------------
# One summary row per customer, built by the helper defined above.
df_agg = df.groupby('Customer ID').apply(agg_customer)
df_agg

# Parse order dates once, then keep just the year for year-over-year views.
df['Order Date'] = pd.to_datetime(df['Order Date'])
df['order year'] = df['Order Date'].dt.year
df['order year'].head()
# --- Year-over-year profit % per sub-category --------------------------------
fig = plt.figure(figsize=(16, 8))
ax = fig.add_subplot(111)
sns.barplot(x='order year', y='Profit %', hue='Sub-Category',
            palette='Paired', data=df, estimator=np.mean)

# Annotate each bar with its rounded height.
for o in ax.patches:
    ax.annotate('{:.0f}'.format(o.get_height()),
                (o.get_x() + 0.15, o.get_height() + 1))
plt.show()

# Total sales and summed profit % per year.
# BUG FIX: groupby(...)['Sales', 'Profit %'] (tuple column indexing) raises
# TypeError in pandas >= 2.0; a list selector is required.
df.groupby('order year')[['Sales', 'Profit %']].agg(['sum']).plot.bar()
plt.title('Year wise Total Sales & % of profit gained')
# --- KNN regression: predict Profit from Sales / Quantity / Discount ---------
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Features and target.
feature_cols = ['Sales', 'Quantity', 'Discount']
X = df[feature_cols]
y = df['Profit']

# 75/25 train/test split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

# KNN is distance-based, so features must share a common scale.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a 5-neighbour regressor and score it on held-out data.
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
# Line plot of Profit against Discount.
sns.lineplot(data=df,x='Discount',y='Profit')
plt.title('Correlation Between Profit and Discount')
# Line plot of Sales against Discount.
sns.lineplot(data=df,x='Discount',y='Sales')
plt.title('Correlation Between Sales and Discount')
# Scatter (small markers) plus fitted regression line for Discount vs Profit.
sns.regplot(data=df, x='Discount', y='Profit', scatter_kws={'s': 10})
plt.title('Correlation Between Profit and Discount')
plt.show()
# Re-load the full Superstore export; unicode_escape decodes the non-UTF-8
# bytes present in this CSV.
df=pd.read_csv('/content/dataset_Superstore_-163487463.csv',encoding= 'unicode_escape')
df.columns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
# Assuming your DataFrame has the necessary structure with index values
# Adjust the column names as needed
# NOTE(review): this builds an EMPTY DataFrame (column labels only, zero
# rows) and OVERWRITES the CSV loaded just above.  An empty frame has no
# missing values, so the `if` below is always False and the entire modeling
# branch is dead code — this cell looks like a superseded draft of the next
# one; kept as-is.
df = pd.DataFrame(columns=['Row ID', 'Order ID', 'Order Date', 'Ship Date', 'Ship Mode',
                           'Customer ID', 'Customer Name', 'Customer_no', 'Segment', 'Segment_no',
                           'Country', 'City', 'State', 'State_no', 'Postal Code', 'Region',
                           'Region_no', 'Product ID', 'Category', 'Category_no', 'Sub-Category',
                           'Sub-Category_no', 'Product Name', 'Product Name_no',
                           'Sales', 'Quantity', 'Discount', 'Profit', 'Returned'])
# Check if any column in the DataFrame has missing values
if df.isnull().any().any():
    # Drop rows with missing values
    df = df.dropna()
    # Select features (excluding columns with object dtype) and target variable
    X = df.select_dtypes(include=[np.number])
    y = df['Profit']  # Assuming 'Profit' is the target variable
    # Split the data into training and testing sets (80% training, 20% testing)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Preprocessing steps for numeric features
    numeric_features = X_train.columns.tolist()
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
    ])
    # Define models
    linear_model = LinearRegression()
    decision_tree_model = DecisionTreeRegressor(random_state=42)
    random_forest_model = RandomForestRegressor(random_state=42)
    # Create a dictionary to store the models and their names
    models = {
        'Linear Regression': linear_model,
        'Decision Tree': decision_tree_model,
        'Random Forest': random_forest_model
    }
    results = {}
    for name, model in models.items():
        # Create a pipeline with preprocessor and the current model
        model_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ])
        # Fit the model on the training data
        model_pipeline.fit(X_train, y_train)
        # Make predictions on the test data
        y_pred = model_pipeline.predict(X_test)
        # Evaluate the model using MSE and R-squared
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        # Store the results in the dictionary
        results[name] = {'MSE': mse, 'R-squared': r2}
    # Convert the results to a DataFrame for easier visualization
    results_df = pd.DataFrame.from_dict(results, orient='index')
    print(results_df)
else:
    print("No missing values found in the DataFrame.")
# --- Regression benchmark (runs only if the CSV contains missing values) -----
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

# Load the dataset (adjust the path as needed).
df = pd.read_csv('/content/dataset_Superstore_-163487463.csv')

# Proceed only when there is data AND something to clean up.
if not df.empty and df.isnull().any().any():
    df = df.dropna()

    # Numeric columns as features; 'Profit' is the regression target.
    X = df.select_dtypes(include=[np.number])
    y = df['Profit']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    # Shared preprocessing: mean-impute then standardise every numeric column.
    numeric_features = X_train.columns.tolist()
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])
    preprocessor = ColumnTransformer(
        transformers=[('num', numeric_transformer, numeric_features)])

    # Candidate regressors, keyed by display name.
    models = {
        'Linear Regression': LinearRegression(),
        'Decision Tree': DecisionTreeRegressor(random_state=42),
        'Random Forest': RandomForestRegressor(random_state=42),
    }

    # Fit each model inside the shared pipeline and score it on the test set.
    results = {}
    for model_name, estimator in models.items():
        pipe = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', estimator)])
        pipe.fit(X_train, y_train)
        predictions = pipe.predict(X_test)
        results[model_name] = {
            'MSE': mean_squared_error(y_test, predictions),
            'R-squared': r2_score(y_test, predictions),
        }

    # One row per model for easy comparison.
    results_df = pd.DataFrame.from_dict(results, orient='index')
    print(results_df)
else:
    if df.empty:
        print("The DataFrame is empty.")
    else:
        print("No missing values found in the DataFrame.")
# --- Regression benchmark on the raw dataset (no missing-value guard) --------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset (adjust the path as needed).
df = pd.read_csv('/content/dataset_Superstore_-163487463.csv')

# Numeric columns as features; 'Profit' is the regression target.
X = df.select_dtypes(include=[np.number])
y = df['Profit']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Standardisation is the only preprocessing step in this variant.
numeric_features = X_train.columns.tolist()
preprocessor = ColumnTransformer(transformers=[
    ('num', Pipeline(steps=[('scaler', StandardScaler())]), numeric_features),
])

# Candidate regressors, keyed by display name.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
}

# Fit each model inside the shared pipeline and score it on the test set.
results = {}
for model_name, estimator in models.items():
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', estimator)])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    results[model_name] = {'MSE': mean_squared_error(y_test, preds),
                           'R-squared': r2_score(y_test, preds)}

# One row per model for easy comparison.
results_df = pd.DataFrame.from_dict(results, orient='index')
print(results_df)
# --- Logistic regression: classify "products of interest" (Sales > 500) ------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.impute import SimpleImputer

# Load the dataset and drop rows where the label source is missing.
df = pd.read_csv('/content/dataset_Superstore_-163487463.csv')
df = df.dropna(subset=['Sales'])

# Binary target: is the sale amount above 500?
df['Product_of_Interest'] = df['Sales'] > 500

# 'Sales' is excluded from the features because it defines the label.
X = df.drop(columns=['Sales', 'Product_of_Interest'])
y = df['Product_of_Interest']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Impute + scale the numeric columns; impute + one-hot the categoricals.
numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# saga solver with a generous iteration cap for the wide one-hot feature space.
clf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(random_state=42, max_iter=1000, solver='saga')),
])
clf_pipeline.fit(X_train, y_train)

# Positive-class probabilities drive the ROC analysis.
y_pred_proba = clf_pipeline.predict_proba(X_test)[:, 1]
auc_roc = roc_auc_score(y_test, y_pred_proba)
print("AUC ROC Score:", auc_roc)

fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

import matplotlib.pyplot as plt

# ROC curve with the chance diagonal for reference.
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2,
         label='ROC curve (area = %0.2f)' % auc_roc)
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()
# --- Compare classifiers (LogReg / RF / SVM) by AUC-ROC ----------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.impute import SimpleImputer

# Load the dataset and drop rows where the label source is missing.
df = pd.read_csv('/content/dataset_Superstore_-163487463.csv')
df = df.dropna(subset=['Sales'])

# Binary target: is the sale amount above 500?
df['Product_of_Interest'] = df['Sales'] > 500

# 'Sales' is excluded from the features because it defines the label.
X = df.drop(columns=['Sales', 'Product_of_Interest'])
y = df['Product_of_Interest']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Impute + scale numerics; impute + one-hot categoricals.
numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Candidate classifiers; SVC needs probability=True for predict_proba.
models = [
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('SVM', SVC(probability=True, random_state=42)),
]

# PERF FIX: the original fitted every pipeline TWICE — once to print the AUC
# and again to draw the ROC curves.  Fit each model once and reuse its
# test-set probabilities for both outputs (results are identical because the
# estimators are seeded).
results = []
roc_data = []
for name, model in models:
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ])
    model_pipeline.fit(X_train, y_train)
    y_pred_proba = model_pipeline.predict_proba(X_test)[:, 1]
    auc_roc = roc_auc_score(y_test, y_pred_proba)
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
    results.append((name, auc_roc))
    roc_data.append((name, fpr, tpr, auc_roc))

# Print the scores.
for name, auc_roc in results:
    print(f"{name} - AUC ROC Score: {auc_roc}")

# Plot every ROC curve on one figure, with the chance diagonal for reference.
plt.figure(figsize=(8, 6))
for name, fpr, tpr, auc_roc in roc_data:
    plt.plot(fpr, tpr, lw=2, label=f"{name} (AUC = {auc_roc:.2f})")
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves Comparison')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()