# --- Environment setup and data load -----------------------------------------
# FIX: the original imported matplotlib/seaborn/pandas two or three times each;
# the duplicates are collapsed here (every module is still imported).
# FIX: "%matplotlib inline" is an IPython magic — a SyntaxError in a plain .py
# file — so it is kept only as a comment; plots still render via plt.show().
# %matplotlib inline
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns  # color palettes & statistical plots
from sklearn.preprocessing import LabelEncoder  # kept: may be used elsewhere

warnings.filterwarnings('ignore')

# NOTE(review): file_path is assembled but never used — the hard-coded Colab
# path below is what is actually read.  TODO: unify on a single path.
file_path = os.path.join('/full', 'path', 'to', 'your', 'file', 'dataset_Superstore_Data_Analytics (3)-3.csv')

df = pd.read_csv('/content/SuperStore_Data_Analytics (3)-2.csv')
# --- First look at the data --------------------------------------------------
# NOTE: bare expressions such as df.head() only display output in a notebook;
# in a script they are evaluated and discarded.
df.head()
df.shape           # (row count, column count)
df.columns         # attribute names of the table
df.dtypes          # per-column data types
df.isnull().sum()  # missing values per column

# 'Row ID' is a synthetic running index and carries no information — drop it.
df = df.drop(columns='Row ID')
df.head()

# Inspect 'Country'; it is then dropped (presumably constant — verify).
df['Country'].value_counts()
df = df.drop(columns='Country')
df.head()

# Category / sub-category overview.
df['Category'].unique()
df['Category'].value_counts()      # products per category
df['Sub-Category'].nunique()       # number of distinct sub-categories
df['Sub-Category'].value_counts()  # products per sub-category
# --- Sub-category distribution -----------------------------------------------
# BUG FIX: the original called plt.bar('Sub-Category', 'Category', data=df),
# drawing one bar per ROW with the categorical 'Category' strings as "heights",
# and sized the viridis palette to len(df['Sub-Category']) (the row count).
# The title/labels intend a count of products per sub-category, so plot
# value_counts() and size the palette to the number of sub-categories.
plt.figure(figsize=(16, 8))
sub_counts = df['Sub-Category'].value_counts()
colors = sns.color_palette('viridis', len(sub_counts))
plt.bar(sub_counts.index, sub_counts.values, color=colors)
plt.xlabel('Sub-Category')
plt.ylabel('Count')  # was 'Category', which did not describe the y-axis
plt.title('Customized Distribution of Sub-Categories',
          fontdict={'fontsize': 16, 'fontweight': 'bold', 'color': 'purple'})
plt.show()

# Donut chart (pie with wedge width < 1) of the same distribution.
plt.figure(figsize=(12, 10))
pastel = sns.color_palette("pastel")
df['Sub-Category'].value_counts().plot.pie(
    autopct="%1.1f%%", colors=pastel,
    wedgeprops=dict(width=0.4), startangle=90)
plt.title("Distribution of Sub-Categories")
plt.legend(df['Sub-Category'].value_counts().index,
           title="Sub-Categories", loc="center left", bbox_to_anchor=(1, 0.5))
plt.show()
# --- Profit & Sales per sub-category (sample data) ---------------------------
# NOTE(review): this cell REPLACES the Superstore df with a 4-row toy frame;
# later cells that expect the full dataset (e.g. 'Product Name') will fail
# unless the CSV is re-loaded — notebook-ordering artifact, kept as-is.
data = {'Sub-Category': ['A', 'B', 'C', 'D'],
        'Profit': [100, 150, 200, 50],
        'Sales': [500, 800, 1200, 300]}
df = pd.DataFrame(data)

# BUG FIX: groupby(...)['Profit', 'Sales'] (tuple column indexing) was
# deprecated in pandas 1.x and raises TypeError in pandas >= 2.0; a list
# selector [['Profit', 'Sales']] is required.
ax = (df.groupby('Sub-Category')[['Profit', 'Sales']]
        .agg(['sum'])
        .plot.bar(color=['skyblue', 'lightcoral']))
plt.title('Total Profit and Sales per Sub-Category', fontsize=16, color='darkblue')
plt.xlabel('Sub-Category', fontsize=12, color='green')
plt.ylabel('Amount', fontsize=12, color='purple')
plt.legend(['Profit (total)', 'Sales (total)'], loc='upper right')
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Annotate each bar with its total value.
for p in ax.patches:
    ax.annotate(f'{p.get_height():,.0f}',
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', xytext=(0, 10),
                textcoords='offset points', fontsize=10, color='black')
plt.show()
# --- Top products ------------------------------------------------------------
df['Sub-Category']
print(df.columns)
df['Product Name'].nunique()       # distinct product names
df['Product Name'].value_counts()  # order frequency per product

# Pie chart of the ten most frequently ordered products.
palette = ['#ff9999', '#66b3ff', '#99ff99', '#ffcc99', '#c2c2f0',
           '#ffb3e6', '#c2f0c2', '#ff6666', '#c2f0f0', '#ffccff']
top_products = df['Product Name'].value_counts().head(10)
top_products.plot.pie(autopct="%1.1f%%", colors=palette)
plt.title('Distribution of Top 10 Products')
plt.show()
# --- Sub-category counts, split by region ------------------------------------
plt.figure(figsize=(15, 8))
sns.set_palette("Set2")

# Thicker, dark bar edges for better visibility.
region_ax = sns.countplot(x="Sub-Category", hue="Region", data=df,
                          linewidth=2, edgecolor="0.2")

plt.title("Count of Sub-Category region wise", fontsize=18)
plt.xlabel("Sub-Category", fontsize=14)
plt.ylabel("Count", fontsize=14)
plt.legend(title="Region", title_fontsize='14', fontsize='12')

# Write each bar's count just above its top edge.
for bar in region_ax.patches:
    x_mid = bar.get_x() + bar.get_width() / 2.
    region_ax.annotate(f'{bar.get_height()}', (x_mid, bar.get_height()),
                       ha='center', va='center', xytext=(0, 10),
                       textcoords='offset points', fontsize=10)

plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
# --- Derived columns: Cost and Profit % --------------------------------------
df['Cost'] = df['Sales'] - df['Profit']
df['Cost'].head()
df['Profit %'] = (df['Profit'] / df['Cost']) * 100

# Profit percentage of the first 5 rows.
# BUG FIX: the original used df.iloc[[0,1,2,3,4], [14, 20]] — positional
# column indices that silently break whenever a column is added or dropped.
# Positions 14 and 20 correspond to 'Product Name' and 'Profit %' (matching
# the cell's comment), so select them by name.
df[['Product Name', 'Profit %']].head()

# Products with the highest profit percentage.
# BUG FIX: sort_values(...).groupby('Profit %').head(10) groups by a
# continuous key, so nearly every group has a single row and head(10)
# returns (almost) the entire frame; take the top 10 sorted rows instead.
df.sort_values(['Profit %', 'Product Name'], ascending=False).head(10)

df['Customer ID'].nunique()  # number of distinct customers

# Top 10 customers by number of orders placed.
df_top10 = df['Customer Name'].value_counts().head(10)
df_top10
# --- Orders per customer segment ---------------------------------------------
segment_palette = sns.color_palette("husl", 3)

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111)
sns.countplot(x='Segment', data=df, palette=segment_palette, ax=ax)

# Label each bar with its count, slightly above the bar top.
for rect in ax.patches:
    bar_height = rect.get_height()
    ax.annotate(f'{bar_height}',
                xy=(rect.get_x() + rect.get_width() / 2, bar_height),
                xytext=(0, 3),
                textcoords="offset points",
                ha='center', va='bottom')

plt.show()
# --- Top 20 customers by profit ----------------------------------------------
sortedTop20 = df.sort_values(['Profit'], ascending=False).head(20)

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111)
top20_plot = sns.barplot(x='Customer Name', y='Profit', hue='State',
                         palette='viridis', data=sortedTop20, ax=ax)

ax.set_title("Top 20 Profitable Customers - Different Style")
# Rotate customer names so they remain readable.
ax.set_xticklabels(top20_plot.get_xticklabels(), rotation=45)
ax.set_facecolor('#f0f0f0')  # light gray plot background

plt.tight_layout()
plt.show()
# --- Shipment duration --------------------------------------------------------
df['Order ID'].nunique()  # number of unique orders

# Days between order date and ship date, as an integer column.
df['Shipment Duration'] = (pd.to_datetime(df['Ship Date'])
                           - pd.to_datetime(df['Order Date'])).dt.days
df['Shipment Duration']

# BUG FIX: df.iloc[:, [0, 3, 21]] addressed columns by position, which breaks
# as soon as the layout changes.  With 'Row ID' and 'Country' dropped and
# Cost/Profit %/Shipment Duration appended, positions 0/3/21 are 'Order ID',
# 'Ship Mode' and 'Shipment Duration' — select them by name instead.
# (TODO(review): confirm 'Ship Mode' was the intended middle column.)
df[['Order ID', 'Ship Mode', 'Shipment Duration']]
# Aggregation helper: summarises one customer's orders for a groupby-apply.
def agg_customer(x):
    """Return a per-customer summary Series.

    Expects *x* to be a group (DataFrame) with the columns 'Order ID',
    'Sales', 'Profit %', 'Order Date', 'Product Name' and 'City'.
    """
    order_dates = pd.to_datetime(x['Order Date'])
    values = [
        x['Order ID'].count(),       # number of purchases
        x['Sales'].sum(),            # total revenue
        x['Profit %'].mean(),        # average profit percentage
        order_dates.min(),           # first purchase
        order_dates.max(),           # latest purchase
        x['Product Name'].unique(),  # distinct products bought
        x['City'].unique(),          # distinct cities ordered from
    ]
    labels = ['#Purchases', 'Total_Sales', 'Average Profit % gained',
              'First_Purchase_Date', 'Latest_Purchase_Date',
              'Products Purchased', 'Location_Count']
    return pd.Series(values, index=labels)
# --- Apply the per-customer aggregation & derive the order year --------------
# One summary row per customer, built by the helper defined above.
df_agg = df.groupby('Customer ID').apply(agg_customer)
df_agg

# Parse order dates once, then keep just the year for year-over-year views.
df['Order Date'] = pd.to_datetime(df['Order Date'])
df['order year'] = df['Order Date'].dt.year
df['order year'].head()
# --- Year-over-year profit % per sub-category --------------------------------
fig = plt.figure(figsize=(16, 8))
ax = fig.add_subplot(111)
sns.barplot(x='order year', y='Profit %', hue='Sub-Category',
            palette='Paired', data=df, estimator=np.mean)

# Annotate each bar with its rounded height.
for o in ax.patches:
    ax.annotate('{:.0f}'.format(o.get_height()),
                (o.get_x() + 0.15, o.get_height() + 1))
plt.show()

# Total sales and summed profit % per year.
# BUG FIX: groupby(...)['Sales', 'Profit %'] (tuple column indexing) raises
# TypeError in pandas >= 2.0; a list selector is required.
df.groupby('order year')[['Sales', 'Profit %']].agg(['sum']).plot.bar()
plt.title('Year wise Total Sales & % of profit gained')
# --- KNN regression: predict Profit from Sales / Quantity / Discount ---------
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Features and target.
feature_cols = ['Sales', 'Quantity', 'Discount']
X = df[feature_cols]
y = df['Profit']

# 75/25 train/test split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

# KNN is distance-based, so features must share a common scale.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a 5-neighbour regressor and score it on held-out data.
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
# Line plot of Profit against Discount.
sns.lineplot(data=df,x='Discount',y='Profit')
plt.title('Correlation Between Profit and Discount')
# Line plot of Sales against Discount.
sns.lineplot(data=df,x='Discount',y='Sales')
plt.title('Correlation Between Sales and Discount')
# Scatter (small markers) plus fitted regression line for Discount vs Profit.
sns.regplot(data=df, x='Discount', y='Profit', scatter_kws={'s': 10})
plt.title('Correlation Between Profit and Discount')
plt.show()
# Re-load the full Superstore export; unicode_escape decodes the non-UTF-8
# bytes present in this CSV.
df=pd.read_csv('/content/dataset_Superstore_-163487463.csv',encoding= 'unicode_escape')
df.columns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
# Assuming your DataFrame has the necessary structure with index values
# Adjust the column names as needed
# NOTE(review): this builds an EMPTY DataFrame (column labels only, zero
# rows) and OVERWRITES the CSV loaded just above.  An empty frame has no
# missing values, so the `if` below is always False and the entire modeling
# branch is dead code — this cell looks like a superseded draft of the next
# one; kept as-is.
df = pd.DataFrame(columns=['Row ID', 'Order ID', 'Order Date', 'Ship Date', 'Ship Mode',
                           'Customer ID', 'Customer Name', 'Customer_no', 'Segment', 'Segment_no',
                           'Country', 'City', 'State', 'State_no', 'Postal Code', 'Region',
                           'Region_no', 'Product ID', 'Category', 'Category_no', 'Sub-Category',
                           'Sub-Category_no', 'Product Name', 'Product Name_no',
                           'Sales', 'Quantity', 'Discount', 'Profit', 'Returned'])
# Check if any column in the DataFrame has missing values
if df.isnull().any().any():
    # Drop rows with missing values
    df = df.dropna()
    # Select features (excluding columns with object dtype) and target variable
    X = df.select_dtypes(include=[np.number])
    y = df['Profit']  # Assuming 'Profit' is the target variable
    # Split the data into training and testing sets (80% training, 20% testing)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Preprocessing steps for numeric features
    numeric_features = X_train.columns.tolist()
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
    ])
    # Define models
    linear_model = LinearRegression()
    decision_tree_model = DecisionTreeRegressor(random_state=42)
    random_forest_model = RandomForestRegressor(random_state=42)
    # Create a dictionary to store the models and their names
    models = {
        'Linear Regression': linear_model,
        'Decision Tree': decision_tree_model,
        'Random Forest': random_forest_model
    }
    results = {}
    for name, model in models.items():
        # Create a pipeline with preprocessor and the current model
        model_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ])
        # Fit the model on the training data
        model_pipeline.fit(X_train, y_train)
        # Make predictions on the test data
        y_pred = model_pipeline.predict(X_test)
        # Evaluate the model using MSE and R-squared
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        # Store the results in the dictionary
        results[name] = {'MSE': mse, 'R-squared': r2}
    # Convert the results to a DataFrame for easier visualization
    results_df = pd.DataFrame.from_dict(results, orient='index')
    print(results_df)
else:
    print("No missing values found in the DataFrame.")
# --- Regression benchmark (runs only if the CSV contains missing values) -----
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

# Load the dataset (adjust the path as needed).
df = pd.read_csv('/content/dataset_Superstore_-163487463.csv')

# Proceed only when there is data AND something to clean up.
if not df.empty and df.isnull().any().any():
    df = df.dropna()

    # Numeric columns as features; 'Profit' is the regression target.
    X = df.select_dtypes(include=[np.number])
    y = df['Profit']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    # Shared preprocessing: mean-impute then standardise every numeric column.
    numeric_features = X_train.columns.tolist()
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])
    preprocessor = ColumnTransformer(
        transformers=[('num', numeric_transformer, numeric_features)])

    # Candidate regressors, keyed by display name.
    models = {
        'Linear Regression': LinearRegression(),
        'Decision Tree': DecisionTreeRegressor(random_state=42),
        'Random Forest': RandomForestRegressor(random_state=42),
    }

    # Fit each model inside the shared pipeline and score it on the test set.
    results = {}
    for model_name, estimator in models.items():
        pipe = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', estimator)])
        pipe.fit(X_train, y_train)
        predictions = pipe.predict(X_test)
        results[model_name] = {
            'MSE': mean_squared_error(y_test, predictions),
            'R-squared': r2_score(y_test, predictions),
        }

    # One row per model for easy comparison.
    results_df = pd.DataFrame.from_dict(results, orient='index')
    print(results_df)
else:
    if df.empty:
        print("The DataFrame is empty.")
    else:
        print("No missing values found in the DataFrame.")
# --- Regression benchmark on the raw dataset (no missing-value guard) --------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset (adjust the path as needed).
df = pd.read_csv('/content/dataset_Superstore_-163487463.csv')

# Numeric columns as features; 'Profit' is the regression target.
X = df.select_dtypes(include=[np.number])
y = df['Profit']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Standardisation is the only preprocessing step in this variant.
numeric_features = X_train.columns.tolist()
preprocessor = ColumnTransformer(transformers=[
    ('num', Pipeline(steps=[('scaler', StandardScaler())]), numeric_features),
])

# Candidate regressors, keyed by display name.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
}

# Fit each model inside the shared pipeline and score it on the test set.
results = {}
for model_name, estimator in models.items():
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', estimator)])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    results[model_name] = {'MSE': mean_squared_error(y_test, preds),
                           'R-squared': r2_score(y_test, preds)}

# One row per model for easy comparison.
results_df = pd.DataFrame.from_dict(results, orient='index')
print(results_df)
# --- Logistic regression: classify "products of interest" (Sales > 500) ------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.impute import SimpleImputer

# Load the dataset and drop rows where the label source is missing.
df = pd.read_csv('/content/dataset_Superstore_-163487463.csv')
df = df.dropna(subset=['Sales'])

# Binary target: is the sale amount above 500?
df['Product_of_Interest'] = df['Sales'] > 500

# 'Sales' is excluded from the features because it defines the label.
X = df.drop(columns=['Sales', 'Product_of_Interest'])
y = df['Product_of_Interest']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Impute + scale the numeric columns; impute + one-hot the categoricals.
numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# saga solver with a generous iteration cap for the wide one-hot feature space.
clf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(random_state=42, max_iter=1000, solver='saga')),
])
clf_pipeline.fit(X_train, y_train)

# Positive-class probabilities drive the ROC analysis.
y_pred_proba = clf_pipeline.predict_proba(X_test)[:, 1]
auc_roc = roc_auc_score(y_test, y_pred_proba)
print("AUC ROC Score:", auc_roc)

fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

import matplotlib.pyplot as plt

# ROC curve with the chance diagonal for reference.
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2,
         label='ROC curve (area = %0.2f)' % auc_roc)
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()
# --- Compare classifiers (LogReg / RF / SVM) by AUC-ROC ----------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.impute import SimpleImputer

# Load the dataset and drop rows where the label source is missing.
df = pd.read_csv('/content/dataset_Superstore_-163487463.csv')
df = df.dropna(subset=['Sales'])

# Binary target: is the sale amount above 500?
df['Product_of_Interest'] = df['Sales'] > 500

# 'Sales' is excluded from the features because it defines the label.
X = df.drop(columns=['Sales', 'Product_of_Interest'])
y = df['Product_of_Interest']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Impute + scale numerics; impute + one-hot categoricals.
numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Candidate classifiers; SVC needs probability=True for predict_proba.
models = [
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('SVM', SVC(probability=True, random_state=42)),
]

# PERF FIX: the original fitted every pipeline TWICE — once to print the AUC
# and again to draw the ROC curves.  Fit each model once and reuse its
# test-set probabilities for both outputs (results are identical because the
# estimators are seeded).
results = []
roc_data = []
for name, model in models:
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ])
    model_pipeline.fit(X_train, y_train)
    y_pred_proba = model_pipeline.predict_proba(X_test)[:, 1]
    auc_roc = roc_auc_score(y_test, y_pred_proba)
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
    results.append((name, auc_roc))
    roc_data.append((name, fpr, tpr, auc_roc))

# Print the scores.
for name, auc_roc in results:
    print(f"{name} - AUC ROC Score: {auc_roc}")

# Plot every ROC curve on one figure, with the chance diagonal for reference.
plt.figure(figsize=(8, 6))
for name, fpr, tpr, auc_roc in roc_data:
    plt.plot(fpr, tpr, lw=2, label=f"{name} (AUC = {auc_roc:.2f})")
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves Comparison')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()