# Import pandas
import pandas as pd
data = pd.read_csv("year_genre_region.csv")
data
data.shape  # shape of the DataFrame: number of rows and columns
data.index  # inspect the index (start, stop, step)
data.info()  # column names, dtypes, and non-null counts
data['region'].value_counts()
data['genre'].value_counts()
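# Optional sanity checks (a sketch, using the DataFrame loaded above):
# confirm there are no missing values and look at basic statistics.
data.isna().sum()
data.describe()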
# Re-load the raw CSV into a fresh DataFrame for one-hot encoding
df = pd.read_csv("year_genre_region.csv")
df
# One-hot encode the categorical "region" column
df_new = pd.get_dummies(df, columns=["region"], prefix="region")
df_new
# One-hot encode the categorical "genre" column
df_new = pd.get_dummies(df_new, columns=["genre"], prefix="genre")
df_new
# Save the encoded DataFrame for the modeling steps below
df_new.to_csv("filename_modified.csv", index=False)
df_new.info()
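# Sketch: list the dummy columns created by get_dummies, assuming the
# prefixes "region_" and "genre_" used above.
[col for col in df_new.columns if col.startswith(("region_", "genre_"))]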
from sklearn.model_selection import train_test_split
import pandas as pd
# Load the one-hot encoded data from the previous step
data = pd.read_csv('filename_modified.csv')
# Separate the features (X) and labels (y)
X = data.drop('sales', axis=1)  # 'sales' is the target column; everything else is a feature
y = data['sales']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=38)
X
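# Sketch: verify the 80/20 split by checking the shapes of the resulting sets.
print("Train:", X_train.shape, y_train.shape)
print("Test: ", X_test.shape, y_test.shape)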
import numpy as np
# Baseline: predict the mean of the training target for every test instance
baseline_prediction = np.mean(y_train)
# Generate predictions using the baseline value for all instances in the testing set
baseline_predictions = np.full(len(y_test), baseline_prediction, dtype=float)  # one float entry per test instance
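# The baseline is not scored yet; a sketch of evaluating it with the same
# metrics used for the models below, so they have a reference point to beat.
# The baseline_* variable names are new helpers.
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
baseline_mse = mean_squared_error(y_test, baseline_predictions)
baseline_mae = mean_absolute_error(y_test, baseline_predictions)
baseline_r2 = r2_score(y_test, baseline_predictions)
print("Baseline (mean of y_train):")
print("  MSE:", baseline_mse)
print("  MAE:", baseline_mae)
print("  R-squared:", baseline_r2)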
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
# Train several regression models on the training data
# Linear Regression
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
# Decision Tree Regressor
decision_tree = DecisionTreeRegressor()
decision_tree.fit(X_train, y_train)
# Random Forest Regressor
random_forest = RandomForestRegressor()
random_forest.fit(X_train, y_train)
# Gradient Boosting Regressor
gradient_boosting = GradientBoostingRegressor()
gradient_boosting.fit(X_train, y_train)
# Support Vector Regression (SVR)
svr = SVR()
svr.fit(X_train, y_train)
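# SVR is sensitive to feature scale, and the features here mix a year column
# with 0/1 dummies. A sketch of a scaled variant using a Pipeline; the name
# `svr_scaled` is a new helper and is not used in the comparison below.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
svr_scaled = make_pipeline(StandardScaler(), SVR())
svr_scaled.fit(X_train, y_train)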
# Make predictions on the test set with each trained model
linear_reg_predictions = linear_reg.predict(X_test)
decision_tree_predictions = decision_tree.predict(X_test)
random_forest_predictions = random_forest.predict(X_test)
gradient_boosting_predictions = gradient_boosting.predict(X_test)
svr_predictions = svr.predict(X_test)
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Compute MSE, MAE, and R-squared for each model on the test set
linear_reg_mse = mean_squared_error(y_test, linear_reg_predictions)
linear_reg_mae = mean_absolute_error(y_test, linear_reg_predictions)
linear_reg_r2 = r2_score(y_test, linear_reg_predictions)
decision_tree_mse = mean_squared_error(y_test, decision_tree_predictions)
decision_tree_mae = mean_absolute_error(y_test, decision_tree_predictions)
decision_tree_r2 = r2_score(y_test, decision_tree_predictions)
random_forest_mse = mean_squared_error(y_test, random_forest_predictions)
random_forest_mae = mean_absolute_error(y_test, random_forest_predictions)
random_forest_r2 = r2_score(y_test, random_forest_predictions)
gradient_boosting_mse = mean_squared_error(y_test, gradient_boosting_predictions)
gradient_boosting_mae = mean_absolute_error(y_test, gradient_boosting_predictions)
gradient_boosting_r2 = r2_score(y_test, gradient_boosting_predictions)
svr_mse = mean_squared_error(y_test, svr_predictions)
svr_mae = mean_absolute_error(y_test, svr_predictions)
svr_r2 = r2_score(y_test, svr_predictions)
# Print evaluation results for each model
print("Linear Regression:")
print(" MSE:", linear_reg_mse)
print(" MAE:", linear_reg_mae)
print(" R-squared:", linear_reg_r2)
print()
print("Decision Tree Regressor:")
print(" MSE:", decision_tree_mse)
print(" MAE:", decision_tree_mae)
print(" R-squared:", decision_tree_r2)
print()
print("Random Forest Regressor:")
print(" MSE:", random_forest_mse)
print(" MAE:", random_forest_mae)
print(" R-squared:", random_forest_r2)
print()
print("Gradient Boosting Regressor:")
print(" MSE:", gradient_boosting_mse)
print(" MAE:", gradient_boosting_mae)
print(" R-squared:", gradient_boosting_r2)
print()
print("SVR:")
print(" MSE:", svr_mse)
print(" MAE:", svr_mae)
print(" R-squared:", svr_r2)
print()
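# Sketch: collect the metrics computed above into a single DataFrame so the
# models can be compared side by side; `results` is a new helper variable.
results = pd.DataFrame(
    {
        "MSE": [linear_reg_mse, decision_tree_mse, random_forest_mse,
                gradient_boosting_mse, svr_mse],
        "MAE": [linear_reg_mae, decision_tree_mae, random_forest_mae,
                gradient_boosting_mae, svr_mae],
        "R-squared": [linear_reg_r2, decision_tree_r2, random_forest_r2,
                      gradient_boosting_r2, svr_r2],
    },
    index=["Linear Regression", "Decision Tree", "Random Forest",
           "Gradient Boosting", "SVR"],
)
print(results.sort_values("R-squared", ascending=False))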
import matplotlib.pyplot as plt
# Scatter plot of predicted vs. true values for all models in one figure
plt.scatter(y_test, linear_reg_predictions, color='blue', label='Linear Regression')
plt.scatter(y_test, decision_tree_predictions, color='red', label='Decision Tree')
plt.scatter(y_test, random_forest_predictions, color='green', label='Random Forest')
plt.scatter(y_test, gradient_boosting_predictions, color='orange', label='Gradient Boosting')
plt.scatter(y_test, svr_predictions, color='purple', label='SVR')
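# Sketch: a dashed y = x reference line makes deviations from a perfect
# prediction visible at a glance; `lims` is a new helper variable.
lims = [y_test.min(), y_test.max()]
plt.plot(lims, lims, color='black', linestyle='--', label='Perfect prediction')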
# Add labels and a legend to the plot
plt.xlabel('True Values')
plt.ylabel('Predicted Values')
plt.legend()
# Show the plot
plt.show()
import matplotlib.pyplot as plt
# Separate scatter plots of predictions vs. true values for each model
# Create a scatter plot for Linear Regression predictions
plt.scatter(y_test, linear_reg_predictions, color='blue')
plt.xlabel('True Values')
plt.ylabel('Linear Regression Predictions')
plt.title('Linear Regression Predictions vs. True Values')
plt.show()
# Create a scatter plot for Decision Tree Regressor predictions
plt.scatter(y_test, decision_tree_predictions, color='red')
plt.xlabel('True Values')
plt.ylabel('Decision Tree Predictions')
plt.title('Decision Tree Predictions vs. True Values')
plt.show()
# Create a scatter plot for Random Forest Regressor predictions
plt.scatter(y_test, random_forest_predictions, color='green')
plt.xlabel('True Values')
plt.ylabel('Random Forest Predictions')
plt.title('Random Forest Predictions vs. True Values')
plt.show()
# Create a scatter plot for Gradient Boosting Regressor predictions
plt.scatter(y_test, gradient_boosting_predictions, color='orange')
plt.xlabel('True Values')
plt.ylabel('Gradient Boosting Predictions')
plt.title('Gradient Boosting Predictions vs. True Values')
plt.show()
# Create a scatter plot for SVR predictions
plt.scatter(y_test, svr_predictions, color='purple')
plt.xlabel('True Values')
plt.ylabel('SVR Predictions')
plt.title('SVR Predictions vs. True Values')
plt.show()
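# Sketch: residuals (true minus predicted) are another useful diagnostic;
# shown here for the Random Forest model as an example.
residuals = y_test - random_forest_predictions
plt.scatter(random_forest_predictions, residuals, color='green')
plt.axhline(0, color='black', linestyle='--')
plt.xlabel('Random Forest Predictions')
plt.ylabel('Residuals (True - Predicted)')
plt.title('Random Forest Residuals')
plt.show()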