import pandas as pd
import matplotlib.pyplot as plt
# --- Exploratory analysis of the test-score data ---------------------------
# Bare expressions (e.g. `df`) are kept from the notebook-style original:
# they display a value in an interactive cell and do nothing in a plain script.


def _posttest_boxplot(factor):
    """Draw a boxplot of ``posttest`` grouped by the column ``factor``."""
    return df[[factor, "posttest"]].boxplot(by=factor)


def _category_shares(group_key, category):
    """Return the normalized value counts of ``category`` within each ``group_key`` group."""
    return df.groupby([group_key])[category].value_counts(normalize=True)


df = pd.read_csv("test_scores.csv")
df

# Number of rows per school, as a horizontal bar chart.
df["school"].value_counts().plot.barh()

# posttest distribution by categorical column.  Statement order is kept as
# in the original so figures are created in the same sequence.
my_plot = _posttest_boxplot("school_setting")
my_plot = _posttest_boxplot("school_type")
my_plot = _posttest_boxplot("teaching_method")
df[["n_student", "posttest"]].plot(kind="scatter", x="n_student", y="posttest")
my_plot = _posttest_boxplot("lunch")

_category_shares("school_setting", "lunch")
df[["pretest", "posttest"]].plot(kind="scatter", x="pretest", y="posttest")
_category_shares("school_setting", "gender")

# Number of rows per school setting, as a vertical bar chart.
df["school_setting"].value_counts().plot(kind="bar")

_category_shares("teaching_method", "lunch")
_category_shares("school_setting", "teaching_method")
_category_shares("school_type", "teaching_method")
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
# --- Feature / target selection and preprocessing --------------------------
# Columns that get one-hot encoded versus standardized.
one_hot_columns = ["school_setting", "school_type", "teaching_method", "gender", "lunch"]
scaling_columns = ["n_student"]

# Feature frame and single-column target frame (kept as a DataFrame, not a Series).
X = df[
    [
        "school_setting",
        "school_type",
        "teaching_method",
        "n_student",
        "gender",
        "lunch",
    ]
]
y = df[["posttest"]]

# One entry per (name, transformer, columns); the one-hot step is listed first,
# followed by the scaling step.
_column_transformers = [
    ("one_hot_encoder", OneHotEncoder(), one_hot_columns),
    ("standard_scaler", StandardScaler(), scaling_columns),
]
transformation_pipeline = ColumnTransformer(transformers=_column_transformers)
X_transformed_raw = transformation_pipeline.fit_transform(X)

# Hand-written labels for the transformed columns.
# NOTE(review): assumes the encoder emits each feature's categories in exactly
# this order and that there are 12 output columns in total - verify against
# transformation_pipeline.get_feature_names_out().
_transformed_column_names = [
    "Is_Rural",
    "Is_Suburban",
    "Is_Urban",
    "Is_Non_Public",
    "Is_Public",
    "Is_Non_Standard_Teaching",
    "Is_Standard_Teaching",
    "Is_Female",
    "Is_Male",
    "Is_Not_Qualify_Lunch",
    "Is_Qualify_Lunch",
    "Num_Students",
]
X_transformed = pd.DataFrame(data=X_transformed_raw, columns=_transformed_column_names)

# Notebook-style displays; no effect when run as a plain script.
X_transformed
df