import pandas as pd
import plotly.express as px
from dash import dcc, html, Input, Output
from jupyter_dash import JupyterDash
from sdv.metadata import Metadata
from sdv.single_table import (
CTGANSynthesizer, TVAESynthesizer,
GaussianCopulaSynthesizer, CopulaGANSynthesizer
)
from sdv.evaluation.single_table import (
run_diagnostic, evaluate_quality, get_column_plot
)
from sdmetrics.single_table import (
DCRBaselineProtection, DCROverfittingProtection
)
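# [notebook UI residue] Run to view results
# [notebook cell] original  -- bare display of the variable; it is only defined further down, once the CSV is loaded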
# Fixing the issue where 'original' is not a pandas DataFrame.
# The error indicates that 'original' is being passed as a string (file path) instead of a DataFrame.
import pandas as pd
from sdv.metadata import Metadata
from sdv.single_table import (
CTGANSynthesizer, TVAESynthesizer,
GaussianCopulaSynthesizer, CopulaGANSynthesizer
)
from sdv.evaluation.single_table import (
run_diagnostic, evaluate_quality
)
from sdmetrics.single_table import (
DCRBaselineProtection, DCROverfittingProtection
)
# Load the dataset into a pandas DataFrame
original = pd.read_csv('file_input_uploads/adult_dataset.csv')

# Ask for the number of synthetic rows until a positive integer is entered.
# A bare int(input(...)) aborts the whole run with ValueError on a typo, and
# zero or a negative number is not a usable sample size.
while True:
    _raw_num_rows = input("🔢 How many synthetic rows to generate? ").strip()
    try:
        num_rows = int(_raw_num_rows)
    except ValueError:
        print(f"Please enter a whole number (got {_raw_num_rows!r}).")
        continue
    if num_rows > 0:
        break
    print("The number of rows must be greater than zero.")

# An empty answer means "no target variable" and is stored as None.
target_variable = input("🎯 Target variable (optional): ").strip() or None
# ===============================
# 5. Detect Metadata
# ===============================
# Let SDV derive the table metadata from the loaded DataFrame. The table name
# given here is the key under which this table is later looked up in
# metadata.to_dict()['tables'].
metadata = Metadata.detect_from_dataframe(
    table_name="real_data_table",
    data=original,
)
# ===============================
# 6. Train 4 Synthesizers
# ===============================
# Split the real data BEFORE training. DCROverfittingProtection compares how
# close synthetic rows are to the rows the model was trained on versus real
# rows it never saw, so the holdout set must not be used for fitting. Passing
# the same DataFrame for both roles makes that score meaningless.
# NOTE(review): 0.4 keeps the holdout at least half the size of the training
# set, which sdmetrics appears to recommend -- confirm for your version.
HOLDOUT_FRACTION = 0.4
holdout_data = original.sample(frac=HOLDOUT_FRACTION, random_state=42)
train_data = original.drop(holdout_data.index)
if holdout_data.empty or train_data.empty:
    raise ValueError(
        "Dataset is too small to split into training and holdout sets."
    )

synthesizers = {
    "CTGAN": CTGANSynthesizer(metadata, epochs=10, verbose=True),
    "TVAE": TVAESynthesizer(metadata, epochs=10, verbose=True),
    "GaussianCopula": GaussianCopulaSynthesizer(metadata),
    "CopulaGAN": CopulaGANSynthesizer(metadata, epochs=10, verbose=True),
}

synthetic_data_dict = {}  # synthesizer name -> sampled synthetic DataFrame
loss_dict = {}            # synthesizer name -> loss values, when available
for name, synthesizer in synthesizers.items():
    print(f"🔧 Training {name}...")
    synthesizer.fit(train_data)
    synthetic_data_dict[name] = synthesizer.sample(num_rows)
    # Only record losses for synthesizers that expose get_loss_values().
    if hasattr(synthesizer, "get_loss_values"):
        loss_dict[name] = synthesizer.get_loss_values()

# ===============================
# 7. Evaluation (Modern SDV Style)
# ===============================
models = list(synthetic_data_dict)
quality_scores = []
diagnostic_scores = []
column_quality_scores = {m: {} for m in models}
privacy_scores = {'DCRBaseline': [], 'Overfitting': []}

# The sdmetrics calls below take the single-table metadata dict. The key must
# match the table_name used when the metadata was detected. It does not
# change between models, so it is built once instead of on every iteration.
table_metadata = metadata.to_dict()['tables']['real_data_table']

for model in models:
    synth_df = synthetic_data_dict[model]

    # All reports compare against the rows the synthesizer was fitted on.
    diagnostic = run_diagnostic(train_data, synth_df, metadata)
    quality = evaluate_quality(train_data, synth_df, metadata)
    diagnostic_scores.append(diagnostic.get_score())
    quality_scores.append(quality.get_score())

    # Per-column scores of the "Column Shapes" property: {column: score}.
    details = quality.get_details(property_name="Column Shapes")
    column_quality_scores[model] = details.set_index("Column")["Score"].to_dict()

    privacy_scores['DCRBaseline'].append(
        DCRBaselineProtection.compute(
            real_data=train_data,
            synthetic_data=synth_df,
            metadata=table_metadata,
        )
    )
    privacy_scores['Overfitting'].append(
        DCROverfittingProtection.compute(
            real_training_data=train_data,
            synthetic_data=synth_df,
            real_validation_data=holdout_data,
            metadata=table_metadata,
        )
    )
# ===============================
# 9. Excel Export
# ===============================
# Optionally write every synthetic dataset to one workbook, one sheet per
# synthesizer.
download_choice = input(
    "\n📥 Would you like to download all synthetic datasets as a single "
    "Excel file with multiple tabs? (yes/no): "
).strip().lower()
if download_choice in {"yes", "y"}:
    excel_filename = "synthetic_datasets.xlsx"
    with pd.ExcelWriter(excel_filename, engine='xlsxwriter') as writer:
        for name, df in synthetic_data_dict.items():
            # Excel limits worksheet names to 31 characters.
            df.to_excel(writer, sheet_name=name[:31], index=False)
    print(f"\n✅ Excel file created: {excel_filename}")
    # Uncomment the following line if running in an environment that supports file downloads
    # files.download(excel_filename)
else:
    print("Download skipped.")
# [notebook UI residue] Run to view results