# Libraries used by the profiling cells in this notebook.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Run to view results

-- Rebuild the profiling table from the raw GA sessions table.
-- There is no filter, join, or aggregation: each source row yields one output
-- row, with the nested totals / geoNetwork / trafficSource fields lifted to
-- top-level columns and cast to explicit types.
create or replace table `dbt-bigquery-415514.james_profiling.ga_sessions_profiling` as
select
    visitId,
    -- visitStartTime is converted with timestamp_seconds(), i.e. read as epoch seconds.
    timestamp_seconds(visitStartTime) as visitStartTime,
    date,
    cast(totals.bounces as int64) as bounces,
    cast(geoNetwork.country as string) as country,
    cast(geoNetwork.region as string) as region,
    cast(geoNetwork.city as string) as city,
    cast(trafficSource.source as string) as source,
    cast(trafficSource.medium as string) as medium,
    -- NOTE(review): timeOnSite is stored as STRING here; if it is a numeric
    -- duration, INT64 would let numeric profiling pick it up -- confirm intent.
    cast(totals.timeOnSite as string) as timeOnSite,
    fullVisitorId
from `dbt-bigquery-415514.raw.ga_sessions`

Run to view results

-- Read every row of the profiling table.
-- Columns are listed explicitly (same names and order as the table is created
-- with in this file) so a later schema change cannot silently alter the result.
-- The table path is backtick-quoted because the project id contains dashes,
-- matching how the same path is written in the create-table statement.
-- NOTE(review): presumably this result feeds the `data` DataFrame used by the
-- Python cells -- confirm against the notebook wiring.
select
    visitId,
    visitStartTime,
    date,
    bounces,
    country,
    region,
    city,
    source,
    medium,
    timeOnSite,
    fullVisitorId
from `dbt-bigquery-415514.james_profiling.ga_sessions_profiling`

Run to view results

Data Quality Assessment

Data quality assessment examines the overall quality of the data by checking for missing values, duplicate records, and inconsistencies. It identifies data quality issues that need to be addressed before further analysis.

# Missing values: number of null entries in each column of `data`.
print("Missing Values:")
print(data.isnull().sum())

# Duplicate records: rows flagged by DataFrame.duplicated(), i.e. rows that
# repeat an earlier row across all columns.
duplicate_rows = data.loc[data.duplicated()]
print("Duplicate Records:")
print(duplicate_rows)

Run to view results

Data Type Analysis

Understanding the data types of different columns in the dataset is crucial for proper data processing. Data profiling involves identifying the data types (e.g., numerical, categorical, date/time) of each column and ensuring they are correctly interpreted.

# Data type distribution: how many columns of `data` share each pandas dtype.
print("Data Type Distribution:")
print(
    data.dtypes.value_counts()
)

Run to view results

Summary Statistics

Calculating summary statistics such as mean, median, mode, standard deviation, minimum, and maximum values provides a high-level overview of the dataset's distribution and central tendencies. It helps identify outliers and anomalies in the data.

# Summary statistics: DataFrame.describe() with its default arguments.
print("Summary Statistics:")
print(
    data.describe()
)

Run to view results

Data Distribution Analysis

Analyzing the distribution of numerical and categorical variables helps reveal their underlying patterns and relationships. Histograms, box plots, and bar charts are commonly used to visualize these distributions.

# Names of all numeric columns in `data`.
# (Computed here but not used further within this cell.)
numerical_cols = data.select_dtypes(include=[np.number]).columns

# Histogram of one column, chosen by name.
col = 'bounces'
plt.figure(figsize=(8, 6))
data[col].hist(bins=20)
plt.title(f'Distribution of {col}')
plt.xlabel(col)
plt.ylabel('Frequency')
plt.show()

# Categorical (object-dtype) columns: only their names are printed here;
# no chart is drawn for them in this cell.
categorical_cols = data.select_dtypes(include=['object']).columns
print(categorical_cols)

Run to view results

Cardinality Assessment

Cardinality refers to the number of unique values in a column. Analyzing the cardinality of categorical variables helps understand their diversity and potential impact on analysis tasks such as grouping and aggregation.

# Cardinality: number of distinct values in each object-dtype column of `data`.
print("Cardinality of Categorical Variables:")
print(
    data.select_dtypes(include=['object']).nunique()
)

Run to view results

Data Relationship Analysis

Exploring relationships between different variables in the dataset helps uncover correlations, dependencies, and patterns. Techniques such as correlation analysis, scatter plots, and heatmap visualizations are used to analyze relationships between numerical variables.

# Correlation matrix
# Restrict to numeric columns before correlating. The profiling table built in
# this file carries string columns (country, region, city, source, medium, ...),
# and since pandas 2.0 DataFrame.corr() raises on non-numeric columns instead
# of silently dropping them. Selecting numeric dtypes first works on both old
# and new pandas versions.
# NOTE(review): assumes `data` holds that profiling table -- confirm.
correlation_matrix = data.select_dtypes(include="number").corr()

# Annotated heatmap of the pairwise correlations.
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=.5,
)
plt.title("Correlation Matrix")
plt.show()

Run to view results

# Scatter plot of one column against another.
# NOTE(review): 'column1' and 'column2' look like template placeholders; the
# profiling table created in this file does not define columns with these
# names, so they likely need replacing with real numeric columns -- confirm.
plt.figure(figsize=(8, 6))
plt.scatter(
    data['column1'],
    data['column2'],
)
plt.title("Scatter Plot of column1 vs column2")
plt.xlabel("column1")
plt.ylabel("column2")
plt.show()

Run to view results

Data Skewness and Kurtosis

Skewness and kurtosis are measures of the shape of the distribution of numerical variables. Analyzing skewness and kurtosis helps understand the symmetry and tail heaviness of the distributions, which is important for modeling assumptions.

# Skewness and kurtosis, computed per column over the columns whose dtype is
# exactly float64 or int64 (other dtypes are not selected by this filter).
skewness = data.select_dtypes(include=['float64', 'int64']).skew()
kurtosis = data.select_dtypes(include=['float64', 'int64']).kurtosis()

print("Skewness:")
print(skewness)

print("Kurtosis:")
print(kurtosis)

Run to view results

Data Quality Assessment

Data Type Analysis

Summary Statistics

Data Distribution Analysis

Cardinality Assessment

Data Relationship Analysis

Data Skewness and Kurtosis

Data Quality Assessment