SQL
Saved to variable
df_1SELECT DISTINCT location
FROM 'CovidDeaths.csv'
WHERE continent IS NULL
SQL
Saved to variable
df_2SELECT *
FROM 'CovidDeaths.csv'
WHERE continent IS NOT NULL
ORDER BY 3,4
LIMIT 10;
SQL
Saved to variable
df_3SELECT
COALESCE(continent, 'UNKNOWN') AS continent,
COUNT(DISTINCT location) AS number_of_unique_countries
FROM 'CovidDeaths.csv'
GROUP BY continent
ORDER BY continent
SQL
Saved to variable
df_3SELECT COUNT(DISTINCT date)
FROM 'CovidDeaths.csv';
SQL
Saved to variable
df_3SELECT
Location,
date,
total_cases,
new_cases,
total_deaths,
population
FROM 'CovidDeaths.csv'
WHERE continent IS NOT NULL
ORDER BY 1,2;
SQL
Saved to variable
df_3SELECT
Location,
date,
total_cases,
total_deaths,
(total_deaths/total_cases)*100 as DeathPercentage
FROM 'CovidDeaths.csv'
WHERE location like '%malay%'
and continent IS NOT NULL
ORDER BY 1,2;
SQL
Saved to variable
df_3SELECT
Location,
date,
Population,
total_cases,
(total_cases/population)*100 as PercentPopulationInfected
FROM 'CovidDeaths.csv'
--WHERE location like '%malay%'
ORDER BY 1,2;
SQL
Saved to variable
df_3SELECT
Location,
Population,
MAX(total_cases) as HighestInfectionCount,
Max((total_cases/population))*100 as PercentPopulationInfected
FROM 'CovidDeaths.csv'
--WHERE location like '%malay%'
GROUP BY Location, Population
ORDER BY Location ASC;
SQL
Saved to variable
df_3SELECT Location, MAX(cast(Total_deaths as int)) as TotalDeathCount
FROM 'CovidDeaths.csv'
--WHERE location like '%malay%'
WHERE continent IS NOT NULL
GROUP BY Location
ORDER BY TotalDeathCount desc;
SQL
Saved to variable
df_3SELECT continent, MAX(cast(Total_deaths as int)) as TotalDeathCount
FROM 'CovidDeaths.csv'
--WHERE location like '%malay%'
WHERE continent IS NOT NULL
GROUP BY continent
ORDER BY TotalDeathCount desc;
SQL
Saved to variable
df_3SELECT
SUM(new_cases) as total_cases,
SUM(cast(new_deaths as int)) as total_deaths,
SUM(cast(new_deaths as int)) / SUM(New_Cases)*100 as DeathPercentage
FROM 'CovidDeaths.csv'
--WHERE location like '%malay%'
WHERE continent IS NOT NULL
--GROUP BY date
ORDER BY 1,2;
SQL
Saved to variable
df_3SELECT
dea.continent,
dea.location,
dea.date,
dea.population,
vac.new_vaccinations,
vac.total_vaccinations
-- SUM(CAST(vac.new_vaccinations AS INT))
-- OVER (PARTITION BY
-- dea.location
-- ORDER BY dea.location, dea.Date)
-- AS total_vaccinations_calc
FROM 'CovidDeaths.csv' dea
JOIN 'CovidVaccinations.csv' vac
ON dea.location = vac.location
AND dea.date = vac.date
WHERE
dea.continent IS NOT NULL
AND vac.new_vaccinations IS NOT NULL
ORDER BY 2,3;
RuntimeError: Invalid Input Error: Conversion Error: Could not convert string '2156688722' to INT32 in column "total_vaccinations", between line 94210 and 95233. Parser options: DELIMITER=',' (auto detected), QUOTE='"' (auto detected), ESCAPE='' (auto detected), HEADER=1 (auto detected), SAMPLE_SIZE=10240, ALL_VARCHAR=0. Consider either increasing the sample size (SAMPLE_SIZE=X [X rows] or SAMPLE_SIZE=-1 [all rows]), or skipping column conversion (ALL_VARCHAR=1)
SQL
Saved to variable
df_3WITH PopvsVac AS (
SELECT
dea.continent,
dea.location,
dea.date,
dea.population,
vac.new_vaccinations,
CAST(vac.total_vaccinations AS BIGINT) AS total_vaccinations
FROM 'CovidDeaths.csv' dea
JOIN 'CovidVaccinations.csv' vac
ON dea.location = vac.location
AND dea.date = vac.date
WHERE
dea.continent IS NOT NULL
)
SELECT *, (total_vaccinations/Population)*100 AS PctTotalVaccinations
FROM PopvsVac
RuntimeError: Invalid Input Error: Conversion Error: Could not convert string '2156688722' to INT32 in column "total_vaccinations", between line 94210 and 95233. Parser options: DELIMITER=',' (auto detected), QUOTE='"' (auto detected), ESCAPE='' (auto detected), HEADER=1 (auto detected), SAMPLE_SIZE=10240, ALL_VARCHAR=0. Consider either increasing the sample size (SAMPLE_SIZE=X [X rows] or SAMPLE_SIZE=-1 [all rows]), or skipping column conversion (ALL_VARCHAR=1)
SQL
Saved to variable
df_3DROP TABLE IF EXISTS #CombinedData
CREATE TABLE #CombinedData (
Continent VARCHAR(255),
Location VARCHAR(255),
Date DATETIME,
Month_of_year DATETIME,
Population NUMERIC,
New_cases NUMERIC,
total_cases NUMERIC,
New_vaccinations NUMERIC,
-- A new calculated cumulative vaccinations
total_vaccinations_calc NUMERIC
)
INSERT INTO #CombinedData
SELECT
dea.continent,
dea.location,
dea.date,
-- new column to truncate the month from the date
-- RIGHT(CONVERT(varchar, dea.date, 3), 5) AS month_of_year,
DATEFROMPARTS(YEAR(dea.date), MONTH(dea.date), 1) AS month_of_year,
dea.population,
dea.new_cases,
dea.total_cases,
vac.new_vaccinations,
-- Temp table seems like can only accept a maximum of 9 columns
-- vac.total_vaccinations,
-- new calculated cumulative total vaccinations
SUM(CONVERT(INT, vac.new_vaccinations))
OVER (PARTITION BY
dea.Location
ORDER BY dea.location, dea.Date)
AS total_vaccinations_calc
FROM 'CovidDeaths.csv' dea
JOIN 'CovidVaccinations.csv' vac
ON dea.location = vac.location
AND dea.date = vac.date
WHERE
dea.continent IS NOT NULL
ValueError: Not supported query type!
SQL
Saved to variable
df_3SELECT *, (total_vaccinations_calc/Population)*100 AS PctTotalVaccinations
FROM #CombinedData
ORDER BY Location, Date;
SQL
Saved to variable
df_3SELECT *, (total_vaccinations_calc/Population)*100 AS PctTotalVaccinations
FROM #CombinedData
WHERE new_vaccinations IS NOT NULL
ORDER BY Location, Date;
SQL
Saved to variable
df_3SELECT
combi.Location,
combi.Month_of_year,
SUM(combi.New_cases) AS monthly_cases,
SUM(CAST(dea.new_deaths AS INT)) AS monthly_deaths,
SUM(combi.New_vaccinations) AS monthly_vaccinations
FROM #CombinedData combi
-- WHERE location LIKE 'malay%'
JOIN 'CovidDeaths.csv' dea
ON combi.Location = dea.location
AND combi.Date = dea.date
GROUP BY combi.Location, combi.Month_of_year
ORDER BY combi.Location, combi.Month_of_year
ExecuteSqlError: There is no Dataframe named #CombinedData in scope!
SQL
Saved to variable
df_3SELECT
continent,
location,
date,
population,
new_cases,
SUM(new_cases)
OVER (PARTITION BY location
ORDER BY date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW)
AS cases_7d,
ROUND
(
AVG(new_cases)
OVER (PARTITION BY location
ORDER BY date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW),
4
) AS rolling_avg_7d,
total_cases,
-- a calculated field for total cases to compare with original data
SUM(new_cases)
OVER (PARTITION BY location ORDER BY location, date)
AS total_cases_calc
FROM 'CovidDeaths.csv'
WHERE
continent IS NOT NULL;